In [1]:
import pandas as pd
import re
import os.path
import jieba
import jieba.analyse
from nltk import tokenize
from nltk.corpus import wordnet as wn


In [2]:
csv_dir = os.path.join("in_csv")
csv_path = os.path.join(csv_dir, 'tester.csv')

df = pd.read_csv(csv_path, encoding='utf-8')

pd.set_option('display.max_colwidth', None) #remove python output 30 char max

df.head(15)

Unnamed: 0,comment
0,have a nice day dataset
1,i am Harvard graduate
2,Pasukan Johor Darul Takzim (JDT) gagal memanfaatkan peluang beraksi di Stadium Iskandar Puteri sebentar tadi selepas seri 1-1 dengan Selangor FC.\nhttps://malaysiaharmoni.net/jdt-kelu-di-laman-sendiri-diikat-selangor-fc/
3,Siapla kau..xlama lagi xda jawatan xjadi ahli politik
4,Dia ni punca kpd keruntuhan parti dan kerajaan. Org mcm ni xlayak jadi pemimpin apalagi menteri.
5,diam la bohong je banyak ni kerja x jalan!!!tak payah bertanding la sapa nak undi kau!!!
6,I though what would happen is that white vision would merge with regular vision at the last second
7,If I could describe the finale with just one word
8,我来到北京清华大学
9,乒乓球拍卖完了.Sold out!!!


In [3]:
def cleanTXT(text):
    #all regex are tested using regex tester at https://regex101.com/
    text = re.sub(r'@[\w.]+', '',text) #remove @mentions + '_' + '.' : @yew_lee
    text = re.sub(r'#', '', text) #remove '#'
    text = re.sub(r'\\n', '', text) #remove '\n'
    text = re.sub(r'http\S+', '', text) #links
    text = re.sub(r'\W', ' ', text) #replace every symbol+emoji with empty space
    text = re.sub(r'_', ' ', text) #replace '_' since '\W' fail to catch underscore

    return re.compile(r'\s{2,}').sub(' ', text) #remove multiple whitespace to single whitespace

df['comment'] = df['comment'].apply(cleanTXT)

df.head(10)

Unnamed: 0,comment
0,have a nice day dataset
1,i am Harvard graduate
2,Pasukan Johor Darul Takzim JDT gagal memanfaatkan peluang beraksi di Stadium Iskandar Puteri sebentar tadi selepas seri 1 1 dengan Selangor FC
3,Siapla kau xlama lagi xda jawatan xjadi ahli politik
4,Dia ni punca kpd keruntuhan parti dan kerajaan Org mcm ni xlayak jadi pemimpin apalagi menteri
5,diam la bohong je banyak ni kerja x jalan tak payah bertanding la sapa nak undi kau
6,I though what would happen is that white vision would merge with regular vision at the last second
7,If I could describe the finale with just one word
8,我来到北京清华大学
9,乒乓球拍卖完了 Sold out


In [4]:
def check_lang_eng(token_list): #retrieve english token by comparing to synset
    eng_list=[]
    for token in token_list:
        if wn.synsets(token, lang='eng'):
            eng_list.append(token)
    return eng_list

def check_lang_eng_perc(token_list): # retrieve english token percentage in the sentence
    counts_1 = 0
    for token in token_list:
        if wn.synsets(token, lang='eng'):
            counts_1+=1
    if len(token_list)>0:
        percentage = counts_1/(len(token_list))
        return percentage
    else:
        return 0

def check_lang_zsm(token_list): #retrieve malay token by comparing to synset zsm
    zsm_list=[]
    for token in token_list:
        if wn.synsets(token, lang='zsm'):  #ind
            #no need to load .tab since Open Multilingual Wordnet has integrated it into nltk wn
            zsm_list.append(token)
    return zsm_list

def check_lang_zsm_perc(token_list): #retrieve malay token percentage in sentence
    counts_1 = 0
    for token in token_list:
        if wn.synsets(token, lang='zsm'): #ind
            counts_1+=1
    if len(token_list)>0:
        percentage = counts_1/(len(token_list))
        return percentage
    else:
        return 0
    
def check_lang_zho(token_list): #retrieve malay token
    token_list = re.sub(r'\W', '', token_list)
    token_list = list(jieba.cut(token_list, cut_all=False)) #Tokenize Chinese Word using jieba
    cn_list = []
    #check if is chinese
    for word in token_list:
        if re.search("[\u4e00-\u9FFF]", word): #search all chinese words using utf code
            cn_list.append(word)
    return cn_list

def check_lang_zho_perc(token_list):
    count = 0
    token_list = re.sub(r'\W', '', token_list)
    token_list = list(jieba.cut(token_list, cut_all=False)) #Tokenize Chinese Word
    #check if is chinese
    for word in token_list:
        if re.search("[\u4e00-\u9FFF]", word):
            count +=1
    if len(token_list)>0:
        percentage = count/(len(token_list))
        return percentage
    else:
        return 0

def whitespaceTokenizer(data): #whitespace tokenizer function
    token_list=[]
    token_list = tokenize.WhitespaceTokenizer().tokenize(data)
    return token_list
    

In [5]:
#tokenize
df['comments_tokens'] = df['comment'].apply(whitespaceTokenizer) 

#calculating
df['eng_tokens'] = df['comments_tokens'].apply(check_lang_eng)
df['eng_percentage'] = df['comments_tokens'].apply(check_lang_eng_perc)
df['zsm_tokens'] = df['comments_tokens'].apply(check_lang_zsm)
df['zsm_percentage'] = df['comments_tokens'].apply(check_lang_zsm_perc)
df['zho_tokens'] = df['comment'].apply(check_lang_zho)
df['zho_perc'] = df['comment'].apply(check_lang_zho_perc)


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.763 seconds.
Prefix dict has been built successfully.


In [6]:
df.head(20)

Unnamed: 0,comment,comments_tokens,eng_tokens,eng_percentage,zsm_tokens,zsm_percentage,zho_tokens,zho_perc
0,have a nice day dataset,"[have, a, nice, day, dataset]","[have, a, nice, day]",0.8,"[a, nice, day]",0.6,[],0.0
1,i am Harvard graduate,"[i, am, Harvard, graduate]","[i, am, Harvard, graduate]",1.0,"[am, Harvard]",0.5,[],0.0
2,Pasukan Johor Darul Takzim JDT gagal memanfaatkan peluang beraksi di Stadium Iskandar Puteri sebentar tadi selepas seri 1 1 dengan Selangor FC,"[Pasukan, Johor, Darul, Takzim, JDT, gagal, memanfaatkan, peluang, beraksi, di, Stadium, Iskandar, Puteri, sebentar, tadi, selepas, seri, 1, 1, dengan, Selangor, FC]","[Stadium, 1, 1]",0.136364,"[Pasukan, Takzim, gagal, memanfaatkan, peluang, beraksi, di, Stadium, Puteri, sebentar, tadi, selepas, seri, dengan]",0.636364,[],0.0
3,Siapla kau xlama lagi xda jawatan xjadi ahli politik,"[Siapla, kau, xlama, lagi, xda, jawatan, xjadi, ahli, politik]",[],0.0,"[lagi, jawatan, ahli, politik]",0.444444,[],0.0
4,Dia ni punca kpd keruntuhan parti dan kerajaan Org mcm ni xlayak jadi pemimpin apalagi menteri,"[Dia, ni, punca, kpd, keruntuhan, parti, dan, kerajaan, Org, mcm, ni, xlayak, jadi, pemimpin, apalagi, menteri]","[Dia, ni, ni]",0.1875,"[ni, punca, keruntuhan, parti, kerajaan, ni, jadi, pemimpin, menteri]",0.5625,[],0.0
5,diam la bohong je banyak ni kerja x jalan tak payah bertanding la sapa nak undi kau,"[diam, la, bohong, je, banyak, ni, kerja, x, jalan, tak, payah, bertanding, la, sapa, nak, undi, kau]","[diam, la, ni, x, la]",0.294118,"[diam, la, bohong, banyak, ni, kerja, jalan, tak, payah, bertanding, la, nak, undi]",0.764706,[],0.0
6,I though what would happen is that white vision would merge with regular vision at the last second,"[I, though, what, would, happen, is, that, white, vision, would, merge, with, regular, vision, at, the, last, second]","[I, though, happen, is, white, vision, merge, regular, vision, at, last, second]",0.666667,[white],0.055556,[],0.0
7,If I could describe the finale with just one word,"[If, I, could, describe, the, finale, with, just, one, word]","[I, describe, finale, just, one, word]",0.6,[finale],0.1,[],0.0
8,我来到北京清华大学,[我来到北京清华大学],[],0.0,[],0.0,"[我, 来到, 北京, 清华大学]",1.0
9,乒乓球拍卖完了 Sold out,"[乒乓球拍卖完了, Sold, out]","[Sold, out]",0.666667,[],0.0,"[乒乓球, 拍卖, 完, 了]",0.8


### TO DO NEXT:

> 1. Decide on logic for classifying language (ratio-based or singular-rule based)
- ratio : get the highest percentage of each sentence, highest will be that language
- singular: if certain percentange more than certain threshold, go that language 

> 2. Moving forward: remove noun and proper noun for more accuracy (stopwords, cant detect noun)

> 3. How to define "Rojak"