# Preprocessing
Steps performed:
1. Common Preprocessing
    1. Remove ASCII and Unicode.
    2. Tokenization
    3. Case folding, convert into lowercase
    4. Repeated dot (sedih... -> sedih.)
    5. Repeated character ('hehe :)))' -> 'hehe :)')
    6. Remove ellipsis (lanjut baca... -> lanjut baca)
    7. Repeated word that has meaning ('malam malam' -> 'malam-malam')
    8. Remove newline
2. Specific Preprocessing
    1. Special symbols on Twitter, removing hashtag, mention, RT, and FAV
    2. Remove all emoticons
    3. Remove URL
    4. Spell checker using noisy channel approach

In [1]:
import pandas as pd

In [2]:
# Load the exported tweets from export.csv; the bare last expression displays
# the frame's (rows, columns) shape.
df_tweets = pd.read_csv('export.csv')
df_tweets.shape

(236, 6)

In [3]:
df_tweets.head()

Unnamed: 0,user_id,username,created_at,latitude,longitude,text
0,267076700.0,Rosiwulan_,2018-03-28 00:24:35,-7.648,111.31925,At Sam Poo Kong Temple (Zheng He Temple) [pic]...
1,86853860.0,dodydokong,2018-03-28 00:21:39,-6.9963,110.398,Sam Poo Kong\n\nm Poo Kong (Chinese: 三保洞; piny...
2,2424832000.0,ATCSKotaSMG,2018-03-27 03:00:26,,,10.00 WIB Lalin Simp Sam Poo Kong terpantau ra...
3,209980100.0,deadheblonk,2018-03-26 18:02:33,-7.01141,110.386012,Just posted a photo @ Wisata Sam Poo Kong Sema...
4,2424832000.0,ATCSKotaSMG,2018-03-26 03:05:36,,,10.05 WIB SAM POO KONG\nLalin terpantau relati...


In [4]:
df_tweets['created_at'][0].split(' ')

['2018-03-28', '00:24:35']

In [5]:
from datetime import datetime, timedelta

# Parse the first tweet's 'created_at' string into a datetime object.
datetime_object = datetime.strptime(df_tweets['created_at'][0], '%Y-%m-%d %H:%M:%S')
# Timestamp 100 days before now (microseconds=0 contributes nothing to the offset).
d = datetime.today() - timedelta(days=100, microseconds=0)
# Two back-to-back readings of the current time, compared at the end of the cell.
n = datetime.today()
nn = datetime.today()
# str(d) is "<date> <time>", so splitting on the space yields [date, time].
print(str(d).split(' '))
print(datetime_object)
print(n)
print(nn)
now = datetime.today()
# Timestamp 10 days before `now`.
date_before = now - timedelta(days=10)
print(date_before)
# Prints 'a' only when the second reading is strictly later than the first.
if n<nn:
    print('a')

['2018-03-17', '10:48:33.156579']
2018-03-28 00:24:35
2018-06-25 10:48:33.156579
2018-06-25 10:48:33.156579
2018-06-15 10:48:33.157579


In [1]:
from modulenorm.Normalize import Normalize
from modulenorm.Tokenize import Tokenize
from modulenorm.SymSpell import SymSpell
from modulenorm.SymSpell2 import SymSpell2
from modulenorm.LanguageNgramModel import LanguageNgramModel
from modulenorm.MissingLetterModel import MissingLetterModel
import time

In [7]:
# Try the project tokenizer on a sample sentence and show the resulting tokens.
# `tok` is reused by a later cell.
tok = Tokenize()
toktext = tok.WordTokenize('malam-malam gini enaknya ngapain ya.')
print(toktext)

['malam-malam', 'gini', 'enaknya', 'ngapain', 'ya.']


In [36]:
import re
import string

def _clean_text(text):
    """Remove unwanted characters, digits and redundant whitespace from ``text``.

    The substitutions run in this fixed order:

    1. drop single-letter words (other than ``a``/``A``) that sit between spaces;
    2. collapse runs of spaces/tabs and turn newlines into spaces;
    3. delete the symbols ``{ } _ * > ( ) % + = [ ]`` and the backslash;
    4. remove escape-code residue such as ``a0`` and ``'91``-``'94``, keeping the
       apostrophe for contractions (``'92t`` -> ``'t`` and so on) -- these look
       like leftovers of an encoded export; the exact origin is unconfirmed;
    5. insert a space after ``.``, ``!`` and ``?``;
    6. collapse whitespace again, then delete digit runs.

    Because digits are removed last, removing a number that stood between two
    spaces leaves both spaces in place.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. If a callable named ``unidecode`` is in scope the
        result is additionally passed through it; otherwise the cleaned text is
        returned as is.
    """
    text = re.sub(r' [b-zB-Z] ', ' ', text)   # single-char words, except a/A
    text = re.sub(r'[ \t]+', ' ', text)       # runs of spaces/tabs
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'[{}_*>()\\%+=\[\]]', '', text)

    # Escape-code residue. The contraction forms must be handled before the
    # bare "'92" so that the apostrophe survives.
    text = re.sub(r'a0', '', text)
    text = re.sub(r"'92t", "'t", text)
    text = re.sub(r"'92s", "'s", text)
    text = re.sub(r"'92m", "'m", text)
    text = re.sub(r"'92ll", "'ll", text)
    text = re.sub(r"'91", '', text)
    text = re.sub(r"'92", '', text)
    text = re.sub(r"'93", '', text)
    text = re.sub(r"'94", '', text)

    # Make sure sentence punctuation is followed by a space.
    text = re.sub(r'\.', '. ', text)
    text = re.sub(r'!', '! ', text)
    text = re.sub(r'\?', '? ', text)

    text = re.sub(r' +', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[0-9]+', '', text)

    # NOTE(review): `unidecode` is not imported in any visible cell, so this
    # lookup normally fails and the text is returned without transliteration.
    # Only that specific failure is tolerated; other errors now surface.
    try:
        return unidecode(str(text))
    except NameError:
        return text
    
def remove_hashmention(text):
    """Drop every space-separated token that starts with '#' or '@'.

    The text is split on single spaces only and the surviving tokens are joined
    back with single spaces, so empty tokens produced by consecutive spaces are
    kept as they are.
    """
    kept = [piece for piece in text.split(' ') if not piece.startswith(('#', '@'))]
    return ' '.join(kept)

def remove_punctuation(text):
    """Replace each ASCII punctuation character except '-' with one space."""
    # The hyphen is left out of the table so it passes through unchanged.
    table = {ord(mark): ' ' for mark in string.punctuation if mark != '-'}
    return text.translate(table)

In [54]:
# Sample tweet used to try out the normalisation pipeline.
row = "Rencang Muda,tempat nongkrong yang ramai di boyolali  menurut kalian di mana??alun-alun,simpang lima atau di mana?? :))) lanjut baca.."

norm = Normalize()

# Cleaning steps, applied strictly in this order. Normalize's
# repeat_char_modify, remove_elipsis and remove_hashtags_mentions are
# deliberately not part of this run; hashtags/mentions and punctuation are
# handled by the notebook-level helpers instead.
cleaning_steps = (
    norm.remove_ascii_unicode,
    norm.remove_rt_fav,
    norm.lower_text,
    norm.remove_newline,
    norm.remove_url,
    norm.remove_emoticons,
    remove_hashmention,
    remove_punctuation,
)

text_norm = row
for apply_step in cleaning_steps:
    text_norm = apply_step(text_norm)

print(text_norm)

rencang muda tempat nongkrong yang ramai di boyolali  menurut kalian di mana  alun-alun simpang lima atau di mana    lanjut baca  


In [55]:
# Tokenize the normalised sample text, passing removepunct=True to the tokenizer.
toktext = tok.WordTokenize(text_norm, removepunct=True)
print(toktext)

['rencang', 'muda', 'tempat', 'nongkrong', 'yang', 'ramai', 'di', 'boyolali', 'menurut', 'kalian', 'di', 'mana', 'alun-alun', 'simpang', 'lima', 'atau', 'di', 'mana', 'lanjut', 'baca']


In [39]:
ss = SymSpell2(max_dictionary_edit_distance=3)

In [40]:
ss.load_complete_model_from_json("resource/corpus_complete_model.json",encoding="ISO-8859-1")

Loading dictionary...
Processing dictionary...
Copied 94815 words to master dictionary...
Copied 679555 hashes to master dictionary...


In [41]:
# Look every token up in the spelling dictionary. When suggestions come back,
# keep the part of the first suggestion's string form that precedes the first
# ':'; otherwise keep the token unchanged.
result = []
for word in toktext:
    candidates = ss.lookup(phrase=word, verbosity=1, max_edit_distance=2)
    if len(candidates) == 0:
        result.append(word)
        continue
    result.append(str(candidates[0]).partition(':')[0])

print(' '.join(result))

rencang muda tempat nongkrong yang ramai di boyolali menurut kalian di mana alun-alun simpang lima atau di mana


In [53]:
words = "a vavav"
# Extract the word tokens directly. Splitting on '(\w+)?' relies on a pattern
# that can match the empty string: that triggers a warning on older Pythons and
# yields None entries (which have no .strip()) on Python 3.7+.
tokens = re.findall(r'\w+', words)
# Keep lower-cased tokens longer than one character.
s = [w.lower() for w in tokens if len(w) > 1]
print(s)

['vavav']


  return _compile(pattern, flags).split(string, maxsplit)


In [None]:
import re
# Train Noisy Channel
# Read the whole UTF-8 training corpus into memory as one string.
with open('resource/opensubtitle.txt', encoding = 'utf-8') as f:
    text_id = f.read()

In [None]:
# Lower-case the corpus, turn newlines into spaces, then delete every run of
# characters other than a-z and the space.
text_id2 = re.sub(
    r'[^a-z ]+',
    '',
    text_id.lower().replace('\n', ' '),
)
# Distinct characters that remain in the cleaned corpus, in sorted order.
all_letters = ''.join(sorted(set(text_id2)))
print(repr(all_letters))

In [None]:
# Training sample for the abbreviation model: (original, abbreviated) pairs in
# which '-' marks a missing character. Each pair is repeated to weight it.
all_missing = (all_letters, '-' * len(all_letters))   # every char missing
none_missing = (all_letters, all_letters)             # no char missing
vowels_missing = ('aeiouy', '------')                 # only vowels missing
missing_set = [all_missing] * 3 + [none_missing] * 10 + [vowels_missing] * 30

In [None]:
%%time

# Train both models used by the noisy-channel corrector.
# Language model: fitted on the cleaned corpus string.
big_lang_m = LanguageNgramModel(order=4, smoothing=0.001, recursive=0.01)
big_lang_m.fit(text_id2)
# Missing-letter (error) model: fitted on the synthetic abbreviation samples.
big_err_m = MissingLetterModel(order=0, smoothing_missed=0.1)
big_err_m.fit(missing_set)

In [None]:
%%time

# without noisy channel
idx = 0
df_tweets['normalize'] = None
for row in df_tweets['text']:
    start_tweet = time.clock()
    # normalize
    norm = Normalize()
    text_norm = norm.remove_ascii_unicode(row)
    text_norm = norm.remove_rt_fav(text_norm)
    text_norm = norm.lower_text(text_norm)
    text_norm = norm.repeat_char_modify(text_norm)
    text_norm = norm.remove_elipsis(text_norm)
    text_norm = norm.remove_newline(text_norm)
    text_norm = norm.remove_url(text_norm)
    text_norm = norm.remove_emoticons(text_norm)
    text_norm = norm.remove_hashtags_mentions(text_norm)
    
    # tokenize
    tok = Tokenize()
    text_norm = tok.WordTokenize(text_norm)
    
    # spell correction
    temp_sentence = []
    for token in text_norm:
        choosen_word = symspell.get_suggestions(token)
        
        # option if there is no suggestions
        if len(choosen_word) > 0:
            temp_sentence.append(choosen_word)
        else:
            temp_sentence.append(token)
    
    text_norm = ' '.join(temp_sentence)
    df_tweets['normalize'][idx] = text_norm
    print('tweets', idx, 'selesai', time.clock()-start_tweet, 'seconds')
    idx += 1

In [None]:
%%time

idx = 0
df_tweets['normalize'] = None
for row in df_tweets['text']:
    start_tweet = time.clock()
    # normalize
    norm = Normalize()
    text_norm = norm.remove_ascii_unicode(row)
    text_norm = norm.remove_rt_fav(text_norm)
    text_norm = norm.lower_text(text_norm)
    text_norm = norm.repeat_char_modify(text_norm)
    text_norm = norm.remove_elipsis(text_norm)
    text_norm = norm.remove_newline(text_norm)
    text_norm = norm.remove_url(text_norm)
    text_norm = norm.remove_emoticons(text_norm)
    text_norm = norm.remove_hashtags_mentions(text_norm)
    
    # tokenize
    tok = Tokenize()
    text_norm = tok.WordTokenize(text_norm)
    
    # spell correction
    temp_sentence = []
    for token in text_norm:
        if len(token) <= 3 and (not(any(char.isdigit() for char in token))):
            nc = norm.noisy_channel(token, big_lang_m, big_err_m)
            max_values = max(nc.values())
            choosen_word = list(nc.keys())[list(nc.values()).index(max_values)]
        else:
            choosen_word = symspell.get_suggestions(token)
        
        # option if there is no sugestions
        if len(choosen_word) > 0:
            temp_sentence.append(choosen_word)
        else:
            temp_sentence.append(token)
    
    text_norm = ' '.join(temp_sentence)
    df_tweets['normalize'][idx] = text_norm
    print('tweets', idx, 'selesai', time.clock()-start_tweet, 'seconds')
    idx += 1

In [None]:
# Write only the normalised text column to normalize_export.csv, without a
# header row and without the index.
df_tweets['normalize'].to_csv('normalize_export.csv', header=False, index=False)

In [1]:
list_tweets = [[1, 'afaf'], [2, 'fafasfba'], [3, 'fasf kvams']]

In [4]:
for lt in list_tweets:
    print(lt[0])

1
2
3


In [10]:
# Append one [id, text] pair to the running list and display the list.
# NOTE(review): the saved output also contains nested entries that no visible
# cell creates, so it reflects earlier/out-of-order executions in this kernel.
bla = [1, 'faf']
list_tweets.append(bla)
list_tweets

[[1, 'afaf'],
 [2, 'fafasfba'],
 [3, 'fasf kvams'],
 [[4, 'afafzzz'], [5, 'faaazzfasfba']],
 [[4, 'afafzzz'], [5, 'faaazzfasfba']],
 [[4, 'afafzzz'], [5, 'faaazzfasfba']],
 [1, 'faf']]

In [8]:
# NOTE(review): `add` is not defined in any visible cell; on a fresh kernel this
# line raises NameError. Judging by the saved output it was a list of [id, text]
# pairs appended as a single element -- confirm and define it, or drop the cell.
list_tweets.append(add)

In [9]:
list_tweets

[[1, 'afaf'],
 [2, 'fafasfba'],
 [3, 'fasf kvams'],
 [[4, 'afafzzz'], [5, 'faaazzfasfba']],
 [[4, 'afafzzz'], [5, 'faaazzfasfba']],
 [[4, 'afafzzz'], [5, 'faaazzfasfba']]]