# Preprocessing
Steps performed:
1. Common Preprocessing
    1. Remove ASCII and Unicode.
    2. Tokenization
    3. Case folding, convert into lowercase
    4. Repeated dot (sedih... -> sedih.)
    5. Repeated character ('hehe :)))' -> 'hehe :)')
    6. Remove ellipsis (lanjut baca... -> lanjut baca)
    7. Repeated word that has meaning ('malam malam' -> 'malam-malam')
    8. Remove newline
2. Specific Preprocessing
    1. Special symbols on Twitter, removing hashtag, mention, RT, and FAV
    2. Remove all emoticons
    3. Remove URL
    4. Spell checker using noisy channel approach

In [None]:
import pandas as pd

In [None]:
# Load the exported tweets from export.csv into a DataFrame and display its
# (rows, columns) shape as the cell output.
df_tweets = pd.read_csv('export.csv')
df_tweets.shape

In [None]:
# Preview the first rows of the loaded tweets.
df_tweets.head()

In [None]:
# Inspect the first 'created_at' value split on spaces
# (the next cell parses it with the format '%Y-%m-%d %H:%M:%S').
df_tweets['created_at'][0].split(' ')

In [None]:
from datetime import datetime, timedelta

# Scratch cell: parse the first tweet timestamp and experiment with
# datetime arithmetic and comparison.
datetime_object = datetime.strptime(
    df_tweets['created_at'][0],
    '%Y-%m-%d %H:%M:%S',
)
d = datetime.today() - timedelta(days=100)
n = datetime.today()
nn = datetime.today()

# One print call, one value per line: the date/time parts of `d`, the parsed
# timestamp, then the two consecutive "now" readings.
print(str(d).split(' '), datetime_object, n, nn, sep='\n')

# A date ten days before the current moment.
now = datetime.today()
date_before = now - timedelta(days=10)
print(date_before)

# `nn` was sampled after `n`, so this reports whether the clock advanced
# between the two calls.
if nn > n:
    print('a')

In [1]:
from modulenorm.Normalize import Normalize
from modulenorm.Tokenize import Tokenize
from modulenorm.SymSpell import SymSpell
from modulenorm.SymSpell2 import SymSpell2
from modulenorm.LanguageNgramModel import LanguageNgramModel
from modulenorm.MissingLetterModel import MissingLetterModel
import time

In [3]:
# Smoke-test the tokenizer on a noisy sample string, with punctuation
# removal switched on, and show the resulting tokens.
tok = Tokenize()
toktext = tok.WordTokenize(
    'faksfnSAFk knaskvna msalkfm askfmlas #kvadnklcasc#%#%.',
    removepunct=True,
)
print(toktext)

['faksfnsafk', 'knaskvna', 'msalkfm', 'askfmlas', 'kvadnklcasc']


In [None]:
%%time
# SymSpell Create Dictionary
# Instantiate SymSpell2 and build its dictionary from resource/kamus.txt.
# The %%time cell magic above reports how long the cell takes to run.
symspell2 = SymSpell2()
symspell2.create_dictionary("resource/kamus.txt")

In [None]:
import json

# Call clean_and_create_dictionary on resource/kamus.txt with a fresh
# SymSpell2 instance and save whatever it returns as JSON text in file2.txt.
symspell2 = SymSpell2()
a = symspell2.clean_and_create_dictionary("resource/kamus.txt")

with open('file2.txt', mode='w') as file:
    file.write(json.dumps(a))

In [None]:
import pickle

# Build the SymSpell dictionary from resource/novel.txt and pickle the object
# returned by create_dictionary into file.pkl.
# NOTE(review): `symspell` is not created in any cell visible in this
# notebook (only `symspell2` is) — confirm where it is instantiated, otherwise
# this cell raises NameError on a fresh kernel.
a = symspell.create_dictionary("resource/novel.txt")

# The context manager closes the file even if pickle.dump raises, which the
# previous manual open()/close() pair did not guarantee.
with open("file.pkl", "wb") as f:
    pickle.dump(a, f)

In [None]:
# Debug aid: show which class the current `symspell` object is.
print(type(symspell))

In [None]:
import re

# Train Noisy Channel
# Read the whole of resource/opensubtitle.txt (UTF-8) into memory as one string.
with open('resource/opensubtitle.txt', mode='r', encoding='utf-8') as f:
    text_id = f.read()

In [None]:
# Keep only lowercase a-z and spaces: newlines become spaces, the text is
# lowercased, and every other character is dropped.
text_id2 = re.sub(r'[^a-z ]+', '', text_id.replace('\n', ' ').lower())

# The distinct characters that survive the cleanup, in sorted order.
all_letters = ''.join(sorted(set(text_id2)))
print(repr(all_letters))

In [None]:
# Training samples for the abbreviation model: (original, abbreviated) pairs
# in which '-' marks a missing character. Repeating a pair weights it.
missing_set = [
    # every character missing
    *[(all_letters, '-' * len(all_letters))] * 3,
    # no character missing
    *[(all_letters, all_letters)] * 10,
    # only vowels missing
    *[('aeiouy', '------')] * 30,
]

In [None]:
%%time

# Train both models used by the noisy channel:
#   * big_lang_m - n-gram language model, fitted on the cleaned corpus text
#   * big_err_m  - missing-letter model, fitted on the hand-built samples
# The %%time cell magic above reports how long training takes.
big_lang_m = LanguageNgramModel(order=4, smoothing=0.001, recursive=0.01)
big_lang_m.fit(text_id2)
big_err_m = MissingLetterModel(order=0, smoothing_missed=0.1)
big_err_m.fit(missing_set)

In [None]:
%%time

# without noisy channel
idx = 0
df_tweets['normalize'] = None
for row in df_tweets['text']:
    start_tweet = time.clock()
    # normalize
    norm = Normalize()
    text_norm = norm.remove_ascii_unicode(row)
    text_norm = norm.remove_rt_fav(text_norm)
    text_norm = norm.lower_text(text_norm)
    text_norm = norm.repeat_char_modify(text_norm)
    text_norm = norm.remove_elipsis(text_norm)
    text_norm = norm.remove_newline(text_norm)
    text_norm = norm.remove_url(text_norm)
    text_norm = norm.remove_emoticons(text_norm)
    text_norm = norm.remove_hashtags_mentions(text_norm)
    
    # tokenize
    tok = Tokenize()
    text_norm = tok.WordTokenize(text_norm)
    
    # spell correction
    temp_sentence = []
    for token in text_norm:
        choosen_word = symspell.get_suggestions(token)
        
        # option if there is no suggestions
        if len(choosen_word) > 0:
            temp_sentence.append(choosen_word)
        else:
            temp_sentence.append(token)
    
    text_norm = ' '.join(temp_sentence)
    df_tweets['normalize'][idx] = text_norm
    print('tweets', idx, 'selesai', time.clock()-start_tweet, 'seconds')
    idx += 1

In [None]:
%%time

idx = 0
df_tweets['normalize'] = None
for row in df_tweets['text']:
    start_tweet = time.clock()
    # normalize
    norm = Normalize()
    text_norm = norm.remove_ascii_unicode(row)
    text_norm = norm.remove_rt_fav(text_norm)
    text_norm = norm.lower_text(text_norm)
    text_norm = norm.repeat_char_modify(text_norm)
    text_norm = norm.remove_elipsis(text_norm)
    text_norm = norm.remove_newline(text_norm)
    text_norm = norm.remove_url(text_norm)
    text_norm = norm.remove_emoticons(text_norm)
    text_norm = norm.remove_hashtags_mentions(text_norm)
    
    # tokenize
    tok = Tokenize()
    text_norm = tok.WordTokenize(text_norm)
    
    # spell correction
    temp_sentence = []
    for token in text_norm:
        if len(token) <= 3 and (not(any(char.isdigit() for char in token))):
            nc = norm.noisy_channel(token, big_lang_m, big_err_m)
            max_values = max(nc.values())
            choosen_word = list(nc.keys())[list(nc.values()).index(max_values)]
        else:
            choosen_word = symspell.get_suggestions(token)
        
        # option if there is no sugestions
        if len(choosen_word) > 0:
            temp_sentence.append(choosen_word)
        else:
            temp_sentence.append(token)
    
    text_norm = ' '.join(temp_sentence)
    df_tweets['normalize'][idx] = text_norm
    print('tweets', idx, 'selesai', time.clock()-start_tweet, 'seconds')
    idx += 1

In [None]:
# Save the normalized tweets to normalize_export.csv, one per row,
# with no header line and no index column.
df_tweets['normalize'].to_csv('normalize_export.csv', header=False, index=False)