# DATA PRE-PROCESSING

### 1. Load CSV to DataFrame

In [3]:
import pandas as pd
import re
import nltk

# pd.set_option('display.max_rows', None)
# Load the raw tweet export, discard the columns the pipeline does not use,
# put the remaining columns in a fixed order and sort the rows by timestamp.
df = (
    pd.read_csv('data_tweets.csv', encoding='utf-8')
    .drop(columns=[
        'is_replied', 'reply_to_users', 'parent_tweet_id', 'username',
        'text_html', 'timestamp_epochs', 'tweet_url', 'user_id',
        'is_reply_to', 'hashtags', 'links', 'has_media', 'img_urls',
        'video_url',
    ])
    .reindex(columns=[
        'tweet_id', 'screen_name', 'timestamp', 'text',
        'likes', 'retweets', 'replies',
    ])
    .sort_values(by='timestamp', ascending=True)
)

tweet_id        int64
screen_name    object
timestamp      object
text           object
likes           int64
retweets        int64
replies         int64
dtype: object

### 2. Text Cleaning

In [2]:
# Text cleaning for the `text` column.
# The steps are order-sensitive: links are removed before dots are replaced
# (URLs contain dots), and mentions/hashtags are removed before the
# letters-only filter would strip their '@' / '#' markers.

# Case folding
df['text'] = df['text'].str.lower()

# Remove digits
df['text'] = df['text'].replace({r"\d+": ''}, regex=True)

# Remove links (http/https URLs, then bare pic.twitter.com media links)
df['text'] = df['text'].replace({r"http\S+": ''}, regex=True)
df['text'] = df['text'].replace(
    {r"pic\.twitter\.com[A-Za-z0-9!”#$%&’()*+,-./:;<=>?@[\]^_`{|}~]+": ''},
    regex=True,
)

# Replace dots with a space so words joined by a dot stay separated
df['text'] = df['text'].replace({r"[.]": ' '}, regex=True)

# Remove mentions
df['text'] = df['text'].replace({r'@[A-Za-z0-9_]+': ''}, regex=True)

# Remove hashtags
df['text'] = df['text'].replace({r'#[A-Za-z0-9_]+': ''}, regex=True)

# Replace carriage returns and newlines with a space
df['text'] = df['text'].replace({r"[\r\n]": ' '}, regex=True)

# Replace every character that is not a lowercase letter or whitespace with a space
df['text'] = df['text'].replace({r'[^a-z\s]': ' '}, regex=True)

# Remove words of three characters or fewer
df['text'] = df['text'].replace({r'\b\w{1,3}\b': ''}, regex=True)

# Collapse runs of spaces and trim leading/trailing whitespace
df['text'] = df['text'].replace({r' +': ' '}, regex=True)
df['text'] = df['text'].str.strip()

# Remove duplicate tweets. keep='first' retains one row per tweet_id;
# keep=False would discard every copy of a duplicated tweet.
df = df.drop_duplicates(subset="tweet_id", keep='first')

# Drop rows whose text is empty or missing after cleaning
df = df[df['text'] != '']
df = df.dropna(subset=['text'])

df

Unnamed: 0,tweet_id,screen_name,timestamp,text,likes,retweets,replies
1855,1079908816664244224,UMMcampus,2019-01-01 01:15:00,strengthening interfaith tolerance kaca visits...,3,0,0
1854,1079954367065669632,UMMcampus,2019-01-01 04:16:00,strengthen branding gresik typical food with f...,4,4,0
1853,1079985069035610112,UMMcampus,2019-01-01 06:18:00,year batu mayor sociology holds dialogue with ...,0,0,0
1852,1080009658172039168,YuniarDwi_R,2019-01-01 07:55:42,dipanggil arap,0,0,0
1851,1080111915156725760,lilianawlr,2019-01-01 14:42:02,hiyaaaa,0,0,0
...,...,...,...,...,...,...,...
1543,1204380548103229440,kedonyan,2019-12-10 12:41:13,sopan,0,0,0
1542,1204423283287871488,antakentang,2019-12-10 15:31:02,dong,0,0,0
1540,1204591633045569538,UMMcampus,2019-12-11 02:40:00,semnas bahas masa depan fintech indonesia,3,0,0
1539,1204599004043939840,UMMcampus,2019-12-11 03:09:17,ciptakan helm pintar mahasiswa sabet bronze me...,11,1,0


### 3. Stopword Removal

In [4]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Build the Sastrawi stop-word remover once and reuse it for every tweet.
stopword = StopWordRemoverFactory().create_stop_word_remover()

# Only the text column is transformed, so map over that Series directly
# instead of applying a lambda to every row of the frame.
df['text'] = df['text'].map(stopword.remove)

df

Unnamed: 0,tweet_id,screen_name,timestamp,text,likes,retweets,replies
1855,1079908816664244224,UMMcampus,2019-01-01 01:15:00,strengthening interfaith tolerance kaca visits...,3,0,0
1854,1079954367065669632,UMMcampus,2019-01-01 04:16:00,strengthen branding gresik typical food with f...,4,4,0
1853,1079985069035610112,UMMcampus,2019-01-01 06:18:00,year batu mayor sociology holds dialogue with ...,0,0,0
1852,1080009658172039168,YuniarDwi_R,2019-01-01 07:55:42,dipanggil arap,0,0,0
1851,1080111915156725760,lilianawlr,2019-01-01 14:42:02,hiyaaaa,0,0,0
...,...,...,...,...,...,...,...
1543,1204380548103229440,kedonyan,2019-12-10 12:41:13,sopan,0,0,0
1542,1204423283287871488,antakentang,2019-12-10 15:31:02,dong,0,0,0
1540,1204591633045569538,UMMcampus,2019-12-11 02:40:00,semnas bahas masa depan fintech indonesia,3,0,0
1539,1204599004043939840,UMMcampus,2019-12-11 03:09:17,ciptakan helm pintar mahasiswa sabet bronze me...,11,1,0


### 4. Stemming

In [5]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer once and reuse it for every tweet.
stemmer = StemmerFactory().create_stemmer()

# Only the text column is transformed, so map over that Series directly
# instead of applying a lambda to every row of the frame.
df['text'] = df['text'].map(stemmer.stem)

df

Unnamed: 0,tweet_id,screen_name,timestamp,text,likes,retweets,replies
1855,1079908816664244224,UMMcampus,2019-01-01 01:15:00,strengthening interfaith tolerance kaca visits...,3,0,0
1854,1079954367065669632,UMMcampus,2019-01-01 04:16:00,strengthen branding gresik typical food with f...,4,4,0
1853,1079985069035610112,UMMcampus,2019-01-01 06:18:00,year batu mayor sociology holds dialogue with ...,0,0,0
1852,1080009658172039168,YuniarDwi_R,2019-01-01 07:55:42,panggil arap,0,0,0
1851,1080111915156725760,lilianawlr,2019-01-01 14:42:02,hiyaaaa,0,0,0
...,...,...,...,...,...,...,...
1543,1204380548103229440,kedonyan,2019-12-10 12:41:13,sopan,0,0,0
1542,1204423283287871488,antakentang,2019-12-10 15:31:02,dong,0,0,0
1540,1204591633045569538,UMMcampus,2019-12-11 02:40:00,semnas bahas masa depan fintech indonesia,3,0,0
1539,1204599004043939840,UMMcampus,2019-12-11 03:09:17,cipta helm pintar mahasiswa sabet bronze medal...,11,1,0


### 5. Tokenizing

In [6]:
from nltk.tokenize import word_tokenize 
# Drop rows whose text is empty or missing; stop-word removal and stemming
# can leave a tweet with no remaining words.
df = df[df['text'] != '']
df = df.dropna(subset=['text'])

# Tokenize each tweet into a list of words, using the word_tokenize name
# imported at the top of this cell.
df['text'] = df['text'].map(word_tokenize)
df

Unnamed: 0,tweet_id,screen_name,timestamp,text,likes,retweets,replies
1855,1079908816664244224,UMMcampus,2019-01-01 01:15:00,"[strengthening, interfaith, tolerance, kaca, v...",3,0,0
1854,1079954367065669632,UMMcampus,2019-01-01 04:16:00,"[strengthen, branding, gresik, typical, food, ...",4,4,0
1853,1079985069035610112,UMMcampus,2019-01-01 06:18:00,"[year, batu, mayor, sociology, holds, dialogue...",0,0,0
1852,1080009658172039168,YuniarDwi_R,2019-01-01 07:55:42,"[panggil, arap]",0,0,0
1851,1080111915156725760,lilianawlr,2019-01-01 14:42:02,[hiyaaaa],0,0,0
...,...,...,...,...,...,...,...
1543,1204380548103229440,kedonyan,2019-12-10 12:41:13,[sopan],0,0,0
1542,1204423283287871488,antakentang,2019-12-10 15:31:02,[dong],0,0,0
1540,1204591633045569538,UMMcampus,2019-12-11 02:40:00,"[semnas, bahas, masa, depan, fintech, indonesia]",3,0,0
1539,1204599004043939840,UMMcampus,2019-12-11 03:09:17,"[cipta, helm, pintar, mahasiswa, sabet, bronze...",11,1,0


### 6. Export to CSV

In [7]:
# Write the pre-processed tweets to disk without the DataFrame index.
# to_csv returns None when given a path, so the result is not stored.
# Note: the tokenized `text` column holds Python lists, which are written
# to the CSV as their string representation.
df.to_csv(r'data_preprocessing.csv', index=False, header=True)