In [1]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
current_directory = os.path.dirname(os.path.abspath("Data_Cleaning.ipynb"))

In [3]:
#Fucntion to Clean tweet data
def Clean(text):
    #lowercase for every word
    text = text.lower()

    #Clean Pattern
    #remove USER
    text = re.sub(r'user', ' ', text)
    #remove 'RT'
    text = re.sub(r'rt', ' ', text)
    #remove 'URL'
    text = re.sub(r'url', ' ', text)
    #remove HTTPS
    text = re.sub(r'https', ' ', text)
    #remove HTTP
    text = re.sub(r'http', ' ', text)
    #remove &amp
    text = re.sub(r'&amp', ' ', text)

    #Clean_Unnecessary_Character
    #remove \n or every word afte '\' with space
    text = re.sub(r'\\+[a-zA-Z0-9]+', ' ', text)
    #remove text emoji
    text = re.sub(r'[^a-zA-Z0-9\s]{2,}|:[a-zA-Z0-9]{0,}', ' ', text)
    #remove all unnecessary character 
    text = re.sub(r'[^0-9a-zA-Z\s]+', ' ', text)
    #remove all number
    text = re.sub(r'[0-9]+', ' ', text)
    #remove extra space
    text = re.sub(r'  +', ' ', text)
    #remove space at the start or the end of string
    text = re.sub(r'^ +| +$', '', text)
    
    return text

#tokenization Function
def tokenization(text):
    text = re.split('\W+', text)
    return text

#import file new_kamusalay.csv
kamus_alay = pd.read_csv(r"E:\BINAR\Binar-Gold-Challenge\Dataset\new_kamusalay.csv", 
                         encoding = 'ISO-8859-1', header = None)
kamus_alay = kamus_alay.rename(columns={0: 'kata alay', 1: 'arti kata'})

#Create dictionary from kamus_alay
kamus_alay_dict = dict(zip(kamus_alay['kata alay'], kamus_alay['arti kata']))

#normalization function to convert every word tha contain 'kata alay' to 'arti kata'
def normalization(text):
    newlist = []
    for word in text:
        if word in kamus_alay_dict:
            text = kamus_alay_dict[word]
            newlist.append(text)
        else:
            text = word
            newlist.append(text)
    return newlist

#remove stopwords

#list stopword from NLTK
stopword_list = ['yang', 'untuk', 'pada', 'ke', 'para', 'namun', 'menurut', 'antara', 
                 'dia', 'dua', 'ia','ia', 'seperti', 'jika', 'sehingga', 'kembali', 'dan', 
                 'ini', 'karena', 'kepada', 'oleh', 'saat', 'sementara', 'setelah', 'kami', 
                 'sekitar', 'bagi', 'serta', 'di', 'dari', 'telah', 'sebagai', 'masih', 'hal', 
                 'ketika', 'adalah', 'itu', 'dalam', 'bahwa', 'atau', 'kita', 'dengan', 'akan', 
                 'juga', 'ada', 'mereka', 'sudah', 'saya', 'terhadap', 'secara', 'agar', 'lain', 
                 'anda', 'begitu', 'mengapa', 'kenapa', 'yaitu', 'yakni', 'daripada', 'itulah', 
                 'lagi', 'maka', 'tentang', 'demi', 'dimana', 'kemana', 'pula', 'sambil', 'sebelum', 
                 'sesudah', 'supaya', 'guna', 'kah', 'pun', 'sampai', 'sedangkan', 'selagi', 'sementara', 
                 'tetapi', 'apakah', 'kecuali', 'sebab', 'seolah', 'seraya', 'seterusnya', 'dsb', 'dst', 
                 'dll', 'dahulu', 'dulunya', 'anu', 'demikian', 'mari', 'nanti', 'oh', 'ok', 'setiap', 
                 'sesuatu','saja', 'toh', 'walau', 'amat', 'apalagi', 'dengan', 'bahwa', 'oleh']

stopword_list.extend(["yg", "dg", "rt", "dgn", "ny", "d", 'klo',
                       'kalo', 'amp', 'biar', 'bikin', 'bilang',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       'gue', 'yah', 'kayak'])

stopword_list = set(stopword_list)

#remove stopword function
def remove_stopwords(text):
    text = [word for word in text if word not in stopword_list]
    return text

#Find NUll 'String' Value 
def clean_non_existed(text):
    if text == '':
        return None
    else:
        return text

#function to run all the function
def clean_data(text):
    text = Clean(text)
    text = tokenization(text)
    text = normalization(text)
    text = remove_stopwords(text)
    text = ' '.join(text)
    text = clean_non_existed(text)

    return text

# Data Train

In [5]:
df_train = pd.read_table(current_directory + "\\Dataset\\train_preprocess.tsv.txt", sep='\t', header=None)
df_train = df_train.rename(columns={0: 'text', 1: 'label'})
df_train.head()

Unnamed: 0,text,label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


## Data Cleaning

In [6]:
df_train['text_clean'] = df_train.text.apply(clean_data)
df_train.head()

Unnamed: 0,text,label,text_clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung dimiliki pengusaha pabrik tahu puluhan ...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus k mmbri hujjah pak ai apa ha...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis jalan sumatra bandung tempat ...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia diri unboxing paket barang bagu...
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,aduh jadi mahasiswa jangan sombong dong kasih ...


In [7]:
df_train['label'].value_counts()

label
positive    6416
negative    3436
neutral     1148
Name: count, dtype: int64

## Drop Duplicated & Missing Value

In [8]:
df = df.dropna()
df = df.drop_duplicates(subset='text_clean')

In [9]:
df_train['label'].value_counts()

label
positive    6383
negative    3412
neutral     1138
Name: count, dtype: int64

## Saving Clean Data

In [11]:
df_train.to_csv("Clean_train_preprocess.csv", index=False)

# Data Prediction

In [19]:
df_tweet= pd.read_csv(current_directory + "\Dataset\data.csv", encoding='ISO-8859-1')
df_tweet.head()

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0


## Data Cleaning

In [20]:
df_tweet['Tweet'] = df_tweet.Tweet.apply(clean_data)
df_tweet.head()

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,di saat semua cowok berusaha melacak perhatian...,1,1,1,0,0,0,0,0,1,1,0,0
1,siapa telat memberi kamu edan sarap bergaul ci...,0,1,0,0,0,0,0,0,0,0,0,0
2,kadang aku berpikir aku tetap percaya tuhan pa...,0,0,0,0,0,0,0,0,0,0,0,0
3,aku aku matamu sipit tapi dilihat mana aku,0,0,0,0,0,0,0,0,0,0,0,0
4,kaum cebong kafir kelihatan dongoknya awal tam...,1,1,0,1,1,0,0,0,0,0,1,0


## Drop Duplicated & Missing Value

In [21]:
df_tweet.shape

(13169, 13)

In [22]:
df_tweet = df.dropna()
df_tweet = df.drop_duplicates(subset='Tweet')

In [23]:
df_tweet.shape

(12833, 13)

## Saving Clean Data

In [24]:
df_tweet.to_csv("Clean_data_Tweet.csv", index=False)