#**Twitter Sentiment Analysis During Indonesia Presidential Election 2014**
Post-Test for DSI Melek Sentiment Online Workshop


In [448]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [449]:
HOME_DIR = '/content/drive/My Drive/dataset/dsi_sentimen/'

In [450]:
import pandas as pd

**Load Data Tweet**

In [451]:
df = pd.read_csv(HOME_DIR+'data/Capres2014-2.0.csv',encoding='latin1')
df_ori = df.copy()
df = df[['Isi_Tweet','Sentimen']]

In [452]:
df['Isi_Tweet'].head(10).values

array(['@FahrelStv Gak setuju Jokowi jadi Cawapres.. CAPRES JOKOWI harga mati...',
       'capres jokowi,wacapres abraham samad. gubernur ahok. gua yakin koruptor abissss !',
       'RT @RintisBautista: capres jokowi,wacapres abraham samad. gubernur ahok. gua yakin koruptor abissss !',
       'capres ( Prabowo ) and cawapres ( jokowi ) and gubDKI (Ahok) mantap lanjutkan !!! buat pak presiden (SBY) bubarkan saja FPI',
       'RT @Franliiiii: capres ( Prabowo ) and cawapres ( jokowi ) and gubDKI (Ahok) mantap lanjutkan !!! buat pak presiden (SBY) bubarkan saja FPI',
       'jd skenarionya gini. 2014 biar Prabowo jd presiden, Jokowi tetepgubernur. kalo Jakarta berhasil gak usah nunggu 2019 buat gantiin Prabowo',
       'SBY mantan TNI, dan Calon Presiden Prabowo subianto adalah Mantan KOPASSUS.. Krn anggoto TNI disiplin.. Maka dari itu, smw presiden TEGAS!!!',
       '@Prabowo08 mohon untuk pak Presiden Prabowo Subiyanto, tolong jangan jadikan kami "Perangkat Desa" sebagai tumbal kebijaka

**Load Data Stopwords**

In [453]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [454]:
from nltk.corpus import stopwords
sw = stopwords.words('indonesian')
sw.extend(stopwords.words('english'))

In [455]:
# load stopwords indonesia
stp = open(HOME_DIR+'data/twitter_stp.dic')
stp_words = []
item = stp.readline()
while item != '':
  stp_words.append(item[:-1])
  item = stp.readline()  

In [456]:
stp_words[0:10]

['a', 'ada', 'adalah', 'ah', 'akan', 'aku', 'all', 'an', 'and', 'apa']

In [457]:
len(stp_words)

987

In [458]:
sw += stp_words
len(sw)

1924

In [459]:
sw = set(sw)

In [460]:
len(sw)

1419

**Load Data Kata Singkatan**

In [461]:
# load data kata singkatan
stp = open(HOME_DIR+'data/singkatankata.dic')
kata_asli = []
singkatan = []
dict_singkatan = {}
item = stp.readline()
while item != '':
  singkatan.append(item[:-1].split('\t')[0])
  kata_asli.append(item[:-1].split('\t')[-1])
  dict_singkatan[item[:-1].split('\t')[0].lower()] = item[:-1].split('\t')[-1].lower()
  item = stp.readline()

In [462]:
# dict_singkatan

In [463]:
# kata_asli[0:5]

In [464]:
# singkatan[0:5]

**Load Data Kata Dasar dan Sentimen**

In [465]:
df_seed = pd.read_csv(HOME_DIR+'data/seed_all.csv')
df_seed.columns = df.columns

In [466]:
df_seed.head()

Unnamed: 0,Isi_Tweet,Sentimen
0,diakui,positif
1,penghargaan,positif
2,kecerdasan,positif
3,kekaguman,positif
4,mengagumi,positif


In [467]:
df_seed['Sentimen'].value_counts()

negatif    517
positif    291
Name: Sentimen, dtype: int64

In [468]:
dict_label = {'Sentimen':{'positif': 1,
                            'negatif': -1}}

In [469]:
df_seed.replace(dict_label, inplace=True)
df_seed['Sentimen'].value_counts()

-1    517
 1    291
Name: Sentimen, dtype: int64

In [470]:
df_seed.head()

Unnamed: 0,Isi_Tweet,Sentimen
0,diakui,1
1,penghargaan,1
2,kecerdasan,1
3,kekaguman,1
4,mengagumi,1


In [471]:
len(df_seed)

808

In [472]:
len(df)

3356

In [473]:
df_process = df.append(df_seed, ignore_index = True) 

In [474]:
len(df_process)

4164

In [475]:
df_process['Sentimen'].value_counts()

 0    1467
 1    1415
-1    1282
Name: Sentimen, dtype: int64

In [476]:
# mengubah semua value ke lowercase
df_process['Isi_Tweet'] = df_process['Isi_Tweet'].str.lower()

In [477]:
df_process.head()

Unnamed: 0,Isi_Tweet,Sentimen
0,@fahrelstv gak setuju jokowi jadi cawapres.. c...,1
1,"capres jokowi,wacapres abraham samad. gubernur...",1
2,"rt @rintisbautista: capres jokowi,wacapres abr...",1
3,capres ( prabowo ) and cawapres ( jokowi ) and...,1
4,rt @franliiiii: capres ( prabowo ) and cawapre...,1


In [478]:
# mengubah kata singkatan menjadi kata asli
def expand_dict(x, dct):
  tw = x.split()
  return ' '.join([dct.get(item, item) for item in tw])

df_process['Isi_Tweet'] = df_process['Isi_Tweet'].apply(lambda x: expand_dict(x,dict_singkatan))
df_process_original = df_process.copy()

**Inisiasi Vectorizer**

In [479]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# vect = TfidfVectorizer(stop_words=sw)
# vect = CountVectorizer(stop_words=sw)
vect = TfidfVectorizer(min_df=0.005, max_df=0.99, stop_words=sw, token_pattern='\\b[a-zA-Z][a-zA-Z][a-zA-Z]+ \\b')


x = df_process['Isi_Tweet']
y = df_process['Sentimen']

# vectorizer
x = vect.fit_transform(x)

# secara default 75 25
x_train, x_test, y_train, y_test = train_test_split(x,y,stratify=y)

**Inisiasi Model**

In [480]:
# machine learning model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

**Evaluasi Model**

In [481]:
from sklearn.metrics import classification_report
y_pred = model.predict(x_test)

print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

          -1       0.81      0.69      0.74       374
           0       0.83      0.73      0.78       417
           1       0.54      0.76      0.63       250

    accuracy                           0.73      1041
   macro avg       0.73      0.73      0.72      1041
weighted avg       0.75      0.73      0.73      1041



In [482]:
from sklearn.model_selection import cross_val_score
# x = pd.DataFrame(x.todense(), columns=vect.get_feature_names())
cross_val_score(model,x,y,cv=5).mean()

0.42579531812725085

**Hyperparameter Tuning**

In [483]:
# from sklearn.model_selection import GridSearchCV
# param_grid = {'criterion' : ['gini','entropy'],
#               'max_features' : ['auto','sqrt','log2'],
#               'n_estimators' : [100,200,300],
#               'class_weight' : ['balanced','balanced_subsample']}x  
# gsv = GridSearchCV(RandomForestClassifier(),param_grid=param_grid,n_jobs=4,cv=5)
# gsv.fit(x,y)
# print(gsv.best_params_)
# print(gsv.best_score_)

**Inisiasi Model dengan hasil dari hyperparameter tuning**

In [484]:
model = RandomForestClassifier(class_weight='balanced',
                               criterion = 'entropy',
                               max_features = 'sqrt',
                               n_estimators = 300)
# model = RandomForestClassifier(**gsv.best_params_)
model.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='entropy', max_depth=None, max_features='sqrt',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

**Evaluasi Model**

In [485]:
y_pred = model.predict(x_test)
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

          -1       0.80      0.69      0.74       368
           0       0.83      0.76      0.79       405
           1       0.58      0.77      0.66       268

    accuracy                           0.74      1041
   macro avg       0.74      0.74      0.73      1041
weighted avg       0.76      0.74      0.74      1041



In [None]:
cross_val_score(model,x,y,cv=5).mean()

Hasil akhir yang didapatkan akurasinya cukup rendah kedepannya mungkin bisa dikembangkan dengan **menggunakan lebih banyak data** lagi dan juga menggunakan **algoritma lain** dengan tujuan untuk mengetahui mana hasil yang terbaik.

**Pipeline**

In [None]:
from sklearn.pipeline import Pipeline
# bisa otomatis fit
pipeline = Pipeline([
                     ('vect',vect),
                     ('clf',model)
])

In [None]:
x = df_process['Isi_Tweet']
y = df_process['Sentimen']

# secara default 75 25
x_train, x_test, y_train, y_test = train_test_split(x,y,stratify=y)

In [None]:
pipeline.fit(x_train,y_train)

In [None]:
y_pred = pipeline.predict(x_test)
print(classification_report(y_pred,y_test))

**Export pickle**

In [None]:
import pickle

In [None]:
# dump pipeline to pickle
pickle.dump(pipeline,open('sentiment_model.pkl', 'wb'))

In [None]:
# load the model from disk
loaded_model = pickle.load(open('sentiment_model.pkl', 'rb'))
loaded_model.fit(x_train,y_train)

In [None]:
y_pred = pipeline.predict(x_test)
print(classification_report(y_pred,y_test))

In [None]:
result = loaded_model.score(x_test,y_test)
print(result)

In [None]:
result = loaded_model.score(x_train,y_train)
print(result)

In [None]:
# End

#**Visualisasi**

In [None]:
# df_process_original.copy().head()
dfp = df_process_original.copy()

In [None]:
list_user = []
list_word = []
import re
for i in dfp['Isi_Tweet']:
  for j in i.split():
    if '@' in j:
      list_user.append(j.replace(':',''))
    else:
      if j != '':
        list_word.append(re.sub('[^A-Za-z0-9]+', '', j))
len(list_user)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
srs = pd.DataFrame({'username' : list_user})['username'].value_counts()
srs.head(5).sort_values(ascending=True).plot(kind='barh',figsize=(15,5),title='Akun Twitter Yang Paling Sering Disebut')

In [None]:
srs = df['Sentimen'].value_counts().rename({0: 'Neutral', 1: 'Positive',-1:'Negative'})
srs.plot(kind='bar',title='Sentimen',rot=0)

In [None]:
lw = []
for i in list_word:
  if i not in sw:
    lw.append(i)
len(lw)
srs = pd.DataFrame({'word' : lw})['word'].value_counts().drop(labels=[''])
srs.head(10).sort_values(ascending=True).plot(kind='barh',figsize=(15,5),title='Kata yang paling sering muncul')

In [None]:
from wordcloud import WordCloud

In [None]:
dff = pd.DataFrame({'word' : lw})

In [None]:
wordcloud2 = WordCloud(width=800, height=400).generate(' '.join(dff['word']))
plt.imshow(wordcloud2)
plt.axis("off")
plt.show()

In [None]:
srs.index

In [None]:
df.Sentimen.values

In [None]:
import numpy as np
np.array(lw)

In [None]:
len(list_word)

**Experiment**

In [None]:
aa = ' '.join(df['Isi_Tweet'].loc[df['Sentimen'] == -1].values.tolist())
bb = ' '.join(df['Isi_Tweet'].loc[df['Sentimen'] == 1].values.tolist())
cc = ' '.join(df['Isi_Tweet'].loc[df['Sentimen'] == 0].values.tolist())
dd = ' '.join(df_seed['Isi_Tweet'].loc[df_seed['Sentimen'] == -1][0:500].values.tolist())

In [None]:
dd

In [None]:
# inverence
new_tweet = [aa,
             bb,
             cc,
             dd,
             'pedih masam memalukan membenci kebencian mengerikan kasar pelupa penyalahgunaan']
pipeline.predict(new_tweet)

In [None]:
loaded_model.predict(new_tweet)

In [None]:
df_seed['Isi_Tweet'].loc[df['Sentimen'] == -1][0:10]

**Trash**

In [None]:
# vect.get_feature_names()
# srs.head()
# srs = pd.DataFrame(x.todense(), columns=vect.get_feature_names())
# # # load the model from disk
# # loaded_model = pickle.load(open(HOME_DIR+'sentiment_model.pkl', 'rb'))
# # result = loaded_model.score(df['Isi_Tweet'][0:200],df['Sentimen'][0:200])
# # print(result)
# # import pickle
# # # dump pipeline to pickle
# # pickle.dump(pipeline,open(HOME_DIR+'sentiment_model.pkl', 'wb'))
# # # pickle.dump(pipeline,open('sentiment_model.pkl', 'wb'))
# from sklearn.metrics import classification_report
# print(classification_report(y_pred,y_test))
# y_pred = pipeline.predict(x_test)
# pipeline.fit(df['Isi_Tweet'],df['Sentimen'])
# # pipeline.fit(x_train,y_train)
# param_grid = {'n_neighbors': np.arange(3,100)}
# gsv = GridSearchCV(KNeighborsClassifier(),param_grid=param_grid,n_jobs=4,cv=5)
# gsv.fit(x,y)
# print(gsv.best_params_)
# print(gsv.best_score_)
# model = RandomForestClassifier(criterion='entropy',max_features='auto')
# model.fit(x,y)
# from sklearn.metrics import classification_report
# pipeline.fit(x_train,y_train)

# y_pred = pipeline.predict(x_test)
# from sklearn.metrics import classification_report
# print(classification_report(y_pred,y_test))