In [None]:
from nltk.corpus import stopwords
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score

In [None]:
# Read the tweet/label table from disk, then preview its first ten rows.
dataset_path = 'datasets/dataset_stemming.csv'
df = pd.read_csv(dataset_path)
df.head(n=10)

Unnamed: 0,tweet,label
0,jln jatibarupolisi tdk bs gertak gubernur eman...,anger
1,cewe lho kayak rasain sibuk jaga rasain sakit ...,anger
2,kepingin gudeg mbarek bu hj amad foto google s...,happy
3,jln jatibarubagian wilayah tn abangpengaturan ...,anger
4,sharing alam aja kemarin jam 1800 batalin tike...,happy
5,sekian thread baca thread aneh sih tulis sumpa...,anger
6,sharing temen tuh emg bgt saat lu ngerasa lu b...,happy
7,orang pake ponco jas hujan pake kasur ya gara2...,sadness
8,contoh yg gemar sudut teriak toleran tp gemar ...,anger
9,pulang udah h-4 lebaran dilema apa2 rumah leba...,sadness


In [None]:
# Distinct values of the label column, in order of first appearance.
df['label'].unique()

array(['anger', 'happy', 'sadness', 'love', 'fear'], dtype=object)

In [None]:
# Compare rows on their content only. The 'Unnamed: 0' column shown in the
# preview looks like a saved row index; if it is unique per row (TODO confirm)
# a whole-row comparison can never find a duplicate and the call is a no-op.
# Reassigning instead of inplace=True keeps the cell safe to re-run.
df = df.drop_duplicates(subset=['tweet', 'label'])

In [None]:
# Preprocessing
# Total number of pieces obtained by splitting every tweet on single spaces.
word_total = sum(len(tweet.split(' ')) for tweet in df['tweet'])
print(word_total)

82064


In [None]:
# Punctuation that clean_text replaces with a space. Raw strings are used so
# the backslash escapes reach the regex engine unchanged instead of triggering
# Python's invalid-escape-sequence warning; the compiled pattern is identical.
special_character_remover = re.compile(r'[/(){}\[\]\|@,;]')
# Matches any character other than digits, lowercase a-z, space, '#', '+', '_'.
extra_symbol_remover = re.compile(r'[^0-9a-z #+_]')
# NOTE(review): the tweets previewed after loading look Indonesian, yet the
# English stop-word list is used here -- confirm whether
# stopwords.words('indonesian') was intended.
STOPWORDS = set(stopwords.words('english'))

In [None]:
def clean_text(text):
    """Normalise one tweet string.

    Steps, in order: lowercase the text, replace every match of
    ``special_character_remover`` with a space, delete every match of
    ``extra_symbol_remover``, then drop whitespace-separated tokens that
    appear in ``STOPWORDS``.

    Returns the surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    spaced = special_character_remover.sub(' ', lowered)
    stripped = extra_symbol_remover.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)
    
# Overwrite the tweet column with its cleaned form, one tweet at a time.
df['tweet'] = df['tweet'].map(clean_text)

In [None]:
# Same tally as len(tweet.split(' ')): one more piece than there are spaces.
print(df['tweet'].map(lambda tweet: tweet.count(' ') + 1).sum())

78837


In [None]:
from sklearn.model_selection import train_test_split

# Features are the tweet strings; targets are the label column.
X, y = df['tweet'], df['label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Shapes of the four split parts, in the order X_train, X_test, y_train, y_test.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


((3501,), (876,), (3501,), (876,))

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

# Token counts -> TF-IDF weighting -> logistic regression classifier.
lr = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', LogisticRegression()),
              ])

lr.fit(X_train, y_train)
y_pred1 = lr.predict(X_test)

# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, but keeping the documented order avoids silently wrong numbers
# if an asymmetric metric is swapped in later.
print(f"Accuracy is : {accuracy_score(y_test, y_pred1)}")

Accuracy is : 0.6541095890410958


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Token counts -> TF-IDF weighting -> multinomial naive Bayes classifier.
naivebayes = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
naivebayes.fit(X_train, y_train)

y_pred = naivebayes.predict(X_test)

# Pass the ground truth first: accuracy_score is documented as (y_true, y_pred).
print(f'accuracy {accuracy_score(y_test, y_pred)}')

accuracy 0.5958904109589042


In [None]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Token counts -> TF-IDF weighting -> gradient-boosted tree classifier.
xgboost = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', XGBClassifier()),
              ])

# Current XGBoost releases expect class labels to be integers 0..n_classes-1
# and reject string labels, so the labels are encoded for training.
# Consequently `xgboost.predict` returns integer codes; decode them with
# `label_encoder.inverse_transform` to get the original label strings back.
label_encoder = LabelEncoder()
xgboost.fit(X_train, label_encoder.fit_transform(y_train))

y_pred = label_encoder.inverse_transform(xgboost.predict(X_test))

# Pass the ground truth first: accuracy_score is documented as (y_true, y_pred).
print(f'accuracy {accuracy_score(y_test, y_pred)}')



accuracy 0.6015981735159818


In [None]:
from sklearn.linear_model import SGDClassifier

# Token counts -> TF-IDF weighting -> linear classifier trained with SGD.
# SGDClassifier shuffles the training data each epoch; without a seed the
# reported accuracy changes from run to run, so the seed is pinned to the
# same value used for the train/test split.
sgd = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', SGDClassifier(random_state=0)),
              ])
sgd.fit(X_train, y_train)

y_pred = sgd.predict(X_test)

# Pass the ground truth first: accuracy_score is documented as (y_true, y_pred).
print(f'accuracy {accuracy_score(y_test, y_pred)}')

accuracy 0.6221461187214612
