In [13]:
import pandas as pd
import regex as re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from Cleansing import clean
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score


In [14]:
train = pd.read_csv('dataset/train_preprocess.tsv.txt', names=['Kalimat','Sentiment'], sep='\t')
train.head()

Unnamed: 0,Kalimat,Sentiment
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [15]:
train['new_clean'] = train.Kalimat.apply(clean)

**EDA**

In [16]:
train.dtypes

Kalimat      object
Sentiment    object
new_clean    object
dtype: object

In [17]:
train.isna().sum()

Kalimat      0
Sentiment    0
new_clean    0
dtype: int64

In [18]:
train.duplicated().sum()

67

In [23]:
trclean = train.drop_duplicates()
trclean.head()

Unnamed: 0,Kalimat,Sentiment,new_clean,total_word,total_char
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung dimiliki pengusaha pabrik puluhan terke...,32,246
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus k212 mmbri hujjah partai diw...,12,72
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis jalan sumatra bandung nya nya...,22,132
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya unboxing paket barang nya b...,10,66
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,aduh mahasiswa sombong kasih kartu kuning bela...,16,114


In [24]:
train['total_word'] = train['new_clean'].apply(lambda x: len(x.split()))
train['total_char'] = train['new_clean'].apply(len)
train.head()

Unnamed: 0,Kalimat,Sentiment,new_clean,total_word,total_char
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung dimiliki pengusaha pabrik puluhan terke...,32,246
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus k212 mmbri hujjah partai diw...,12,72
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis jalan sumatra bandung nya nya...,22,132
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya unboxing paket barang nya b...,10,66
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,aduh mahasiswa sombong kasih kartu kuning bela...,16,114


In [21]:
train.describe()

Unnamed: 0,total_word,total_char
count,11000.0,11000.0
mean,16.943818,112.631818
std,10.425877,69.268531
min,0.0,0.0
25%,9.0,59.0
50%,15.0,99.0
75%,24.0,160.0
max,78.0,428.0


**Feature Extraction & Build Modelling**

In [25]:
# Separating X (Predictor) and Y (target) variabel
y = train["Sentiment"]
xraw = train["new_clean"]

In [26]:
#Converting text to vector using Sklearn TfidfVectorizer
vec = TfidfVectorizer()
x = vec.fit_transform(xraw)

In [27]:
#covert X to regular array
x = x.toarray()

In [28]:
#Train test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [29]:
#Build & training model
model = LogisticRegression(max_iter=11000)
model.fit(x_train, y_train)

In [30]:
#Evaluating our model
score = model.score(x_test, y_test)
print('Akurasi dari prediksi sentimen teks ini mencapai',score*100)

Akurasi dari prediksi sentimen teks ini mencapai 82.68181818181817


In [31]:
test = pd.read_csv("dataset/sample.csv", encoding='latin-1')
test.head()

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0


In [32]:
x_test = vec.transform(test['Tweet'])
prediction = model.predict(x_test.toarray())
prediction

array(['negative', 'negative', 'positive', ..., 'negative', 'negative',
       'negative'], dtype=object)