# IMPORT LIBRARY

In [None]:
import pandas as pd
pd.set_option('max_colwidth',1)
import numpy as np
import seaborn as sns
import nltk
import re
import matplotlib.pyplot as plt

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# IMPORT DATA

In [None]:
# Load the raw dataset from a path relative to the working directory and
# display it as the cell output.
data = pd.read_csv('dataset/dataig.csv')
data

In [None]:
# Keep only the comment text. The explicit .copy() makes this an independent
# frame, so adding the 'text_prep' column later does not raise
# SettingWithCopyWarning.
data = data[['komentar']].copy()
data

In [None]:
data.shape

In [None]:
data.head(10)

# PREPROCESSING

In [None]:
# Fetch the NLTK resources, in the same order as before.
for resource_name in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(resource_name)

In [None]:
#casefolding
def text_prep(teks):
    """Case-fold and clean a single raw comment.

    The value is converted to ``str`` and lowercased, then in order:
    ``@mentions`` are removed, every non-letter character is replaced by
    spaces, single-letter words are dropped, and runs of whitespace are
    collapsed to one space.

    Args:
        teks: The raw comment; any object is accepted and stringified.

    Returns:
        The cleaned, lowercase text. A single leading or trailing space may
        remain because the result is not stripped.
    """
    teks = str(teks).lower()
    # Raw strings keep \w, \b and \s as regex escapes rather than (invalid)
    # Python string escapes.
    teks = re.sub(r'@\w+', '', teks)
    teks = re.sub(r'[^a-zA-Z]', '  ', teks)
    teks = re.sub(r'\b[a-zA-Z]\b', '', teks)
    teks = re.sub(r'\s+', ' ', teks)
    return teks
data['text_prep'] = data['komentar'].apply(text_prep)
data.head(10)

In [None]:
#tokenize
def text_prep(teks):
    """Split a cleaned comment string into tokens using NLTK's word_tokenize."""
    return word_tokenize(teks)
data['text_prep'] = data['text_prep'].apply(text_prep)
data.head(10)

In [None]:
# Load the normalization lexicon: the first column holds the term as written,
# the second column its replacement.
normalizad_word = pd.read_excel("kamus/normalisasi.xlsx")
normalizad_word_dict = {}
# Columns are read by position with .iloc; indexing a labelled row with an
# integer (row[0]) depends on a deprecated positional fallback in pandas.
for raw_term, replacement in zip(normalizad_word.iloc[:, 0], normalizad_word.iloc[:, 1]):
    # setdefault keeps the first mapping when a term is listed more than once.
    normalizad_word_dict.setdefault(raw_term, replacement)

def normalized_term(document):
    """Replace every token found in the normalization lexicon.

    Args:
        document: An iterable of tokens.

    Returns:
        A list of the same length where each token is swapped for its
        lexicon replacement when one exists, and kept unchanged otherwise.
    """
    return [normalizad_word_dict.get(term, term) for term in document]
data['text_prep'] = data['text_prep'].apply(normalized_term)
data.head(10)

In [None]:
#filtering-stopword
stop_words = set(stopwords.words("indonesian"))
ex_stopword = ['plis','yanb','ke','ku','ko','fyi','nya']
# Build the combined lookup set once; the stopword list used to be re-read and
# concatenated for every single token.
all_stop_words = stop_words.union(ex_stopword)
def text_prep(teks):
    """Drop Indonesian stopwords and the extra project stopwords.

    Args:
        teks: An iterable of tokens.

    Returns:
        A list with the tokens that are not stopwords, in original order.
    """
    return [item for item in teks if item not in all_stop_words]
data['text_prep'] = data['text_prep'].apply(text_prep)
data.head(10)

In [None]:
#stemmer sastrawi
factory = StemmerFactory()
stemmer = factory.create_stemmer()
def text_prep(teks):
    """Stem a tokenized comment with Sastrawi and return it as one string.

    Args:
        teks: A list of tokens; a plain string is also accepted and is
            stemmed as-is.

    Returns:
        The stemmed text as a single string.
    """
    # Join the tokens into plain text so the stemmer is not handed the Python
    # repr of a list (brackets, quotes and commas).
    if not isinstance(teks, str):
        teks = " ".join(map(str, teks))
    teks = stemmer.stem(teks.lower())
    return teks
data['text_prep'] = data['text_prep'].apply(text_prep)
data.head(10)

In [None]:
# Keep only the preprocessed text column and remove repeated rows.
data = data[['text_prep']].drop_duplicates()
data

In [None]:
# Write next to the other dataset files using a path relative to the working
# directory, like the read_csv calls, instead of a machine-specific absolute
# path.
data.to_csv('dataset/teks_prep_ig.csv')

# TF-IDF

In [None]:
# Load the labelled dataset; the following cells read its 'text_prep' and
# 'sentimen' columns.
data = pd.read_csv('dataset/ig_label.csv')
data

In [None]:
# convert the sentiment labels to numeric codes
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# The encoder is kept in `le` so the codes can be mapped back to the original
# labels if needed.
data['sentimen'] = le.fit_transform(data['sentimen'])
data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cv = CountVectorizer()
tv = TfidfVectorizer()

# An empty text cell in the CSV is read back as NaN, and the vectorizers
# reject NaN documents; treat such rows as empty documents so the row order
# stays aligned with the labels.
corpus = data['text_prep'].fillna('')

# Raw term counts and TF-IDF weights over the same corpus.
count = cv.fit_transform(corpus)
tf_v = tv.fit_transform(corpus)

# SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
# Features are the TF-IDF matrix; the target is the encoded sentiment column.
x = tf_v
y = data['sentimen']

In [None]:
# Oversample with SMOTE using a fixed seed.
# NOTE(review): resampling is applied to the full matrix before the train/test
# split performed further down, so synthetic samples can end up in the test
# set. Consider splitting first and resampling only the training part.
sm = SMOTE(random_state=42)
X_sampling, Y_sampling = sm.fit_resample(x, y)

In [None]:
print(x.shape)
print(y.shape)

In [None]:
print(X_sampling.shape)
print(Y_sampling.shape)

In [None]:
# Plot the label distribution after resampling.
sns.histplot(data=Y_sampling)
# plt.show must be called; the bare name only evaluates to the function object.
plt.show()

# SPLITTING DATA

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the resampled data for testing; the fixed seed keeps the
# split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X_sampling,Y_sampling, test_size=0.50, random_state = 42)
X_test.shape

# MODELLING

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
)

# Train a multinomial Naive Bayes model and report its metrics on the test set.
classifier = MultinomialNB().fit(X_train, Y_train)
naive_bayes_pred = classifier.predict(X_test)
print(classification_report(Y_test, naive_bayes_pred))

In [None]:
conma = confusion_matrix(Y_test, naive_bayes_pred)
conma

In [None]:
# Draw the confusion matrix. fmt='d' prints the cell counts as plain integers;
# the default format switches to scientific notation for larger counts.
sns.heatmap(conma, annot=True, fmt='d', cmap='binary')
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
# SVC's C must be a single positive number, so the candidate values are
# searched with cross-validation instead of being passed as a list.
param_grid = {'C': [0.1, 1, 10, 100, 1000]}
classifier = GridSearchCV(SVC(kernel='linear', random_state=42), param_grid)
classifier.fit(X_train, Y_train)
# After fitting, predict() uses the estimator refit with the best C.
svm_pred = classifier.predict(X_test)
# Classification report
print(classification_report(Y_test, svm_pred))

NameError: name 'X_train' is not defined

In [None]:
from sklearn.metrics import mean_squared_error

# mean_squared_error returns the MSE; take the square root so the value
# matches the "Root Mean Square Error" label printed below.
RMSE = np.sqrt(mean_squared_error(Y_test, svm_pred))

print('Root Mean Square Error :', RMSE)

In [None]:
conma2 = confusion_matrix(Y_test, svm_pred)
conma2

In [None]:
# Draw the SVM confusion matrix. fmt='d' prints the cell counts as plain
# integers; the default format switches to scientific notation for larger
# counts.
sns.heatmap(conma2, annot=True, fmt='d', cmap='binary')
plt.xlabel('Prediksi')
plt.ylabel('Aktual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# visualization

# Show the number of rows per class. An empty string is not a column name, so
# the encoded 'sentimen' label column is plotted.
sns.countplot(x='sentimen', data=data)
plt.show()