In [6]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nlp_id.lemmatizer import Lemmatizer
import pickle
import sqlite3

# Show the full content of text columns when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

# Load the tab-separated training data (the file has no header row).
# `on_bad_lines='warn'` skips malformed rows and reports each one; it replaces
# the deprecated `error_bad_lines=False` (removed in pandas 2.0), whose default
# companion `warn_bad_lines=True` produced the same skip-and-warn behavior.
df = pd.read_csv('train_preprocess.tsv.txt', sep='\t', on_bad_lines='warn', header=None)
# Name the first two columns; `kalimat` is cleaned as text below and
# `sentimen` is used as the classification target.
df = df.rename(columns={0: 'kalimat', 1: 'sentimen'})

def cleansing(text):
    """Lower-case a raw sentence and strip noisy tokens and punctuation.

    Steps, applied in order:
      1. lower-case the text;
      2. replace every run of word characters that has a digit in an
         interior position (e.g. ``user123x``) with a space;
      3. replace ``__name__``-style tokens with a space;
      4. replace every character that is not an ASCII letter or digit with
         a space;
      5. collapse each run of consecutive spaces into a single space.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The cleaned string. A single leading or trailing space may remain;
        the result is not stripped.
    """
    text = text.lower()
    # Raw strings keep `\w` as a regex escape rather than an (invalid)
    # Python string escape.
    text = re.sub(r'\w+[0-9]\w+', ' ', text)
    text = re.sub(r'__\w+__', ' ', text)
    text = re.sub(r'[^A-Za-z0-9]', ' ', text)
    # Unbounded quantifier: a `{2,10}` bound leaves several spaces behind
    # whenever a run is longer than 10 characters.
    text = re.sub(r' {2,}', ' ', text)
    return text

# Clean the raw sentences.
df['text_clean'] = df.kalimat.apply(cleansing)

# Read the slang ("alay") dictionary table from SQLite. The connection is
# closed as soon as the table has been read, even if the query raises.
conn = sqlite3.connect('database_hate.db')
try:
    call_alay = pd.read_sql_query('SELECT * FROM kamus_alay', conn)
finally:
    conn.close()

# Lookup table: slang word -> normal word.
alay = dict(zip(call_alay['kata_alay'], call_alay['kata_normal']))

# Replace slang ("alay") words with their normal form.
def normalize(text, kamus=None):
    """Replace every slang word in ``text`` with its normal form.

    The text is split on single spaces, each token is looked up in the
    mapping, and the tokens are joined back with single spaces. Tokens that
    are not in the mapping (including the empty tokens produced by
    consecutive spaces) are kept unchanged.

    Args:
        text: The space-separated input string.
        kamus: Optional mapping of slang word -> normal word. When omitted,
            the module-level ``alay`` dictionary is used.

    Returns:
        The text with all known slang tokens replaced.
    """
    if kamus is None:
        kamus = alay
    return ' '.join(kamus.get(kata, kata) for kata in text.split(' '))

# Replace slang words in the cleaned text with their normal forms.
df['normal'] = df['text_clean'].map(normalize)


# Lemmatizer instance used by `lemas`.
lema = Lemmatizer()

# def lemas(texts):
#     preprocess = [lema.lemmatize(text) for text in texts]
#     return preprocess


def lemas(text):
    """Return ``text`` lemmatized by the module-level ``lema`` lemmatizer."""
    return lema.lemmatize(text)

# Lemmatize the normalized text.
df['lemas'] = df.normal.apply(lemas)

# Build TF-IDF features.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so its vocabulary and IDF weights also reflect the
# test documents; the reported metrics may be slightly optimistic.
vect = TfidfVectorizer()
fitur = vect.fit_transform(df.lemas)

kata = vect.get_feature_names_out()

# Feature matrix as a DataFrame for inspection. It is kept sparse-backed:
# converting the whole TF-IDF matrix with `.toarray()` allocates a dense
# (documents x vocabulary) array.
matrix = pd.DataFrame.sparse.from_spmatrix(fitur, columns=kata)

# Split into training and test sets (25% held out, fixed seed).
target = df.sentimen
X_train, X_test, y_train, y_test = train_test_split(fitur, target, test_size=0.25, random_state=42)

# Train the model.
model = MLPClassifier(random_state=42)
model.fit(X_train, y_train)
print('TRAINING SELESAI')


# Evaluate the model on the held-out set.
test = model.predict(X_test)
print("Testing selesai")
print(classification_report(y_test, test))

# Predict on a new example sentence. The sentence goes through the same
# cleansing -> normalize -> lemas pipeline as the training data, so the
# vectorizer sees tokens in the form it was fitted on.
contoh = "kamu keren banget!"
contoh1 = vect.transform([lemas(normalize(cleansing(contoh)))])
result = model.predict(contoh1)[0]
print("Sentimen:")
print(result)




  df = pd.read_csv('train_preprocess.tsv.txt',sep='\t', error_bad_lines=False, header=None)


TRAINING SELESAI
Testing selesai
              precision    recall  f1-score   support

    negative       0.78      0.79      0.78       875
     neutral       0.79      0.66      0.72       287
    positive       0.88      0.90      0.89      1588

    accuracy                           0.84      2750
   macro avg       0.81      0.78      0.80      2750
weighted avg       0.84      0.84      0.84      2750

Sentimen:
positive


In [8]:
# Persist the fitted vectorizer and the trained model. The `with` blocks
# flush and close each file handle once the dump has finished.
with open('fitur_fix.p', 'wb') as vect_file:
    pickle.dump(vect, vect_file)
with open('model_fix.p', 'wb') as model_file:
    pickle.dump(model, model_file)