In [1]:
# !pip install Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

def remove_stopwords(text):
    factory = StopWordRemoverFactory()
    stopwords = factory.get_stop_words()

    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords]
    filtered_text = ' '.join(filtered_words)

    return filtered_text



Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m204.8/209.7 kB[0m [31m7.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [2]:
import re
import pandas as pd

def fix_word(text):
    return ' '.join([kamus_dict[word] if word in kamus_dict else word for word in text.split(' ')])

def remove_unnecessaryChar(text):
     text = re.sub(r'&amp;|amp;|&', 'dan', text)
     text = re.sub(r'\\n+', '', text)
     text = re.sub('&lt;/?[a-z]+&gt;', ' ', text)
     text = re.sub(r'#+','#', text)
     text = re.sub(r'http\S+',' ',text)
     text = re.sub(r'(USER+\s?|RT+\s?|URL+\s?)', ' ', text)
     text = re.sub(r'x[a-zA-Z0-9]+', ' ', text)
     return text

def remove_punctuation(text):
     text = re.sub(r'\?', '', text)
     text = re.sub(r'[^a-zA-Z0-9]+', ' ', text)
     text = re.sub(r' +', ' ', text.lower().lstrip("0123456789").strip())
     return text

def preprocessing(text):
     text = remove_unnecessaryChar(text)
     text = remove_punctuation(text)
     text = fix_word(text)
     return text

In [4]:
df = pd.read_csv('train_preprocess.tsv.txt', encoding='ISO-8859-1', delimiter="\t", names=['text','sentiment'])
df["text"] = df["text"].str.encode('ascii', 'ignore').str.decode('ascii')
df.drop_duplicates()
# print(df.head(30))
kamus = pd.read_csv('new_kamusalay.csv', names=['old','new'], encoding='ISO-8859-1')
kamus_dict = dict(zip(kamus['old'], kamus['new']))

df["text"] = df["text"].apply(remove_stopwords2)# remove stopwords
df["text"] = df["text"].apply(preprocessing)# apply cleansing

df.replace('', pd.NA, inplace=True)
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,text,sentiment
0,warung dimiliki pengusaha pabrik tahu puluhan ...,positive
1,mohon ulama lurus k212 mmbri hujjah partai apa...,neutral
2,lokasi strategis jalan sumatra bandung tempat ...,positive
3,betapa bahagia nya diri unbo paket barang nya ...,positive
4,aduh jadi mahasiswa jangan sombong dong kasih ...,negative


In [5]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC

from sklearn.preprocessing import LabelEncoder

# endocde sentiment
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

# split dataset
X = df['text']
y = df['sentiment_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# feature extraction
vectorizer = CountVectorizer()  # Bag of Words
# vectorizer = TfidfVectorizer()  # TF-IDF
X_train_features = vectorizer.fit_transform(X_train)
X_test_features = vectorizer.transform(X_test)

# NN classifier
classifier = MLPClassifier(hidden_layer_sizes=(37,), solver="adam", activation="relu", early_stopping=True, max_iter=2000, random_state=27)

# Define the parameter grid for GridSearchCV
# param_grid = {
#     'hidden_layer_sizes': [(10), (25), (37), (50,),(75), (100,), (150,)],
#     'activation': ['relu', 'tanh', 'logistic', 'identity'],
#     'solver': ['adam', 'sgd', 'lbfgs']
# }

# Create the MLPClassifier model
# classifier = MLPClassifier(random_state=27, max_iter=2000)

# Perform grid search
# grid_search = GridSearchCV(classifier, param_grid, cv=5)
# grid_search.fit(X_train_features, y_train)

# Print the best parameters and best score
# print("Best Parameters:", grid_search.best_params_)
# print("Best Score:", grid_search.best_score_)


# Cross-validation
# cv_scores = cross_val_score(classifier, X_train_features, y_train, cv=5)

# print("Cross-validation scores:", cv_scores)
# print("Mean cross-validation score:", cv_scores.mean())

# Train
classifier.fit(X_train_features, y_train)



In [6]:
# evaluation
y_pred = classifier.predict(X_test_features)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=le.classes_)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)


Accuracy: 0.8759090909090909
Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.85      0.83       678
     neutral       0.88      0.72      0.79       236
    positive       0.91      0.92      0.91      1286

    accuracy                           0.88      2200
   macro avg       0.87      0.83      0.85      2200
weighted avg       0.88      0.88      0.88      2200



In [8]:
new_texts = ["Aku suka banget dengan movie tadi malam", "Pertandingan kemarin rusuh banget"]
new_texts_features = vectorizer.transform(new_texts)
new_sentiments_encoded = classifier.predict(new_texts_features)
print(new_sentiments_encoded)
new_sentiments = le.inverse_transform(new_sentiments_encoded)
print("Predicted Sentiments:", new_sentiments)


[2 0]
Predicted Sentiments: ['positive' 'negative']


In [None]:

import pickle

# Save the trained model, countVectorizer, label encoder to a file
with open('modelnn.pkl', 'wb') as file:
    pickle.dump(classifier, file)

with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

with open('labelencoder.pkl', 'wb') as file:
    pickle.dump(le, file)


In [None]:
alay_df = pd.read_csv("new_kamusalay.csv", usecols=[0, 1], names=[
                      'alay', 'baku'], header=None, encoding='latin-1')

def clean_text(sentence):
    words = sentence.split()
    clean_words = []
    for word in words:
        if word in alay_df["alay"].tolist():
            standard_word = alay_df.loc[alay_df["alay"]
                                        == word, "baku"].iloc[0]
            clean_words.append(standard_word)
        else:
            clean_words.append(word)

    clean_sentence = " ".join(clean_words)
    return clean_sentence



In [None]:
import pandas as pd

# Load the data from the CSV file
data = pd.read_csv('data.csv',encoding="latin-1")

# Preprocess the data
data["Tweet"] = data["Tweet"].apply(remove_stopwords2)# remove stopwords
data["Tweet"] = data["Tweet"].apply(preprocessing)# apply cleansing
data["Tweet"] = data["Tweet"].apply(clean_text)# apply cleansing alay

X = data['Tweet'].astype(str)

# Transform the data using the trained vectorizer
X_test_vec = vectorizer.transform(X)

# Predict the sentiment for the "tweet" column
predictions = classifier.predict(X_test_vec)
predictions = le.inverse_transform(predictions)

# Create a new DataFrame with the original data and the predictions
output = pd.DataFrame({'tweet': X, 'sentiment': predictions})

# Save the DataFrame to a new CSV file
# output.to_csv('predictions.csv', index=False)
print(output)


                                                   tweet sentiment
0      di saat semua cowok berusaha melacak perhatian...  negative
1      siapa telat memberi tau eluedan sarap gue berg...  negative
2      kadang aku berpikir aku tetap percaya tuhan pa...  positive
3            aku akuku tau matamu sipit dilihat mana aku  negative
4      kaum cebong kafir sudah kelihatan dongoknya aw...  negative
...                                                  ...       ...
13164  jangan asal berbicara ndasmu congor kamu yang ...  negative
13165                             kasur mana enak kunyuk  negative
13166                    hati hati bisu glagi bosan aduh  negative
13167  bom real mudah terdeteksi bom terkubur suatu l...   neutral
13168             mana situ memberi cuma foto kutil onta  negative

[13169 rows x 2 columns]
