In [38]:
import pandas as pd
import numpy as np
import string
import re

import nltk
# Fetch the NLTK resources this notebook's imports rely on. Each call is a
# no-op when the package is already present (see the log output below).
# NOTE(review): only the 'stopwords' corpus is used by the cells visible here;
# 'punkt' and 'wordnet' back word_tokenize / WordNetLemmatizer, which are
# imported but not called in the visible cells - confirm they are still needed.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
# Load the raw sentiment dataset from the project-relative datasets folder
# and preview the first rows.
df = pd.read_csv('./datasets/sentiment_analysis.csv')
df.head()

Unnamed: 0,Year,Month,Day,Time of Tweet,text,sentiment,Platform
0,2018,8,18,morning,What a great day!!! Looks like dream.,positive,Twitter
1,2018,8,18,noon,"I feel sorry, I miss you here in the sea beach",positive,Facebook
2,2017,8,18,night,Don't angry me,negative,Facebook
3,2022,6,8,morning,We attend in the class just for listening teac...,negative,Facebook
4,2022,6,8,noon,"Those who want to go, let them go",negative,Instagram


In [7]:
# Check column dtypes and non-null counts before selecting the modelling columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 499 entries, 0 to 498
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Year           499 non-null    int64 
 1   Month          499 non-null    int64 
 2   Day            499 non-null    int64 
 3   Time of Tweet  499 non-null    object
 4   text           499 non-null    object
 5   sentiment      499 non-null    object
 6   Platform       499 non-null    object
dtypes: int64(3), object(4)
memory usage: 27.4+ KB


In [9]:
# Keep only the model input (tweet text) and the target (sentiment label).
# A list of labels is the documented way to select several columns with .loc;
# a tuple is the form used for a single MultiIndex key, so it is avoided here.
data = df.loc[:, ["text", "sentiment"]]
data.head()

Unnamed: 0,text,sentiment
0,What a great day!!! Looks like dream.,positive
1,"I feel sorry, I miss you here in the sea beach",positive
2,Don't angry me,negative
3,We attend in the class just for listening teac...,negative
4,"Those who want to go, let them go",negative


In [45]:
# preprocessing functions
# English stop-word list from NLTK, stored as a set so the membership test in
# drop_stopwords is a hash lookup rather than a list scan.
stop_words = set(stopwords.words('english'))

def text_preprocessing(text):
    """Normalise raw text into lowercase words with noise removed.

    The steps run in this order:
      1. lowercase the text;
      2. drop square-bracketed spans such as ``[link]``;
      3. drop URLs starting with ``http://``, ``https://`` or ``www.``;
      4. drop HTML-like tags such as ``<br>``;
      5. drop every character in ``string.punctuation``;
      6. replace each run of line breaks with a single space;
      7. drop every token that contains a digit.

    Whitespace is not collapsed, so removed spans can leave doubled or
    trailing spaces in the result.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r"https?://\S+|www\.\S+", '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    # A line break separates words, so it must become a space: deleting it
    # would glue the last word of one line onto the first word of the next.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def drop_stopwords(text):
    """Return ``text`` with every stop word removed.

    The text is split on whitespace, tokens found in the module-level
    ``stop_words`` collection are discarded, and the remaining tokens are
    re-joined with single spaces.

    Args:
        text (str): Text to filter.

    Returns:
        str: The surviving tokens joined by one space each.
    """
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)


# Apply the preprocessing functions to a copy so the raw `data` frame stays
# untouched and this cell can be re-run safely.
data_processed = data.copy()
data_processed['text'] = data_processed['text'].apply(text_preprocessing).apply(drop_stopwords)

# Label encoding: map each sentiment string to an integer class id. In the
# recorded output below, 'negative' maps to 0 and 'positive' to 2.
LE = LabelEncoder()
data_processed['Encoded sentiment'] = LE.fit_transform(data_processed['sentiment'])

In [46]:
# Preview the cleaned text next to the original and integer-encoded labels.
data_processed.head()

Unnamed: 0,text,sentiment,Encoded sentiment
0,great day looks like dream,positive,2
1,feel sorry miss sea beach,positive,2
2,dont angry,negative,0
3,attend class listening teachers reading slide ...,negative,0
4,want go let go,negative,0


In [47]:
# Use the preprocessed text as the features and the encoded labels as the target.
X, y = data_processed['text'], data_processed['Encoded sentiment']

# Split the data into a training set and a testing set (20% held out for
# testing, with a fixed seed so the split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# X_train holds raw text, so convert it to numeric features with TF-IDF.
# The vectorizer is fitted on the training split only.
vectorizer = TfidfVectorizer()
X_train_transformed = vectorizer.fit_transform(X_train)

# Train a KNN model (5 neighbours) on the TF-IDF features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_transformed, y_train)


In [51]:
# The test data must go through the same TF-IDF transform as the training data.
# `vectorizer` is the TfidfVectorizer already fitted on the training split, so
# only transform() is called here - it is never re-fitted on the test split.
X_test_transformed = vectorizer.transform(X_test)

# Predict on the transformed test data.
y_pred = knn.predict(X_test_transformed)


In [52]:
# Evaluate the model on the held-out test split.
# average='weighted' computes each metric per class and averages them weighted
# by the number of true samples in each class.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
cm = confusion_matrix(y_test, y_pred)

# NOTE(review): in the recorded run below the model predicts class 1 for 98 of
# the 100 test rows. Classes 0 and 2 each get a single (correct) prediction, so
# their per-class precision is 1.0 and the weighted precision looks far better
# than the model is; accuracy, F1 and the confusion matrix are the honest view.
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1 Score:', f1)
print('Confusion Matrix:\n', cm)

Accuracy: 0.32
Precision: 0.7918367346938776
Recall: 0.32
F1 Score: 0.17951303088803092
Confusion Matrix:
 [[ 1 35  0]
 [ 0 30  0]
 [ 0 33  1]]
