# Projek Kelompok Information Retrieval
# Topik: Review Categorization

# Group:
Alvita Hartanti Wijaya (2201829640)

Jonathan Herbert (2201786566)

Putra Aji Lintang Kusuma (2201785292)

Tatyana Mitya Wahyu Andini (2201831651)

Yogga Putra Rachmadi (2201819210)

# Reference:
https://gist.github.com/kunalj101/ad1d9c58d338e20d09ff26bcc06c4235

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html

https://blog.exsilio.com/all/accuracy-precision-recall-f1-score-interpretation-of-performance-measures/

https://www.wonderflow.ai/blog/sentiment-analysis-examples

https://arxiv.org/ftp/arxiv/papers/1610/1610.09982.pdf

https://www.researchgate.net/publication/261497806_Sentiment_analysis_of_Facebook_statuses_using_Naive_Bayes_Classifier_for_language_learning

# Import the libraries

In [10]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import precision_recall_fscore_support, classification_report

import pandas, numpy, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

# Prepare the Dataset

In [2]:
# load the dataset
data = open('D:\corpus', encoding="utf8").read()
labels, texts = [], []
for i, line in enumerate(data.split("\n")):
    content = line.split()
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# create a dataframe using texts and lables
trainDF = pandas.DataFrame()
trainDF['text'] = texts
trainDF['label'] = labels

trainDF.describe()

Unnamed: 0,text,label
count,10000,10000
unique,10000,2
top,Not worth your money: This game is ridiculousl...,__label__1
freq,1,5097


# Split data to train and test set

In [3]:
# split the dataset into training and validation datasets 
train_x, test_x, train_y, test_y = model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

# Feature Engineering

# We will use TF-IDF Vectors as features

In [4]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xtest_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)


# Model Building

In [5]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, test_y)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xtest_tfidf_ngram)
print("Accuracy: ", accuracy)

Accuracy:  0.84


In [11]:
testing = naive_bayes.MultinomialNB().fit(xtrain_tfidf_ngram, train_y)
predicted = testing.predict(xtest_tfidf_ngram)
print("Accuracy:",metrics.accuracy_score(test_y, predicted))
print("")
count = 0
for i, j in zip(predicted,test_y):
    if i == j:
        count = count + 1
    else:
        continue

print("Tes 10 data")
for i in range(10):
    print("TEST KE",i+1)
    print("Test:", test_y[i])
    print("Actual:", predicted[i])
    print("")


print("Dari total prediksi sebanyak",len(predicted))
print("Jumlah betul sebanyak",count)
percentage = (count/len(predicted))
print("Accuracy percentage:", percentage)
target_names = ['0','1']
print("")
print(classification_report(test_y, predicted, target_names=target_names))

Accuracy: 0.84

Tes 10 data
TEST KE 1
Test: 0
Actual: 0

TEST KE 2
Test: 0
Actual: 0

TEST KE 3
Test: 0
Actual: 0

TEST KE 4
Test: 0
Actual: 0

TEST KE 5
Test: 0
Actual: 0

TEST KE 6
Test: 1
Actual: 1

TEST KE 7
Test: 0
Actual: 0

TEST KE 8
Test: 0
Actual: 0

TEST KE 9
Test: 0
Actual: 0

TEST KE 10
Test: 1
Actual: 1

Dari total prediksi sebanyak 2500
Jumlah betul sebanyak 2100
Accuracy percentage: 0.84

              precision    recall  f1-score   support

           0       0.84      0.85      0.85      1290
           1       0.84      0.83      0.83      1210

    accuracy                           0.84      2500
   macro avg       0.84      0.84      0.84      2500
weighted avg       0.84      0.84      0.84      2500

