In [1]:
#Import settings
import pandas as pd
from sklearn.model_selection import cross_val_score, train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import os


In [2]:
# Base directory for every data/output file used below: the current working
# directory of the kernel at the time this cell runs.
path = os.getcwd()

In [3]:
# Read the training set from the data directory under `path`
train_df = pd.read_csv(path+'/data/train.csv')

# Targets come from LABEL; features come from TEXT, where missing entries
# are replaced by the literal string "blank" and everything is cast to str
y = train_df['LABEL']
X = train_df['TEXT'].fillna("blank").astype(str)

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [4]:
'''
Initialize vectorizers
TfidfVectorizer: ngram_range up to trigrams (1-3), lowercased text
CountVectorizer was also tested, but due to low performance, it was removed
'''

# TF-IDF over unigrams, bigrams and trigrams of lowercased text
tfidf_vect = TfidfVectorizer(lowercase=True, ngram_range=(1, 3))

# The vocabulary and IDF weights are learned from the training split only;
# the validation split is projected with the already-fitted vectorizer.
# (The *_test_* names below hold the validation split, not test.csv.)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_val)

# Encode labels as integers: classes are learned from the training labels,
# then the same mapping is applied to the validation labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_val)


In [5]:
'''
Create a class called Classifiers:

Three machine learning algorithms will be tested
    1. Logistic regression
    2. Multinomial naive_bayes model
    3. Linear support vector classifier

Each model's hyperparameters were chosen after a preliminary analysis with sklearn.model_selection.GridSearchCV
'''

class Classifiers:
    """Bundle of three classifiers that are trained and queried together.

    Holds a logistic regression (``logistic``), a multinomial naive Bayes
    model (``bayes``) and a linear support vector classifier (``svm``) so
    the same features can be fed to all of them and the results compared.

    Parameters
    ----------
    C : float, default=3
        Value passed as ``C`` to both LogisticRegression and LinearSVC.
    max_iter : int, default=1000
        Iteration cap passed to both LogisticRegression and LinearSVC.
    random_state : int or None, default=42
        Seed forwarded to LogisticRegression and LinearSVC so that repeated
        runs of the notebook give the same fitted models. Pass ``None`` to
        leave the estimators unseeded. MultinomialNB takes no seed.
    """

    def __init__(self, C=3, max_iter=1000, random_state=42):
        self.logistic = LogisticRegression(
            max_iter=max_iter, C=C, random_state=random_state
        )
        self.bayes = MultinomialNB()
        self.svm = LinearSVC(max_iter=max_iter, C=C, random_state=random_state)

    def _models(self):
        """Return the wrapped estimators in the order used by ``predict``."""
        return (self.logistic, self.bayes, self.svm)

    def fit(self, features, labels):
        """Fit every wrapped estimator on the same features and labels.

        Returns
        -------
        self : Classifiers
            The fitted bundle, so calls can be chained.
        """
        for model in self._models():
            model.fit(features, labels)
        return self

    def predict(self, features):
        """Predict with every wrapped estimator.

        Returns
        -------
        tuple
            ``(logistic_predictions, bayes_predictions, svm_predictions)``,
            one prediction array per estimator, always in this order.
        """
        return tuple(model.predict(features) for model in self._models())

In [6]:
# Create the three-model bundle
clf = Classifiers()

# Train every model on the TF-IDF training features and encoded training labels
clf.fit(features=X_train_tfidf, labels=y_train_encoded)

# Validation-split predictions, one array per model
(logistics, bayes, svm) = clf.predict(features=X_test_tfidf)

In [7]:
# Print each algorithm's classification report on the validation split.
#
# The models were trained on label-encoded targets, so their predictions are
# in the encoder's integer space; they are therefore scored against the
# encoded validation labels (y_test_encoded) instead of the raw y_val.
#
# NOTE(review): the display names assume the encoded classes 0, 1, 2 mean
# "not a movie", "positive review", "negative review" in that order —
# confirm against the dataset / label_encoder.classes_.
target_names = ['Not a movie', "Positive review", "Negative review"]

reports = (
    ("Logistic Regressions:", logistics),
    ("Naive Bayes Model:", bayes),
    ("Linear SVC:", svm),
)

for title, val_predictions in reports:
    print(title, '=' * 50)
    print(classification_report(y_test_encoded, val_predictions, target_names=target_names))

                 precision    recall  f1-score   support

    Not a movie       0.97      0.97      0.97      6454
Positive review       0.87      0.90      0.88      3856
Negative review       0.91      0.88      0.89      3754

       accuracy                           0.93     14064
      macro avg       0.92      0.91      0.91     14064
   weighted avg       0.93      0.93      0.93     14064

                 precision    recall  f1-score   support

    Not a movie       0.97      0.94      0.96      6454
Positive review       0.86      0.85      0.85      3856
Negative review       0.85      0.91      0.88      3754

       accuracy                           0.91     14064
      macro avg       0.89      0.90      0.90     14064
   weighted avg       0.91      0.91      0.91     14064

                 precision    recall  f1-score   support

    Not a movie       0.98      0.98      0.98      6454
Positive review       0.88      0.90      0.89      3856
Negative review       0.

In [8]:
# Load the test dataset
test_df = pd.read_csv(path + '/data/test.csv')

# Apply the same text preprocessing as for the training data: only the TEXT
# column is filled with "blank" and cast to str. The rest of the frame
# (e.g. ID) is left untouched, and non-string TEXT values cannot reach the
# vectorizer.
test_text = test_df['TEXT'].fillna('blank').astype(str)

# Project the test text with the vectorizer fitted on the training split
test_tfidf = tfidf_vect.transform(test_text)

# Predict with the best performing model (LinearSVC) only. Querying it
# directly avoids running the other two models and avoids assigning to `_`,
# which IPython uses for the last cell output.
predictions = clf.svm.predict(test_tfidf)

In [9]:
# Build the submission frame with columns ID and LABEL.
# `predictions` are in the label encoder's integer space, so they are mapped
# back to the original LABEL values before being written out.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'LABEL': label_encoder.inverse_transform(predictions),
})

# Check the dataframe
submission.head()

Unnamed: 0,ID,LABEL
0,4728459160322025755,1
1,1840432070229003467,1
2,12623336783082722606,2
3,7446733850828603409,0
4,16180660281866613068,2


In [10]:
# Write the submission as CSV into the working directory, without the index column
submission.to_csv(path_or_buf=path + '/submission_v6.csv', index=False)