In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Read the labelled reviews, then derive an integer `label` column from the
# string `sentiment` column; `label` is what later cells use as the target.
df = pd.read_csv('movie-review-dataset.csv')

# Fit the encoder first and keep the fitted instance in `label_encoder`, then
# apply it to the same column to produce the integer codes.
label_encoder = LabelEncoder().fit(df['sentiment'])
df['label'] = label_encoder.transform(df['sentiment'])

df.head()

Unnamed: 0,review,sentiment,label
0,"I absolutely loved this movie, the acting was ...",positive,2
1,The plot was weak and predictable.,negative,0
2,"It was an okay film, nothing too memorable.",neutral,1
3,Fantastic cinematography and great soundtrack!,positive,2
4,The movie was way too long and boring.,negative,0


In [30]:
# Pull the raw review strings and their encoded labels out of the frame:
# `text` is what the vectorizers are fitted on, `target` is passed to the
# train/test splits as both the label array and the stratification key.
text, target = df['review'].values, df['label'].values

In [None]:
import spacy

# Shared spaCy pipeline, loaded once here and reused by every call to
# `spacy_tokenizer`.
nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(text):
    """Return the lower-cased lemmas of the alphabetic tokens in `text`.

    The text is run through the module-level `nlp` pipeline. A token is kept
    only if spaCy flags it as alphabetic and does not flag it as number-like,
    punctuation, or whitespace.

    Parameters
    ----------
    text : str
        A single document to tokenize.

    Returns
    -------
    list of str
        One lower-cased lemma per kept token, in document order.
    """
    lemmas = []
    for tok in nlp(text):
        # Guard 1: only purely alphabetic tokens are candidates.
        if not tok.is_alpha:
            continue
        # Guard 2: drop anything spaCy treats as a number, punctuation or space.
        if tok.like_num or tok.is_punct or tok.is_space:
            continue
        lemmas.append(tok.lemma_.lower())
    return lemmas

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Bag-of-words features: unigram + bigram counts over the spaCy lemmas.
# `token_pattern=None` is passed explicitly because the custom `tokenizer`
# replaces the default regex tokenization; leaving the default pattern in
# place makes scikit-learn warn on fit that `token_pattern` will not be used.
# NOTE(review): the vocabulary is fitted on the full corpus, before the
# train/test split in the next cell, so terms that only occur in held-out
# reviews still shape the feature space — consider fitting on training texts
# only.
count_vectorizer = CountVectorizer(
    stop_words=None,
    ngram_range=(1, 2),
    tokenizer=spacy_tokenizer,
    token_pattern=None,
)
# The sparse result is densified so later cells receive a plain ndarray.
count_matrix = count_vectorizer.fit_transform(text).toarray()
count_vocab = count_vectorizer.get_feature_names_out()

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Hold out 20% of the count features for evaluation. The split is stratified
# on the labels and uses a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    count_matrix,
    target,
    test_size=0.2,
    stratify=target,
    shuffle=True,
    random_state=42,
)

In [40]:
# Logistic regression on the bag-of-words split: fit on the training rows,
# predict the held-out rows, then report accuracy, the confusion matrix and
# the per-class report.
logistic_bow_model = LogisticRegression().fit(X_train, y_train)
bow_lg_pred = logistic_bow_model.predict(X_test)

# One print call with a newline separator emits the same text as printing
# the three pieces one after another.
print(
    f"Accuracy Score : {accuracy_score(y_test, bow_lg_pred)}",
    f"\nConfusion Matrix : \n{confusion_matrix(y_test, bow_lg_pred)}",
    f"\nClassification Report : \n{classification_report(y_test, bow_lg_pred)}",
    sep="\n",
)

In [43]:
# Random forest on the bag-of-words split. The forest is seeded so that the
# reported metrics are reproducible on re-run; 42 matches the seed already
# used for the train/test split.
rfc_bow_model = RandomForestClassifier(random_state=42)
rfc_bow_model.fit(X_train, y_train)

# Predict the held-out rows and report the same three metrics as the
# logistic-regression cell.
bow_rfc_pred = rfc_bow_model.predict(X_test)

print(f"Accuracy Score : {accuracy_score(y_test, bow_rfc_pred)}")
print(f"\nConfusion Matrix : \n{confusion_matrix(y_test, bow_rfc_pred)}")
print(f"\nClassification Report : \n{classification_report(y_test, bow_rfc_pred)}")

In [46]:
# TF-IDF features: unigram + bigram weights over the spaCy lemmas.
# `token_pattern=None` is passed explicitly because the custom `tokenizer`
# replaces the default regex tokenization; leaving the default pattern in
# place makes scikit-learn warn on fit that `token_pattern` will not be used.
# NOTE(review): both the vocabulary and the IDF weights are fitted on the
# full corpus, before the train/test split in the next cell, so statistics
# from held-out reviews influence the training features — consider fitting
# on training texts only.
tdf_vectorizer = TfidfVectorizer(
    stop_words=None,
    ngram_range=(1, 2),
    tokenizer=spacy_tokenizer,
    token_pattern=None,
)

# The sparse result is densified so later cells receive a plain ndarray.
tdf_matrix = tdf_vectorizer.fit_transform(text).toarray()
tdf_vocab = tdf_vectorizer.get_feature_names_out()

In [47]:
# Hold out 20% of the TF-IDF features for evaluation, using the same
# stratification and seed as the bag-of-words split.
A_train, A_test, b_train, b_test = train_test_split(
    tdf_matrix,
    target,
    test_size=0.2,
    stratify=target,
    shuffle=True,
    random_state=42,
)

In [48]:
# Logistic regression on the TF-IDF split: fit on the training rows, predict
# the held-out rows, then report accuracy, the confusion matrix and the
# per-class report.
logistic_tdf_model = LogisticRegression().fit(A_train, b_train)
tdf_lg_pred = logistic_tdf_model.predict(A_test)

# One print call with a newline separator emits the same text as printing
# the three pieces one after another.
print(
    f"Accuracy Score : {accuracy_score(b_test, tdf_lg_pred)}",
    f"\nConfusion Matrix : \n{confusion_matrix(b_test, tdf_lg_pred)}",
    f"\nClassification Report : \n{classification_report(b_test, tdf_lg_pred)}",
    sep="\n",
)

Accuracy Score : 0.7

Confusion Matrix : 
[[2 1 1]
 [0 3 0]
 [0 1 2]]

Classification Report : 
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         4
           1       0.60      1.00      0.75         3
           2       0.67      0.67      0.67         3

    accuracy                           0.70        10
   macro avg       0.76      0.72      0.69        10
weighted avg       0.78      0.70      0.69        10



In [49]:
# Random forest on the TF-IDF split. The forest is seeded so that the
# reported metrics are reproducible on re-run; 42 matches the seed already
# used for the train/test split.
# NOTE(review): any output captured from an earlier unseeded run may differ
# from what this seeded model prints.
rfc_tdf_model = RandomForestClassifier(random_state=42)
rfc_tdf_model.fit(A_train, b_train)

# Predict the held-out rows and report the same three metrics as the
# logistic-regression cell.
tdf_rfc_pred = rfc_tdf_model.predict(A_test)

print(f"Accuracy Score : {accuracy_score(b_test, tdf_rfc_pred)}")
print(f"\nConfusion Matrix : \n{confusion_matrix(b_test, tdf_rfc_pred)}")
print(f"\nClassification Report : \n{classification_report(b_test, tdf_rfc_pred)}")

Accuracy Score : 0.7

Confusion Matrix : 
[[2 1 1]
 [0 3 0]
 [1 0 2]]

Classification Report : 
              precision    recall  f1-score   support

           0       0.67      0.50      0.57         4
           1       0.75      1.00      0.86         3
           2       0.67      0.67      0.67         3

    accuracy                           0.70        10
   macro avg       0.69      0.72      0.70        10
weighted avg       0.69      0.70      0.69        10

