In [203]:
import spacy
import re
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

In [204]:
df = pd.read_csv("movies_sentiment_data.csv")
df.head()

Unnamed: 0,review,sentiment
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive
1,I enjoyed the movie and the story immensely! I...,positive
2,I had a hard time sitting through this. Every ...,negative
3,It's hard to imagine that anyone could find th...,negative
4,This is one military drama I like a lot! Tom B...,positive


In [205]:
print(df.shape)
print(df["sentiment"].value_counts())

(19000, 2)
sentiment
positive    9500
negative    9500
Name: count, dtype: int64


In [206]:
print(df.iloc[0]["review"])
print(df.iloc[0]["sentiment"])

I first saw Jake Gyllenhaal in Jarhead (2005) a little while back and, since then, I've been watching every one of his movies that arrives on my radar screen. Like Clive Owen, he has an intensity (and he even resembles Owen somewhat) that just oozes from the screen. I feel sure that, if he lands some meaty roles, he'll crack an Oscar one day...<br /><br />That's not to denigrate this film at all.<br /><br />It's a fine story, with very believable people (well, it's based upon the author's early shenanigans with rocketry), a great cast  Chris Cooper is always good, and Laura Dern is always on my watch list  with the appropriate mix of humor, pathos, excitement...and the great sound track with so many rock n roll oldies to get the feet tapping.<br /><br />But, this film had a very special significance for me: in 1957, I was the same age as Homer Hickham; like him, I looked up at the night stars to watch Sputnik as it scudded across the blackness; like Homer also, I experimented with ro

In [207]:
df["sentiment_encoding"] = df["sentiment"].map({
    "negative": 0,
    "positive": 1
})

df.head()

Unnamed: 0,review,sentiment,sentiment_encoding
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1
1,I enjoyed the movie and the story immensely! I...,positive,1
2,I had a hard time sitting through this. Every ...,negative,0
3,It's hard to imagine that anyone could find th...,negative,0
4,This is one military drama I like a lot! Tom B...,positive,1


In [208]:
def preprocess(review):
    doc = nlp(review)

    final_tokens = []
    for token in doc: 
        if token.is_punct or token.is_stop:
            continue 
        final_tokens.append(token.lemma_)  

    final_tokens = ' '.join(str(v) for v in final_tokens)
    final_tokens = re.sub(r"<.*?>", " ", final_tokens)

    return final_tokens

In [209]:
nlp = spacy.load("en_core_web_md")
review = df.iloc[0]["review"]

print(review)
print(preprocess(review))

I first saw Jake Gyllenhaal in Jarhead (2005) a little while back and, since then, I've been watching every one of his movies that arrives on my radar screen. Like Clive Owen, he has an intensity (and he even resembles Owen somewhat) that just oozes from the screen. I feel sure that, if he lands some meaty roles, he'll crack an Oscar one day...<br /><br />That's not to denigrate this film at all.<br /><br />It's a fine story, with very believable people (well, it's based upon the author's early shenanigans with rocketry), a great cast  Chris Cooper is always good, and Laura Dern is always on my watch list  with the appropriate mix of humor, pathos, excitement...and the great sound track with so many rock n roll oldies to get the feet tapping.<br /><br />But, this film had a very special significance for me: in 1957, I was the same age as Homer Hickham; like him, I looked up at the night stars to watch Sputnik as it scudded across the blackness; like Homer also, I experimented with ro

In [210]:
df["preprocessed_review"] = df["review"].apply(lambda x: preprocess(x))

In [211]:
df.head()

Unnamed: 0,review,sentiment,sentiment_encoding,preprocessed_review
0,I first saw Jake Gyllenhaal in Jarhead (2005) ...,positive,1,see Jake Gyllenhaal Jarhead 2005 little watch ...
1,I enjoyed the movie and the story immensely! I...,positive,1,enjoy movie story immensely see original(1939 ...
2,I had a hard time sitting through this. Every ...,negative,0,hard time sit single twist turn predictable si...
3,It's hard to imagine that anyone could find th...,negative,0,hard imagine find short favorite see short kno...
4,This is one military drama I like a lot! Tom B...,positive,1,military drama like lot Tom Berenger play mili...


In [212]:
X_train, X_test, y_train, y_test = train_test_split(
    df["preprocessed_review"],
    df["sentiment"],
    test_size=0.2,
    random_state=2022,
    stratify=df["sentiment"]
)

In [213]:
print(X_train.shape)
print(X_test.shape)

print(y_train.value_counts())
print(y_test.value_counts())

(15200,)
(3800,)
sentiment
positive    7600
negative    7600
Name: count, dtype: int64
sentiment
negative    1900
positive    1900
Name: count, dtype: int64


In [214]:
print(X_train[:4])
print(y_train[:4])

18857    let face perfect production Hamlet simply far ...
15609    83 minute nope thing 72 minute tops.  If guess...
6107     fantasy favorite Ralph Bakshi watch YouTube Se...
2500     walk away movie start scene China finding newb...
Name: preprocessed_review, dtype: object
18857    positive
15609    negative
6107     positive
2500     negative
Name: sentiment, dtype: object


In [215]:
vectorizer = CountVectorizer()

X_train_cv = vectorizer.fit_transform(X_train.values)

In [216]:
dir(vectorizer)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_request_for_signature',
 '_char_ngrams',
 '_char_wb_ngrams',
 '_check_feature_names',
 '_check_n_features',
 '_check_stop_words_consistency',
 '_check_vocabulary',
 '_count_vocab',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_limit_features',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_sort_features',
 '_stop_words_id',
 '_validate_data',
 '_validate_ngram_range',
 '_validate_params',
 '_validate_vocabulary',
 '_warn_for_unused_params',

In [217]:
# len(vectorizer.get_feature_names_out()) == len(vectorizer.vocabulary_)
X_test_cv = vectorizer.transform(X_test.values)

In [218]:
clf = KNeighborsClassifier(n_neighbors=10)

clf.fit(X_train_cv, y_train)

In [219]:
y_pred = clf.predict(X_test_cv)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

    negative       0.73      0.64      0.68      2179
    positive       0.58      0.68      0.63      1621

    accuracy                           0.66      3800
   macro avg       0.66      0.66      0.65      3800
weighted avg       0.67      0.66      0.66      3800



In [220]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train_cv, y_train)

In [221]:
y_pred = clf.predict(X_test_cv)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

    negative       0.87      0.87      0.87      1900
    positive       0.87      0.87      0.87      1900

    accuracy                           0.87      3800
   macro avg       0.87      0.87      0.87      3800
weighted avg       0.87      0.87      0.87      3800



In [222]:
clf = MultinomialNB().fit(X_train_cv, y_train)

In [223]:
y_pred = clf.predict(X_test_cv)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

    negative       0.87      0.83      0.85      2000
    positive       0.82      0.87      0.84      1800

    accuracy                           0.85      3800
   macro avg       0.85      0.85      0.85      3800
weighted avg       0.85      0.85      0.85      3800



In [224]:
# Applying spaCy's NLP model to get vectors for train and test text separately
X_train_vectors = X_train.apply(lambda x: nlp(x).vector)
X_test_vectors = X_test.apply(lambda x: nlp(x).vector)

In [225]:
print(X_train_vectors.shape)
print(X_test_vectors.shape)

print(X_train_vectors[0].shape)

X_train_vectors.head()

(15200,)
(3800,)
(300,)


18857    [-0.6028153, 0.09454226, -0.96966374, -0.57225...
15609    [-0.5622001, -0.18726356, -1.6624453, -1.16319...
6107     [-0.98216444, -0.67598534, -0.7036398, -1.8012...
2500     [-0.12199982, 0.64597774, -2.614633, -1.226270...
11111    [-0.7276113, 0.4852567, -1.518344, -1.4003131,...
Name: preprocessed_review, dtype: object

In [226]:
X_train_vectors = X_train_vectors.values.tolist()
X_train_vectors = np.stack(X_train_vectors)

X_test_vectors = X_test_vectors.values.tolist()
X_test_vectors = np.stack(X_test_vectors)

In [227]:
X_train_vectors

array([[-0.6028153 ,  0.09454226, -0.96966374, ..., -0.69170165,
        -2.0069976 ,  0.54454327],
       [-0.5622001 , -0.18726356, -1.6624453 , ...,  1.1849369 ,
        -1.5802982 ,  0.5344158 ],
       [-0.98216444, -0.67598534, -0.7036398 , ..., -0.11649819,
        -2.02649   ,  0.8447771 ],
       ...,
       [-0.09000638,  1.2551863 , -1.8818523 , ...,  0.8633783 ,
        -2.2340398 ,  0.74805236],
       [-1.3456856 ,  0.21610825, -1.1723868 , ...,  0.17334583,
        -2.3112817 ,  1.1627159 ],
       [-0.35877863,  0.57602006, -2.251586  , ...,  0.73954415,
        -2.373982  , -0.04189541]], dtype=float32)

In [228]:
clf = KNeighborsClassifier(n_neighbors=10)

clf.fit(X_train_vectors, y_train)

In [229]:
y_pred = clf.predict(X_test_vectors)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

    negative       0.84      0.67      0.75      2384
    positive       0.59      0.79      0.67      1416

    accuracy                           0.71      3800
   macro avg       0.71      0.73      0.71      3800
weighted avg       0.75      0.71      0.72      3800



In [230]:
clf = LogisticRegression(random_state=0, max_iter=1000).fit(X_train_vectors, y_train)

In [231]:
y_pred = clf.predict(X_test_vectors)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

    negative       0.82      0.81      0.82      1925
    positive       0.81      0.82      0.82      1875

    accuracy                           0.82      3800
   macro avg       0.82      0.82      0.82      3800
weighted avg       0.82      0.82      0.82      3800



In [232]:
scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train_vectors)

clf = MultinomialNB().fit(X_train_scaled, y_train)

In [233]:
X_test_scaled = scaler.transform(X_test_vectors)

y_pred = clf.predict(X_test_scaled)

print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

    negative       0.68      0.69      0.68      1877
    positive       0.69      0.68      0.69      1923

    accuracy                           0.68      3800
   macro avg       0.68      0.68      0.68      3800
weighted avg       0.68      0.68      0.68      3800

