In [11]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
# import nltk
# nltk.download('punkt_tab')
# nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yash\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [28]:
import pandas as pd

df = pd.read_csv('data/IMDB Dataset.csv')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [30]:
import string
from nltk.corpus import stopwords
from tqdm import tqdm

tqdm.pandas(desc="Removing punctuation and converting to lowercase")
df['text'] = df['review'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)).lower())

tqdm.pandas(desc="Removing stopwords")
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in set(stopwords.words('english'))]))

In [31]:
tokens = [word_tokenize(review) for review in df['review']]

In [32]:
# Train a skipgram model
skipgram = Word2Vec(
    sentences=tokens, # list of list of tokens
    sg=1,
    vector_size=50,
    window=5,
    min_count=1,
)

In [35]:
def get_sentence_embeddings(sentence, model):
    embeddings = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(model.vector_size)

tqdm.pandas(desc="Skipgram embeddings")
df['skipgram_embeddings'] = df['text'].progress_apply(lambda x: get_sentence_embeddings(x, model=skipgram))

Skipgram embeddings: 100%|██████████| 50000/50000 [00:50<00:00, 992.39it/s] 


In [37]:
from sklearn.model_selection import train_test_split
X = np.stack(df['skipgram_embeddings'].values)
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6006
              precision    recall  f1-score   support

           0       0.60      0.59      0.60      4961
           1       0.60      0.61      0.60      5039

    accuracy                           0.60     10000
   macro avg       0.60      0.60      0.60     10000
weighted avg       0.60      0.60      0.60     10000



In [40]:
# Train a CBOW model
cbow = Word2Vec(
    sentences=tokens, # list of list of tokens
    sg=0,
    vector_size=50,
    window=5,
    min_count=1,
)

In [41]:
tqdm.pandas(desc="CBOW embeddings")
df['cbow_embeddings'] = df['text'].progress_apply(lambda x: get_sentence_embeddings(x, model=cbow))

CBOW embeddings: 100%|██████████| 50000/50000 [00:48<00:00, 1028.58it/s]


In [42]:
X = np.stack(df['cbow_embeddings'].values)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [43]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.6072
              precision    recall  f1-score   support

           0       0.61      0.60      0.60      4961
           1       0.61      0.62      0.61      5039

    accuracy                           0.61     10000
   macro avg       0.61      0.61      0.61     10000
weighted avg       0.61      0.61      0.61     10000

