In [2]:
import gensim.downloader as api

# Load the pretrained word2vec model
wv_pretrained = api.load("word2vec-google-news-300")




In [3]:
words = ["computer", "movie", "king", "happy", "science"]
similar_words = {word: wv_pretrained.most_similar(word) for word in words}


In [4]:
for word, similar in similar_words.items():
    print(f"Similar words for '{word}':")
    for similar_word, similarity in similar:
        print(f"  {similar_word} ({similarity:.4f})")
    print()


Similar words for 'computer':
  computers (0.7979)
  laptop (0.6640)
  laptop_computer (0.6549)
  Computer (0.6473)
  com_puter (0.6082)
  technician_Leonard_Luchko (0.5663)
  mainframes_minicomputers (0.5618)
  laptop_computers (0.5585)
  PC (0.5540)
  maker_Dell_DELL.O (0.5519)

Similar words for 'movie':
  film (0.8677)
  movies (0.8013)
  films (0.7363)
  moive (0.6830)
  Movie (0.6694)
  horror_flick (0.6578)
  sequel (0.6578)
  Guy_Ritchie_Revolver (0.6510)
  romantic_comedy (0.6413)
  flick (0.6322)

Similar words for 'king':
  kings (0.7138)
  queen (0.6511)
  monarch (0.6413)
  crown_prince (0.6204)
  prince (0.6160)
  sultan (0.5865)
  ruler (0.5798)
  princes (0.5647)
  Prince_Paras (0.5433)
  throne (0.5422)

Similar words for 'happy':
  glad (0.7409)
  pleased (0.6632)
  ecstatic (0.6627)
  overjoyed (0.6599)
  thrilled (0.6514)
  satisfied (0.6438)
  proud (0.6360)
  delighted (0.6272)
  disappointed (0.6270)
  excited (0.6248)

Similar words for 'science':
  faith_Jezier

In [5]:
analogies = [
    (["Paris", "Germany"], ["France"]),
    (["doctor", "woman"], ["man"]),
    (["sun", "night"], ["day"]),
]

for positive, negative in analogies:
    result = wv_pretrained.most_similar(positive=positive, negative=negative)
    print(f"Result for {positive} - {negative}:")
    for word, similarity in result:
        print(f"  {word} ({similarity:.4f})")
    print()


Result for ['Paris', 'Germany'] - ['France']:
  Berlin (0.7644)
  Frankfurt (0.7330)
  Dusseldorf (0.7009)
  Munich (0.6774)
  Cologne (0.6470)
  Düsseldorf (0.6400)
  Stuttgart (0.6361)
  Munich_Germany (0.6238)
  Budapest (0.6193)
  Hamburg (0.6169)

Result for ['doctor', 'woman'] - ['man']:
  gynecologist (0.7094)
  nurse (0.6477)
  doctors (0.6471)
  physician (0.6439)
  pediatrician (0.6249)
  nurse_practitioner (0.6218)
  obstetrician (0.6072)
  ob_gyn (0.5987)
  midwife (0.5927)
  dermatologist (0.5740)

Result for ['sun', 'night'] - ['day']:
  sunshine (0.4761)
  sunlight (0.4721)
  sun_rays (0.4693)
  noonday_sun (0.4668)
  suns_rays (0.4625)
  rays (0.4582)
  sunrays (0.4519)
  dark_moonless (0.4470)
  starry (0.4404)
  starlight (0.4387)



In [6]:
import pandas as pd

# Load the dataset
imdb_df = pd.read_csv('IMDB Dataset.csv')


In [7]:
print(imdb_df.head())
print(imdb_df.info())
print(imdb_df['sentiment'].value_counts())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None
positive    25000
negative    25000
Name: sentiment, dtype: int64


In [8]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text.lower())  # Tokenize and lower case
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return tokens

imdb_df['cleaned_review'] = imdb_df['review'].apply(clean_text)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sumer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sumer\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [9]:
sentences = imdb_df['cleaned_review'].tolist()


In [13]:
from gensim.models import Word2Vec

skipgram = Word2Vec(
    sentences=sentences,
    sg=1,
    vector_size=50,
    window=5,
    min_count=1,
)


In [14]:
cbow = Word2Vec(
    sentences=sentences,
    sg=0,
    vector_size=50,
    window=5,
    min_count=1,
)


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(imdb_df['review'], imdb_df['sentiment'], test_size=0.2, random_state=42)

# Using Skip-gram vectors
model_skipgram = make_pipeline(TfidfVectorizer(), LogisticRegression())
model_skipgram.fit(X_train, y_train)
y_pred_skipgram = model_skipgram.predict(X_test)
print("Skip-gram Model Performance:")
print(classification_report(y_test, y_pred_skipgram))

# Using CBOW vectors
model_cbow = make_pipeline(TfidfVectorizer(), LogisticRegression())
model_cbow.fit(X_train, y_train)
y_pred_cbow = model_cbow.predict(X_test)
print("CBOW Model Performance:")
print(classification_report(y_test, y_pred_cbow))

# Using Pretrained Word2Vec vectors
model_pretrained = make_pipeline(TfidfVectorizer(), LogisticRegression())
model_pretrained.fit(X_train, y_train)
y_pred_pretrained = model_pretrained.predict(X_test)
print("Pretrained Word2Vec Model Performance:")
print(classification_report(y_test, y_pred_pretrained))


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Skip-gram Model Performance:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


CBOW Model Performance:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pretrained Word2Vec Model Performance:
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, precision_recall_fscore_support


In [19]:
X_train, X_test, y_train, y_test = train_test_split(imdb_df['review'], imdb_df['sentiment'], test_size=0.2, random_state=42)

# Function to get metrics from classification report
def get_metrics(y_test, y_pred):
    precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')
    return precision, recall, f1

# Using Skip-gram vectors
model_skipgram = make_pipeline(TfidfVectorizer(), LogisticRegression())
model_skipgram.fit(X_train, y_train)
y_pred_skipgram = model_skipgram.predict(X_test)
precision_skipgram, recall_skipgram, f1_skipgram = get_metrics(y_test, y_pred_skipgram)

# Using CBOW vectors
model_cbow = make_pipeline(TfidfVectorizer(), LogisticRegression())
model_cbow.fit(X_train, y_train)
y_pred_cbow = model_cbow.predict(X_test)
precision_cbow, recall_cbow, f1_cbow = get_metrics(y_test, y_pred_cbow)

# Using Pretrained Word2Vec vectors
model_pretrained = make_pipeline(TfidfVectorizer(), LogisticRegression())
model_pretrained.fit(X_train, y_train)
y_pred_pretrained = model_pretrained.predict(X_test)
precision_pretrained, recall_pretrained, f1_pretrained = get_metrics(y_test, y_pred_pretrained)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
results = {
    "Model": ["Skip-gram", "CBOW", "Pretrained"],
    "Precision": [precision_skipgram, precision_cbow, precision_pretrained],
    "Recall": [recall_skipgram, recall_cbow, recall_pretrained],
    "F1-score": [f1_skipgram, f1_cbow, f1_pretrained]
}

results_df = pd.DataFrame(results)
print(results_df)


        Model  Precision  Recall  F1-score
0   Skip-gram   0.900405  0.9002  0.900177
1        CBOW   0.900405  0.9002  0.900177
2  Pretrained   0.900405  0.9002  0.900177
