In [3]:
import os

     

In [4]:
def load_reviews_from_folder(folder_path, label):
    """Read every ``.txt`` file in ``folder_path`` and tag each with ``label``.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive). Entries whose name does not end
        with ``.txt`` are ignored.
    label : Any
        Value repeated once per loaded file in the returned label list.

    Returns
    -------
    tuple[list[str], list]
        ``(reviews, labels)`` where ``reviews[i]`` is the full UTF-8 decoded
        content of the i-th file (in sorted filename order) and
        ``labels[i]`` is ``label``.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` / ``open`` (e.g. missing directory).
    UnicodeDecodeError
        If a file is not valid UTF-8.
    """
    reviews = []
    # os.listdir returns entries in arbitrary order; sorting makes the review
    # order (and therefore any "first sample" printed later) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            reviews.append(file.read())
    labels = [label] * len(reviews)
    return reviews, labels

# Base directory of the extracted aclImdb dataset.
# Set the ACLIMDB_DIR environment variable to point somewhere else; the
# original local path is kept as the fallback so existing setups still work.
# (Forward slashes are accepted on Windows, so no backslash escaping is needed.)
base_path = os.environ.get("ACLIMDB_DIR", r"C:/Users/tanma/Downloads/aclImdb")

# Training data paths
train_pos_path = os.path.join(base_path, "train", "pos")
train_neg_path = os.path.join(base_path, "train", "neg")

# Load data: reviews under "pos" get label 1, reviews under "neg" get label 0
pos_reviews, pos_labels = load_reviews_from_folder(train_pos_path, 1)
neg_reviews, neg_labels = load_reviews_from_folder(train_neg_path, 0)

# Combine both (all positives first, then all negatives)
all_reviews = pos_reviews + neg_reviews
all_labels = pos_labels + neg_labels

print(f"Total reviews loaded: {len(all_reviews)}")
print(f"Sample review: {all_reviews[0][:200]}...")

Total reviews loaded: 25000
Sample review: Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's ...


In [5]:
import re

def clean_text(text):
    """Return ``text`` lower-cased with punctuation stripped.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is deleted outright, so words separated only
    by punctuation are joined together.
    """
    return re.sub(r"[^\w\s]", "", text.lower())


In [6]:
# Apply the text normaliser to every loaded review, keeping the original order.
cleaned_reviews = list(map(clean_text, all_reviews))

# The count reported is that of the raw reviews; the sample shown is the
# first 200 characters of the first cleaned review.
print(f"Total reviews loaded: {len(all_reviews)}")
print(f"Sample review: {cleaned_reviews[0][:200]}...")

Total reviews loaded: 25000
Sample review: bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my 35 years in the teaching profession lead me to believe that bromwell highs satire...


In [7]:
# Whitespace-tokenise each cleaned review into a list of words.
tokenized_reviews = list(map(str.split, cleaned_reviews))


In [8]:
from gensim.models import Word2Vec

# Train word embeddings on the tokenised reviews themselves.
# min_count=1 keeps every token that occurs at least once; no seed is passed,
# so the learned vectors can differ between runs.
sentences = tokenized_reviews
model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=10,
    min_count=1,
)


In [9]:
import numpy as np

# Collapse one tokenised review into a single fixed-length vector.
def get_review_vector(sentence, model):
    """Average the embeddings of the words in ``sentence``.

    ``sentence`` is iterated word by word; words missing from ``model.wv``
    are skipped. When no word is found, a zero vector of length
    ``model.vector_size`` is returned instead.
    """
    known_vectors = [model.wv[word] for word in sentence if word in model.wv]
    if known_vectors:
        return np.mean(known_vectors, axis=0)
    return np.zeros(model.vector_size)

# Feature matrix: one averaged embedding per review, in review order.
X = np.array([get_review_vector(tokens, model) for tokens in tokenized_reviews])
# Labels as a flat array, aligned with the rows of X.
y = np.asarray(all_labels)

In [10]:
# Network sizes. INPUT_DIM must equal the Word2Vec vector_size (100) used to
# build the review vectors.
INPUT_DIM = 100
HIDDEN_UNITS = 256
INIT_SCALE = 0.05

# Seeded local generator so the initial weights (and therefore the training
# trajectory) are reproducible under Restart & Run All.
SEED = 42
rng = np.random.default_rng(SEED)

# Hidden layer: weights (HIDDEN_UNITS, INPUT_DIM), bias (1, HIDDEN_UNITS).
wei1 = INIT_SCALE * rng.standard_normal((HIDDEN_UNITS, INPUT_DIM))
bai1 = INIT_SCALE * rng.standard_normal((1, HIDDEN_UNITS))
# Output layer: weights (1, HIDDEN_UNITS), bias (1, 1).
wei2 = INIT_SCALE * rng.standard_normal((1, HIDDEN_UNITS))
bai2 = INIT_SCALE * rng.standard_normal((1, 1))

In [11]:

# Labels as a float column vector of shape (n_samples, 1).
y = np.asarray(all_labels, dtype=float).reshape(-1, 1)
def sigmoid(x):
    """Element-wise logistic function ``1 / (1 + exp(-x))``, overflow-free.

    Parameters
    ----------
    x : array_like
        Scalar or array of real values.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as ``x`` (0-d for scalars), values in [0, 1].
    """
    x = np.asarray(x)
    # np.where evaluates BOTH of its value arguments for every element, so
    # each must be safe on the whole input. exp(-|x|) always lies in (0, 1]
    # and therefore can never overflow, whatever the sign or size of x.
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x))
    return np.where(x >= 0, 1 / (1 + z), z / (1 + z))

In [12]:
def loss(X, y, wei1, wei2, bai1, bai2):
    """Binary cross-entropy of a one-hidden-layer tanh network, plus gradients.

    Forward pass: ``tanh(X @ wei1.T + bai1)`` feeds a single sigmoid output
    ``sigmoid(hidden @ wei2.T + bai2)``. The pre-sigmoid value is clipped to
    [-100, 100] and the probability to [1e-10, 1 - 1e-10] before the logs.

    Returns ``(l, dwei1, dbai1, dwei2, dbai2)``. ``l`` is the MEAN loss over
    all entries, whereas the gradients are SUMMED over samples (they are not
    divided by the sample count), so the step size used by the caller
    absorbs that factor.

    Assumes ``y`` is a column vector matching the network output so that
    ``probs - y`` does not broadcast — TODO confirm at call sites.
    """
    # ---- forward ----
    hidden_pre = np.dot(X, wei1.T) + bai1
    hidden_act = np.tanh(hidden_pre)
    logits = np.clip(np.dot(hidden_act, wei2.T) + bai2, -100, 100)
    probs = np.clip(sigmoid(logits), 1e-10, 1 - 1e-10)
    l = -np.mean(y * np.log(probs) + (1 - y) * np.log(1 - probs))

    # ---- backward ----
    # Error signal at the output (uses the clipped probabilities).
    delta_out = probs - y
    dwei2 = np.dot(delta_out.T, hidden_act)
    dbai2 = np.sum(delta_out, axis=0, keepdims=True)
    # Push the error through the output weights and the tanh derivative.
    delta_hidden = np.dot(delta_out, wei2) * (1 - hidden_act ** 2)
    dwei1 = np.dot(delta_hidden.T, X)
    dbai1 = np.sum(delta_hidden, axis=0, keepdims=True)

    return l, dwei1, dbai1, dwei2, dbai2

In [16]:
# Full-batch gradient descent: 1000 steps, all four parameter arrays updated
# in place with the same step size. The gradients returned by loss() are
# sums over samples, hence the very small step.
lr = 0.000001
for i in range(1000):
    l, dwei1, dbai1, dwei2, dbai2 = loss(X, y, wei1, wei2, bai1, bai2)
    for param, grad in ((wei1, dwei1), (bai1, dbai1), (wei2, dwei2), (bai2, dbai2)):
        param -= lr * grad  # in-place ndarray update
    print(l)

0.4441241391214617
0.44410494653430505
0.44408577722141684
0.44406663114327627
0.4440475082604424
0.4440284085335547
0.44400933192333186
0.44399027839057265
0.4439712478961547
0.44395224040103537
0.4439332558662506
0.4439142942529154
0.4438953555222233
0.4438764396354463
0.4438575465539346
0.44383867623911655
0.4438198286524984
0.4438010037556642
0.44378220151027514
0.44376342187807005
0.443744664820865
0.44372593030055263
0.4437072182791028
0.44368852871856185
0.44366986158105237
0.4436512168287734
0.4436325944239999
0.4436139943290827
0.4435954165064485
0.4435768609185995
0.44355832752811314
0.44353981629764205
0.4435213271899139
0.4435028601677312
0.443484415193971
0.44346599223158506
0.4434475912435991
0.44342921219311326
0.44341085504330136
0.44339251975741134
0.4433742062987643
0.4433559146307551
0.4433376447168517
0.4433193965205953
0.4433011700055999
0.443282965135552
0.4432647818742113
0.4432466201854094
0.44322848003305015
0.4432103613811097
0.44319226419363594
0.443174188434

In [17]:
# Accuracy of the fitted network on the same X / y it was trained on.
# Forward pass (mirrors the one inside loss()).
out = np.dot(X, wei1.T) + bai1
out2 = np.tanh(out)
out3 = np.clip(np.dot(out2, wei2.T) + bai2, -100, 100)
out4 = np.clip(sigmoid(out3), 1e-10, 1 - 1e-10)
# Single threshold: > 0.5 is class 1, everything else (including exactly 0.5)
# is class 0 — the same rule predict_review applies. Two separate masked
# assignments would leave a probability of exactly 0.5 unassigned, making it
# count as wrong for either label.
out4 = (out4 > 0.5).astype(float)
np.mean(out4 == y)

0.80596

In [18]:
def clean_text(text):
    """Lower-case ``text`` and delete every non-word, non-whitespace character.

    Same normalisation as applied to the training reviews, so prediction
    inputs go through identical preprocessing.
    """
    strip_punctuation = re.compile(r"[^\w\s]").sub
    return strip_punctuation("", text.lower())

def get_review_vector(sentence, model):
    """Average the word embeddings of one review.

    Parameters
    ----------
    sentence : str or iterable of str
        Either a cleaned review string (split on whitespace here) or an
        already-tokenised list of words. Accepting both keeps this
        definition compatible with callers that pass token lists, since it
        replaces any earlier function of the same name once this cell runs.
    model
        Object exposing ``wv`` (supports ``word in wv`` and ``wv[word]``)
        and ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Mean of the vectors of the words found in ``model.wv``; a zero
        vector of length ``model.vector_size`` when none is found.
    """
    tokens = sentence.split() if isinstance(sentence, str) else list(sentence)
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

def predict_review(text, model, wei1, wei2, bai1, bai2):
    """Classify one raw review string as positive or negative.

    Parameters
    ----------
    text : str
        Raw review text; it is cleaned with ``clean_text`` and embedded with
        ``get_review_vector`` before being fed to the network.
    model
        Word-embedding model passed through to ``get_review_vector``.
    wei1, wei2, bai1, bai2 : numpy.ndarray
        Hidden/output weights and biases of the tanh + sigmoid network.

    Returns
    -------
    str
        ``"Positive 😊"`` when the output probability is > 0.5, otherwise
        ``"Negative 😞"``.
    """
    text = clean_text(text)
    vector = get_review_vector(text, model)
    # Feed the averaged embedding UNSCALED: the network in this notebook is
    # fitted and evaluated directly on the raw averaged vectors, so
    # standardising here would move the input into a space the weights never
    # saw. It also avoids a hidden dependency on a global training matrix.
    vector = np.nan_to_num(vector).reshape(1, -1)

    # Forward pass (same as training: tanh hidden layer, sigmoid output)
    out = np.dot(vector, wei1.T) + bai1
    out2 = np.tanh(out)
    out3 = np.dot(out2, wei2.T) + bai2
    out3 = np.clip(out3, -100, 100)  # keeps exp() below well within range
    out4 = 1 / (1 + np.exp(-out3))

    return "Positive 😊" if out4[0][0] > 0.5 else "Negative 😞"


In [19]:
# Hand-written negative review used as a smoke test for the classifier.
my_review = (
    "The trailer really had me excited for this movie.But once I actually watched it, "
    "the excitement vanished. The plot made no sense, characters were flat and forgettable, "
    "and the pacing was painfully slow. The twists were predictable, "
    "and the ending felt rushed and unsatisfying. "
    "Honestly, I kept checking the time, waiting for it to be over"
)
result = predict_review(
    my_review,
    model,
    wei1,
    wei2,
    bai1,
    bai2,
)
print("Prediction:", result)



Prediction: Negative 😞


In [20]:
# Persist the network parameters together with the feature matrix X in one
# .npz archive, each array stored under its variable name.
params_to_save = {"wei1": wei1, "wei2": wei2, "bai1": bai1, "bai2": bai2, "X": X}
np.savez("model_params.npz", **params_to_save)


In [21]:
# Bare expression: shows the Word2Vec object's repr as this cell's output.
model

<gensim.models.word2vec.Word2Vec at 0x1967fc94320>

In [22]:
# Write the trained Word2Vec model to disk (path is relative to the working directory).
model.save("word2vec_review.model")  