In [11]:
from gensim.models import Word2Vec
import numpy as np
import re

In [17]:
# ----------------------
# Dataset
# ----------------------
from pathlib import Path
import csv

data_path = Path('data/dataset.csv')
max_samples = 50000

sentences = []
if data_path.exists():
    with data_path.open(newline='', encoding='utf-8') as f:
        reader = csv.DictReader(f)
        for i, row in enumerate(reader):
            if i >= max_samples:
                break
            text = (row.get('review_text') or '').strip()
            score = row.get('review_score')
            if not text or score is None:
                continue
            label = 1 if str(score).strip() == '1' else 0
            sentences.append((text, label))
else:
    sentences = [
        ("The movie was amazing and full of heart", 1),
        ("A boring plot with terrible acting", 0),
        ("I loved the characters but hated the ending", 0),
        ("The film was not good at all", 0),
        ("Surprisingly fun and well written", 1),
        ("I expected more it was disappointing", 0),
        ("Absolutely fantastic experience", 1),
        ("The story was dull and predictable", 0)
    ]


In [18]:
# ----------------------
# Preprocessing
# ----------------------
def tokenize(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    words = text.split()
    tokens = []
    i = 0
    while i < len(words):
        w = words[i]
        if w in {"not", "no", "never"} and i + 1 < len(words):
            tokens.append(f"{w}_{words[i + 1]}")
            i += 2
            continue
        tokens.append(w)
        i += 1
    return tokens


In [19]:
corpus = [tokenize(text) for text, _ in sentences]
print('num sentences:', len(corpus))
print('sample:', corpus[:3])


num sentences: 49861
sample: [['ruined', 'my', 'life'], ['this', 'will', 'be', 'more', 'of', 'a', 'my', 'experience', 'with', 'this', 'game', 'type', 'of', 'review', 'because', 'saying', 'things', 'like', 'great', 'gameplay', 'will', 'not_suit', 'something', 'ive', 'experienced', 'with', 'counterstrike', 'here', 'you', 'go', 'i', 'remember', 'back', 'in', 'i', 'was', 'at', 'a', 'friends', 'house', 'and', 'he', 'was', 'playing', 'a', 'game', 'i', 'didnt', 'know', 'the', 'name', 'of', 'the', 'game', 'nor', 'i', 'had', 'internet', 'to', 'find', 'it', 'a', 'few', 'weeks', 'passed', 'by', 'and', 'another', 'friend', 'came', 'over', 'he', 'didnt', 'have', 'a', 'computer', 'so', 'he', 'brought', 'a', 'disc', 'with', 'a', 'game', 'in', 'it', 'he', 'told', 'me', 'that', 'it', 'was', 'one', 'of', 'the', 'best', 'games', 'and', 'from', 'that', 'very', 'moment', 'i', 'knew', 'that', 'it', 'is', 'going', 'to', 'be', 'the', 'game', 'i', 'saw', 'at', 'the', 'other', 'friends', 'house', 'when', 'i', '

In [20]:
# ----------------------
# Train Word2Vec
# ----------------------
model = Word2Vec(corpus, vector_size=50, window=4, min_count=1, sg=1)

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [21]:
# ----------------------
# Sentence vector
# ----------------------
def sentence_vector(tokens):
    vecs = [model.wv[w] for w in tokens if w in model.wv]
    if not vecs:
        return np.zeros(model.vector_size)
    return np.mean(vecs, axis=0)

In [22]:
X = np.array([sentence_vector(tokens) for tokens in corpus])
y = np.array([label for _, label in sentences])

In [23]:
# ----------------------
# Simple sentiment prototypes
# ----------------------
pos_vec = np.mean(X[y == 1], axis=0)
neg_vec = np.mean(X[y == 0], axis=0)

In [24]:
def cosine(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

def predict(sentence):
    v = sentence_vector(tokenize(sentence))
    return "positive" if cosine(v, pos_vec) > cosine(v, neg_vec) else "negative"

In [25]:
# ----------------------
# Try it
# ----------------------
tests = [
    "great acting and wonderful story",
    "painfully slow and boring",
    "not bad but not great",
    "I loved the visuals"
]

for t in tests:
    print(t, "→", predict(t))

great acting and wonderful story → positive
painfully slow and boring → negative
not bad but not great → positive
I loved the visuals → positive
