<a href="https://colab.research.google.com/github/tugsukage/SentimentBert/blob/main/LogReg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Core libraries
import os
import re
import string
import numpy as np
import pandas as pd

# Scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize

# Gensim
!pip install gensim
import gensim
from gensim.models import Word2Vec
import gensim.downloader as api

# Transformers / sentence embeddings
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModel
import torch

# Optional: stopwords
# Best-effort setup: if NLTK cannot be imported or the stopword corpus cannot be
# loaded, EN_STOPWORDS falls back to an empty set. clean_text() only filters
# stopwords when this set is non-empty, so the fallback silently disables that step.
# NOTE(review): the broad `except Exception` hides why the fallback happened.
try:
    import nltk
    nltk.download('stopwords', quiet=True)
    from nltk.corpus import stopwords
    EN_STOPWORDS = set(stopwords.words('english'))
except Exception:
    EN_STOPWORDS = set()

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m29.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0




An earlier run raised a `FileNotFoundError` because the dataset was not available at the specified `DATA_DIR`: the original value, `'/Users/tugs-erdene/Desktop/ЭХБ/models/aclImdb'`, is a path on a local machine and does not exist in the Colab environment. Assuming the data is the `aclImdb` (IMDB movie reviews) dataset, the next cell downloads and extracts it into the Colab environment, and `DATA_DIR` is then updated to point to the newly extracted location.

In [None]:
# Download the aclImdb dataset
# -nc (no-clobber) skips the download when aclImdb_v1.tar.gz is already present.
!wget -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# Extract the archive into the current working directory.
# NOTE(review): DATA_DIR is set to /content/aclImdb, which presumably assumes this
# cell runs from /content (the Colab default) — confirm when running elsewhere.
!tar -xzf aclImdb_v1.tar.gz

--2025-12-16 15:46:37--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-12-16 15:46:39 (38.1 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
# Root folder of the extracted dataset; read_reviews() reads the
# train/pos, train/neg, test/pos and test/neg sub-folders beneath it.
DATA_DIR = "/content/aclImdb"

def read_reviews(base_dir):
    """Load the labelled review texts stored under ``base_dir``.

    The directory must contain ``train/pos``, ``train/neg``, ``test/pos`` and
    ``test/neg`` sub-folders holding one UTF-8 text file per review.

    Args:
        base_dir: Path to the dataset root.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of plain lists. Within
        each split the positive reviews (label 1) come first, followed by the
        negative reviews (label 0); inside each folder the reviews are ordered
        by file name.

    Raises:
        FileNotFoundError: If one of the four expected folders is missing.
    """
    def read_folder(path, label):
        """Return (texts, labels) for every regular file directly inside ``path``."""
        texts = []
        # os.listdir() returns entries in arbitrary, filesystem-dependent order.
        # Sorting makes the sample order (and everything trained on it)
        # reproducible across machines and runs.
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if not os.path.isfile(fpath):
                continue  # Skip sub-directories and other non-file entries.
            with open(fpath, "r", encoding="utf-8") as f:
                texts.append(f.read())
        return texts, [label] * len(texts)

    train_pos, y_train_pos = read_folder(os.path.join(base_dir, "train", "pos"), 1)
    train_neg, y_train_neg = read_folder(os.path.join(base_dir, "train", "neg"), 0)
    test_pos, y_test_pos = read_folder(os.path.join(base_dir, "test", "pos"), 1)
    test_neg, y_test_neg = read_folder(os.path.join(base_dir, "test", "neg"), 0)

    X_train = train_pos + train_neg
    y_train = y_train_pos + y_train_neg
    X_test = test_pos + test_neg
    y_test = y_test_pos + y_test_neg
    return X_train, y_train, X_test, y_test

# Read every review into memory. Labels are 1 for the "pos" folders and 0 for
# the "neg" folders; the *_raw lists hold the unmodified file contents.
X_train_raw, y_train, X_test_raw, y_test = read_reviews(DATA_DIR)
print(f"Loaded: train={len(X_train_raw)}, test={len(X_test_raw)}")

Loaded: train=25000, test=25000


In [None]:
# Basic text cleaning
# Non-greedy match of anything that looks like an HTML tag, e.g. "<br />".
HTML_RE = re.compile(r"<.*?>")
# Translation table that deletes every ASCII punctuation character.
PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text, lowercase=True, remove_html=True, remove_punct=True, stopword_removal=False):
    """Normalise one review into a single-space-separated string.

    Steps, each controlled by its flag and applied in this order:
      1. ``remove_html``      - replace HTML-like tags with a space.
      2. ``lowercase``        - lower-case the text.
      3. ``remove_punct``     - delete ASCII punctuation. Characters are removed,
                                not replaced, so "it's" becomes "its" and
                                "good.bad" becomes "goodbad".
      4. whitespace split     - always performed; collapses runs of whitespace.
      5. ``stopword_removal`` - drop tokens found in the module-level
                                ``EN_STOPWORDS`` set; skipped when that set is
                                empty.

    Returns:
        The surviving tokens joined by single spaces (possibly an empty string).
    """
    cleaned = HTML_RE.sub(" ", text) if remove_html else text
    if lowercase:
        cleaned = cleaned.lower()
    if remove_punct:
        cleaned = cleaned.translate(PUNCT_TABLE)

    words = cleaned.split()
    # Guard clause: nothing to filter unless requested AND a stopword list exists.
    if not (stopword_removal and EN_STOPWORDS):
        return " ".join(words)
    return " ".join([w for w in words if w not in EN_STOPWORDS])

# Apply cleaning (toggle stopword_removal=True if desired)
# X_train / X_test hold the cleaned strings that the vectorizer and embedding
# cells below consume; the *_raw lists are left untouched.
X_train = [clean_text(t, stopword_removal=False) for t in X_train_raw]
X_test = [clean_text(t, stopword_removal=False) for t in X_test_raw]


In [None]:
# Shared evaluation helper
def evaluate_and_print(name, y_true, y_pred, store_list):
    """Report binary-classification metrics for one model and record them.

    Args:
        name: Label for the feature/embedding variant; used in the printed
            header and stored under the "Embedding" key.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        store_list: List that receives one summary dict per call (mutated in
            place).

    Precision, recall and F1 are computed for the positive class (label 1);
    ``zero_division=0`` makes an undefined ratio count as 0 rather than warn.
    Returns None.
    """
    accuracy = accuracy_score(y_true, y_pred)
    prf = precision_recall_fscore_support(
        y_true, y_pred, average="binary", pos_label=1, zero_division=0
    )
    prec, rec, f_score = prf[0], prf[1], prf[2]
    matrix = confusion_matrix(y_true, y_pred)

    report_lines = (
        f"\n=== {name} ===",
        f"Accuracy:  {accuracy:.4f}",
        f"Precision: {prec:.4f}",
        f"Recall:    {rec:.4f}",
        f"F1-score:  {f_score:.4f}",
        "Confusion matrix:",
    )
    for line in report_lines:
        print(line)
    print(matrix)

    summary = {
        "Embedding": name,
        "Accuracy": accuracy,
        "Precision": prec,
        "Recall": rec,
        "F1": f_score,
    }
    store_list.append(summary)


In [None]:
# Collects one metrics dict per feature variant; evaluate_and_print() appends to it.
# NOTE(review): re-running this cell resets the list and discards earlier entries.
results = []

# Vectorize with TF (L1-normalized counts)
# The vocabulary is fitted on the training reviews only; the test reviews are
# transformed with that same fitted vocabulary.
count_vec_tf = CountVectorizer()
X_train_counts = count_vec_tf.fit_transform(X_train)
X_test_counts = count_vec_tf.transform(X_test)
# L1 row normalization turns raw counts into per-document relative frequencies.
X_train_tf = normalize(X_train_counts.astype(float), norm="l1", axis=1)
X_test_tf = normalize(X_test_counts.astype(float), norm="l1", axis=1)

# Train Logistic Regression
logreg_tf = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg_tf.fit(X_train_tf, y_train)
y_pred_tf = logreg_tf.predict(X_test_tf)

evaluate_and_print("TF (term frequency)", y_test, y_pred_tf, results)



=== TF (term frequency) ===
Accuracy:  0.7246
Precision: 0.7170
Recall:    0.7423
F1-score:  0.7294
Confusion matrix:
[[8837 3663]
 [3221 9279]]


In [None]:
# Binary presence + IDF transform (no normalization)
# binary=True records only whether a term occurs in a document, not how often.
count_vec_bin = CountVectorizer(binary=True)
X_train_bin = count_vec_bin.fit_transform(X_train)
X_test_bin = count_vec_bin.transform(X_test)

# norm=None leaves the rows unnormalized, so each present term contributes its
# IDF weight. The IDF statistics are fitted on the training matrix only.
idf_transformer = TfidfTransformer(use_idf=True, norm=None)
X_train_idf = idf_transformer.fit_transform(X_train_bin)
X_test_idf = idf_transformer.transform(X_test_bin)

# Same classifier settings as the other feature variants.
logreg_idf = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg_idf.fit(X_train_idf, y_train)
y_pred_idf = logreg_idf.predict(X_test_idf)

evaluate_and_print("IDF only (binary presence × IDF)", y_test, y_pred_idf, results)



=== IDF only (binary presence × IDF) ===
Accuracy:  0.8734
Precision: 0.8790
Recall:    0.8660
F1-score:  0.8725
Confusion matrix:
[[11010  1490]
 [ 1675 10825]]


In [None]:
# TF-IDF over single words (unigrams). min_df=2 drops terms that appear in
# fewer than two training documents.
tfidf_uni = TfidfVectorizer(ngram_range=(1,1), min_df=2)
X_train_tfidf_uni = tfidf_uni.fit_transform(X_train)
X_test_tfidf_uni = tfidf_uni.transform(X_test)

# Same classifier settings as the other feature variants.
logreg_tfidf_uni = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg_tfidf_uni.fit(X_train_tfidf_uni, y_train)
y_pred_tfidf_uni = logreg_tfidf_uni.predict(X_test_tfidf_uni)

evaluate_and_print("TF-IDF (unigram)", y_test, y_pred_tfidf_uni, results)



=== TF-IDF (unigram) ===
Accuracy:  0.8854
Precision: 0.8854
Recall:    0.8853
F1-score:  0.8854
Confusion matrix:
[[11068  1432]
 [ 1434 11066]]


In [None]:
# TF-IDF over unigrams and bigrams (ngram_range=(1,2)); min_df=2 drops n-grams
# that appear in fewer than two training documents.
tfidf_unibi = TfidfVectorizer(ngram_range=(1,2), min_df=2)
X_train_tfidf_unibi = tfidf_unibi.fit_transform(X_train)
X_test_tfidf_unibi = tfidf_unibi.transform(X_test)

# Same classifier settings as the other feature variants.
logreg_tfidf_unibi = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg_tfidf_unibi.fit(X_train_tfidf_unibi, y_train)
y_pred_tfidf_unibi = logreg_tfidf_unibi.predict(X_test_tfidf_unibi)

evaluate_and_print("TF-IDF (unigram+bigram)", y_test, y_pred_tfidf_unibi, results)



=== TF-IDF (unigram+bigram) ===
Accuracy:  0.8908
Precision: 0.8855
Recall:    0.8977
F1-score:  0.8915
Confusion matrix:
[[11049  1451]
 [ 1279 11221]]


In [None]:
# Tokenize for Word2Vec
def tokenize_for_w2v(text):
    """Split a review string into tokens on runs of whitespace.

    Returns a list of tokens; an empty or whitespace-only string yields [].
    """
    tokens = text.split()
    return tokens

# Tokenize the cleaned reviews once; train_tokens / test_tokens are reused by
# the later Word2Vec and pretrained-vector cells.
train_tokens = [tokenize_for_w2v(t) for t in X_train]
test_tokens = [tokenize_for_w2v(t) for t in X_test]

# Train CBOW (sg=0)
# Only the training tokens are passed in, so the test reviews do not influence
# the learned word vectors.
# NOTE(review): no `seed` is passed and workers=4, so the vectors (and the
# downstream metrics) presumably vary slightly between runs — confirm whether
# exact reproducibility is required.
w2v_cbow = Word2Vec(
    sentences=train_tokens,
    vector_size=200,  # dimensionality of each word vector
    window=5,  # context window size
    min_count=2,  # ignore words seen fewer than twice
    workers=4,
    sg=0,
    epochs=10
)

# Build averaged document vectors
def doc_vector(tokens, model):
    """Average the word vectors of ``tokens`` into one document vector.

    Args:
        tokens: Iterable of word strings.
        model: Object exposing ``model.wv``, which must support ``in``,
            indexing by word, and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or a
        float32 zero vector of length ``vector_size`` when no token is known.
    """
    keyed = model.wv
    known = []
    for tok in tokens:
        if tok in keyed:
            known.append(keyed[tok])
    if not known:
        # No in-vocabulary token: fall back to an all-zero embedding.
        return np.zeros(keyed.vector_size, dtype=np.float32)
    return np.mean(known, axis=0)

# One averaged vector per review, stacked row-wise into a 2-D feature matrix
# (rows = reviews, columns = the 200 CBOW dimensions).
X_train_cbow = np.vstack([doc_vector(toks, w2v_cbow) for toks in train_tokens])
X_test_cbow = np.vstack([doc_vector(toks, w2v_cbow) for toks in test_tokens])

# Same classifier settings as the other feature variants.
logreg_cbow = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg_cbow.fit(X_train_cbow, y_train)
y_pred_cbow = logreg_cbow.predict(X_test_cbow)

evaluate_and_print("Word2Vec CBOW (trained on IMDB)", y_test, y_pred_cbow, results)



=== Word2Vec CBOW (trained on IMDB) ===
Accuracy:  0.8476
Precision: 0.8485
Recall:    0.8463
F1-score:  0.8474
Confusion matrix:
[[10611  1889]
 [ 1921 10579]]


In [None]:
# Train Skip-gram (sg=1)
# Identical hyperparameters to the CBOW model except for `sg`. This cell reuses
# train_tokens, test_tokens and doc_vector() defined in the CBOW cell, so that
# cell must have been run first.
# NOTE(review): no `seed` is passed and workers=4, so results presumably vary
# slightly between runs — confirm whether exact reproducibility is required.
w2v_sg = Word2Vec(
    sentences=train_tokens,
    vector_size=200,
    window=5,
    min_count=2,
    workers=4,
    sg=1,
    epochs=10
)

# One averaged Skip-gram vector per review, stacked into a 2-D feature matrix.
X_train_sg = np.vstack([doc_vector(toks, w2v_sg) for toks in train_tokens])
X_test_sg = np.vstack([doc_vector(toks, w2v_sg) for toks in test_tokens])

# Same classifier settings as the other feature variants.
logreg_sg = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg_sg.fit(X_train_sg, y_train)
y_pred_sg = logreg_sg.predict(X_test_sg)

evaluate_and_print("Word2Vec Skip-gram (trained on IMDB)", y_test, y_pred_sg, results)



=== Word2Vec Skip-gram (trained on IMDB) ===
Accuracy:  0.8598
Precision: 0.8646
Recall:    0.8533
F1-score:  0.8589
Confusion matrix:
[[10829  1671]
 [ 1834 10666]]


In [None]:
# Choose one: 'word2vec-google-news-300' (large) or 'fasttext-wiki-news-subwords-300'
# 'glove-wiki-gigaword-300' is another option if the above aren't available.
# The GoogleNews vectors are tried first; if loading raises any exception the
# FastText vectors are loaded instead. `model_name` records which set was
# actually loaded and is later used as the label in the results.
# NOTE(review): `except Exception` also swallows failures unrelated to
# availability, and the fallback load itself is not guarded.
try:
    pretrained = api.load("word2vec-google-news-300")  # ~1.5GB
    model_name = "Pretrained Word2Vec (GoogleNews)"
except Exception:
    pretrained = api.load("fasttext-wiki-news-subwords-300")
    model_name = "Pretrained FastText (WikiNews)"

def doc_vector_pretrained(tokens, keyed_vectors):
    """Average pretrained word vectors into a single document vector.

    Args:
        tokens: Iterable of word strings.
        keyed_vectors: Lookup object supporting ``in``, indexing by word, and a
            ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of all tokens present in
        ``keyed_vectors``, or a float32 zero vector of length ``vector_size``
        when none of the tokens is present.
    """
    hits = [keyed_vectors[tok] for tok in tokens if tok in keyed_vectors]
    if hits:
        return np.mean(hits, axis=0)
    # Nothing matched the vocabulary: use an all-zero embedding.
    return np.zeros(keyed_vectors.vector_size, dtype=np.float32)

# One averaged pretrained vector per review, stacked into a 2-D feature matrix.
# Reuses train_tokens / test_tokens built in the Word2Vec CBOW cell.
X_train_pre = np.vstack([doc_vector_pretrained(toks, pretrained) for toks in train_tokens])
X_test_pre = np.vstack([doc_vector_pretrained(toks, pretrained) for toks in test_tokens])

# Same classifier settings as the other feature variants.
logreg_pre = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg_pre.fit(X_train_pre, y_train)
y_pred_pre = logreg_pre.predict(X_test_pre)

# model_name reflects whichever pretrained vectors were actually loaded.
evaluate_and_print(model_name, y_test, y_pred_pre, results)



=== Pretrained Word2Vec (GoogleNews) ===
Accuracy:  0.8483
Precision: 0.8560
Recall:    0.8376
F1-score:  0.8467
Confusion matrix:
[[10738  1762]
 [ 2030 10470]]


In [None]:
# General sentence embeddings (compact and strong)
st_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# convert_to_numpy=True makes encode() return NumPy arrays, which are passed
# straight to LogisticRegression below.
# NOTE(review): X_train / X_test are the output of clean_text() (lowercased,
# punctuation removed). The model applies its own tokenizer, so encoding
# X_train_raw / X_test_raw instead may be worth comparing — confirm.
X_train_st = st_model.encode(X_train, batch_size=64, convert_to_numpy=True, show_progress_bar=True)
X_test_st = st_model.encode(X_test, batch_size=64, convert_to_numpy=True, show_progress_bar=True)

# Same classifier settings as the other feature variants.
logreg_st = LogisticRegression(max_iter=5000, n_jobs=-1)
logreg_st.fit(X_train_st, y_train)
y_pred_st = logreg_st.predict(X_test_st)

evaluate_and_print("BERT sentence embeddings (all-MiniLM-L6-v2)", y_test, y_pred_st, results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

Batches:   0%|          | 0/391 [00:00<?, ?it/s]


=== BERT sentence embeddings (all-MiniLM-L6-v2) ===
Accuracy:  0.8192
Precision: 0.8218
Recall:    0.8151
F1-score:  0.8184
Confusion matrix:
[[10290  2210]
 [ 2311 10189]]


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): the checkpoint name suggests a BERT-base model fine-tuned on
# IMDB; if its fine-tuning data overlaps the test split used here, scores from
# these embeddings may be optimistic — confirm.
dabert_name = "textattack/bert-base-uncased-imdb"
dabert_tok = AutoTokenizer.from_pretrained(dabert_name)
# Loaded through AutoModel (not a task-specific class); bert_embed() below reads
# `last_hidden_state` from its output to build the embeddings.
dabert = AutoModel.from_pretrained(dabert_name).to(device)

@torch.no_grad()
def bert_embed(texts, tokenizer, model, max_length=128, batch_size=32):
    """Embed texts with a transformer encoder using masked mean pooling.

    Args:
        texts: Sliceable sequence of strings (list, tuple, array, ...).
        tokenizer: Callable compatible with Hugging Face tokenizers; it must
            return a mapping of tensors that includes ``attention_mask``.
        model: ``torch.nn.Module`` whose output exposes ``last_hidden_state``
            of shape (batch, seq_len, hidden).
        max_length: Token limit per text; longer inputs are truncated.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A 2-D NumPy array with one row per input text. For empty input the
        result has zero rows and ``model.config.hidden_size`` columns (0
        columns if the model has no such attribute).

    Side effects:
        Puts ``model`` into evaluation mode.
    """
    model.eval()  # Disable dropout so the embeddings are deterministic.

    # Send the inputs to wherever the model's weights live rather than relying
    # on a module-level `device` variable, so the function also works for
    # models placed on a different device or used before that global exists.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    batch_outputs = []
    for start in range(0, len(texts), batch_size):
        # list() lets non-list sequences (tuples, arrays) reach the tokenizer as a list.
        batch_texts = list(texts[start:start + batch_size])
        encoded_input = tokenizer(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
        encoded_input = {k: v.to(target_device) for k, v in encoded_input.items()}
        token_states = model(**encoded_input).last_hidden_state

        # Mean pooling: average the token embeddings, weighting padding
        # positions by 0 through the attention mask.
        mask = encoded_input['attention_mask'].unsqueeze(-1).expand(token_states.size()).float()
        summed = torch.sum(token_states * mask, 1)
        # Clamp guards against division by zero for an all-padding row.
        token_counts = torch.clamp(mask.sum(1), min=1e-9)
        batch_outputs.append((summed / token_counts).cpu().numpy())

    if not batch_outputs:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)
    return np.concatenate(batch_outputs, axis=0)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/511 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]