<a href="https://colab.research.google.com/github/tugsukage/SentimentBert/blob/main/RandomF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Download the aclImdb dataset
!wget -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

File ‘aclImdb_v1.tar.gz’ already there; not retrieving.



In [None]:
!pip install gensim sentence_transformers



In [None]:
import os, re, string, time, logging
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
# NOTE: gensim and sentence_transformers are installed by the dedicated install cell above; no reinstall is needed here.
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

# -----------------------------
# Logging setup
# -----------------------------
# basicConfig() does nothing when the root logger already has handlers, which is
# common in hosted notebook runtimes; in that case the INFO-level metric lines
# would be dropped. force=True replaces any pre-installed handlers so the level
# and format below always take effect.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    force=True,
)
logger = logging.getLogger(__name__)

# -----------------------------
# Load IMDB dataset
# -----------------------------
# The download cell unpacks the archive into the current working directory, so
# resolve the dataset folder relative to it instead of hard-coding the Colab
# path (on Colab this still evaluates to /content/aclImdb).
DATA_DIR = os.path.abspath("aclImdb")

def read_reviews(base_dir):
    """Load the train and test review texts and labels found under ``base_dir``.

    The directory is expected to contain ``train/pos``, ``train/neg``,
    ``test/pos`` and ``test/neg`` sub-folders with one UTF-8 text file per
    review. Sub-directories inside those folders are ignored.

    Args:
        base_dir: Root folder of the extracted dataset.

    Returns:
        A tuple ``(train_texts, train_labels, test_texts, test_labels)`` of
        plain lists. Within each split the positive reviews (label 1) come
        first, followed by the negative reviews (label 0).

    Raises:
        FileNotFoundError: If one of the four expected folders is missing.
    """
    def read_folder(path, label):
        texts = []
        # os.listdir() returns entries in arbitrary, filesystem-dependent order.
        # Sorting keeps the sample order -- and therefore anything downstream
        # that depends on it, such as unshuffled CV folds -- stable across runs.
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if os.path.isfile(fpath):
                with open(fpath, "r", encoding="utf-8") as f:
                    texts.append(f.read())
        return texts, [label] * len(texts)

    train_pos, y_train_pos = read_folder(os.path.join(base_dir, "train", "pos"), 1)
    train_neg, y_train_neg = read_folder(os.path.join(base_dir, "train", "neg"), 0)
    test_pos, y_test_pos = read_folder(os.path.join(base_dir, "test", "pos"), 1)
    test_neg, y_test_neg = read_folder(os.path.join(base_dir, "test", "neg"), 0)

    return train_pos+train_neg, y_train_pos+y_train_neg, test_pos+test_neg, y_test_pos+y_test_neg

# Read both splits from DATA_DIR and report how many reviews each one holds.
(X_train_raw, y_train, X_test_raw, y_test) = read_reviews(base_dir=DATA_DIR)
print(f"Loaded: train={len(X_train_raw)}, test={len(X_test_raw)}")
# -----------------------------
# Preprocessing
# -----------------------------
# Non-greedy match of anything that looks like an HTML tag (e.g. "<br />").
HTML_RE = re.compile(r"<.*?>")
# Translation table that deletes every ASCII punctuation character.
PUNCT_TABLE = str.maketrans(dict.fromkeys(string.punctuation))


def clean_text(text):
    """Normalise one review.

    HTML tags are replaced by a single space, the text is lower-cased and all
    ASCII punctuation is deleted (not replaced), so ``"it's"`` becomes
    ``"its"``. Whitespace is left as is.
    """
    without_tags = HTML_RE.sub(" ", text)
    return without_tags.lower().translate(PUNCT_TABLE)

# Apply the same cleaning to every review of both splits.
X_train = list(map(clean_text, X_train_raw))
X_test = list(map(clean_text, X_test_raw))

# -----------------------------
# Evaluation helper
# -----------------------------
def evaluate(name, y_true, y_pred, results):
    """Score binary predictions, log the metrics and record them in ``results``.

    Args:
        name: Label of the feature representation; stored under "Embedding".
        y_true: Ground-truth binary labels.
        y_pred: Predicted binary labels, aligned with ``y_true``.
        results: List of metric dicts, updated in place. An earlier row with
            the same ``name`` is replaced, so re-running an experiment cell
            does not leave duplicate rows in the final comparison.
    """
    acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average="binary")
    cm = confusion_matrix(y_true, y_pred)
    logger.info(f"[{name}] Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")
    logger.info(f"[{name}] Confusion matrix:\n{cm}")
    row = {"Embedding":name,"Accuracy":acc,"Precision":prec,"Recall":rec,"F1":f1}
    # Slice assignment keeps the caller's list object while dropping any stale
    # row recorded for this experiment by a previous run of the cell.
    results[:] = [r for r in results if r.get("Embedding") != name]
    results.append(row)

def tune_rf(X_train_emb, y_train, name, param_grid=None):
    """Grid-search a random forest on one feature representation.

    Args:
        X_train_emb: Training feature matrix passed to ``GridSearchCV.fit``.
        y_train: Binary training labels aligned with ``X_train_emb``.
        name: Label used as a prefix in the log messages.
        param_grid: Optional hyper-parameter grid. When omitted, the default
            grid defined below is searched.

    Returns:
        ``(best_estimator, best_params)`` from the 3-fold, F1-scored search.
    """
    rf = RandomForestClassifier(random_state=42, n_jobs=-1)
    if param_grid is None:
        # min_samples_split is pinned to a single value: every split has to
        # leave min_samples_leaf (>= 5) samples in each child, so a node needs
        # at least 10 samples to be split and thresholds of 2 or 5 can never
        # bind. Searching both only fitted every forest twice.
        param_grid = {
            "n_estimators":[50],
            "max_depth":[10,20],
            "min_samples_split":[2],
            "min_samples_leaf":[5,10]
        }
    logger.info(f"[{name}] Starting GridSearchCV...")
    grid = GridSearchCV(rf, param_grid, scoring="f1", cv=3, n_jobs=-1, verbose=2)
    # perf_counter() is monotonic, so the duration cannot be skewed by
    # wall-clock adjustments during a long search.
    start = time.perf_counter()
    grid.fit(X_train_emb, y_train)
    elapsed = time.perf_counter() - start
    logger.info(f"[{name}] Best params: {grid.best_params_}, Best CV F1={grid.best_score_:.4f}, Duration={elapsed:.2f}s")
    return grid.best_estimator_, grid.best_params_






Loaded: train=25000, test=25000


In [None]:
# Term frequency (TF): raw token counts, L1-normalised per document row.
results = []
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)
X_test_counts = count_vec.transform(X_test)
X_train_tf, X_test_tf = (
    normalize(counts, norm="l1") for counts in (X_train_counts, X_test_counts)
)

# Tune on the training features, then score the best forest on the test set.
rf_tf, params_tf = tune_rf(X_train_tf, y_train, "TF")
evaluate(name="TF", y_true=y_test, y_pred=rf_tf.predict(X_test_tf), results=results)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
# IDF-only features: binary term presence re-weighted by inverse document
# frequency, without row normalisation (norm=None).
# NOTE(review): max_features=10000 is applied only in this experiment, so the
# vocabulary here differs from the other bag-of-words runs -- confirm intended.
count_vec_bin = CountVectorizer(binary=True, max_features=10000)
X_train_bin = count_vec_bin.fit_transform(X_train)
X_test_bin = count_vec_bin.transform(X_test)
idf = TfidfTransformer(use_idf=True, norm=None).fit(X_train_bin)
X_train_idf, X_test_idf = idf.transform(X_train_bin), idf.transform(X_test_bin)

# Tune on the training features, then score the best forest on the test set.
rf_idf, params_idf = tune_rf(X_train_idf, y_train, "IDF")
evaluate(name="IDF", y_true=y_test, y_pred=rf_idf.predict(X_test_idf), results=results)


Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
# TF-IDF over unigrams only (ngram_range=(1, 1)); the vectorizer is fitted on
# the training reviews and re-used unchanged for the test reviews.
tfidf_uni = TfidfVectorizer(ngram_range=(1,1))
X_train_uni = tfidf_uni.fit_transform(X_train)
X_test_uni = tfidf_uni.transform(X_test)

# Tune on the training features, then score the best forest on the test set.
rf_uni, params_uni = tune_rf(X_train_uni, y_train, "TF-IDF unigram")
evaluate("TF-IDF unigram", y_test, rf_uni.predict(X_test_uni), results)


Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
# Whitespace-tokenise the cleaned reviews.
train_tokens = list(map(str.split, X_train))
test_tokens = list(map(str.split, X_test))

# Train CBOW (sg=0) word vectors on the training reviews only.
# NOTE(review): no `workers`/`seed` is passed, so training may not be exactly
# repeatable between runs -- confirm against the gensim documentation.
w2v = Word2Vec(
    sentences=train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    sg=0,
    epochs=5,
)
def avg_vec(tokens, model):
    """Return the mean vector of the tokens known to ``model.wv``.

    Tokens missing from the vocabulary are skipped. If none of the tokens is
    known (or ``tokens`` is empty), a zero vector of length
    ``model.wv.vector_size`` is returned.
    """
    keyed = model.wv
    known = [keyed[tok] for tok in tokens if tok in keyed]
    if not known:
        return np.zeros(keyed.vector_size)
    return np.mean(known, axis=0)

# One averaged word vector per review, stacked into a 2-D feature matrix.
X_train_w2v = np.vstack([avg_vec(doc, w2v) for doc in train_tokens])
X_test_w2v = np.vstack([avg_vec(doc, w2v) for doc in test_tokens])

# Tune on the training features, then score the best forest on the test set.
rf_w2v, params_w2v = tune_rf(X_train_w2v, y_train, "Word2Vec CBOW")
evaluate(name="Word2Vec CBOW", y_true=y_test, y_pred=rf_w2v.predict(X_test_w2v), results=results)


Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
# Sentence embeddings from the pretrained "all-MiniLM-L6-v2" checkpoint, used
# as fixed features (the encoder is not fine-tuned here).
# NOTE(review): the encoder receives the cleaned text (lower-cased, punctuation
# removed by clean_text), not the raw reviews -- confirm this is intended.
# NOTE(review): long reviews are presumably truncated to the model's maximum
# sequence length -- verify against the model card.
st_model = SentenceTransformer("all-MiniLM-L6-v2")
X_train_st = st_model.encode(X_train, batch_size=64, convert_to_numpy=True)
X_test_st = st_model.encode(X_test, batch_size=64, convert_to_numpy=True)

# Tune on the training features, then score the best forest on the test set.
rf_st, params_st = tune_rf(X_train_st, y_train, "BERT embeddings")
evaluate("BERT embeddings", y_test, rf_st.predict(X_test_st), results)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Fitting 3 folds for each of 8 candidates, totalling 24 fits


In [None]:
# Collect the per-experiment metrics into one table, best F1 first.
df = pd.DataFrame(results)
df = df.sort_values(by="F1", ascending=False)
print("\n=== Final Comparison ===", df.to_string(index=False), sep="\n")



=== Final Comparison ===
      Embedding  Accuracy  Precision  Recall       F1
            IDF   0.83280   0.810633 0.86848 0.838560
 TF-IDF unigram   0.82152   0.807263 0.84472 0.825567
             TF   0.82040   0.807793 0.84088 0.824004
BERT embeddings   0.76612   0.760473 0.77696 0.768628
  Word2Vec CBOW   0.75896   0.757395 0.76200 0.759691
