In [None]:
# Download the aclImdb dataset
!wget -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz

In [None]:
!pip install gensim sentence_transformers

In [None]:
import os, re, string, time, logging
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import normalize
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer

# Logging setup: root logger at INFO, records rendered as "time | level | message".
LOG_FORMAT = "%(asctime)s | %(levelname)s | %(message)s"
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
# Root of the extracted aclImdb dataset. Defaults to the Colab location; set the
# ACLIMDB_DIR environment variable to point somewhere else without editing
# the notebook.
DATA_DIR = os.environ.get("ACLIMDB_DIR", "/content/aclImdb")

def read_reviews(base_dir):
    """Load the train and test review splits stored under ``base_dir``.

    The directory is expected to contain ``train/pos``, ``train/neg``,
    ``test/pos`` and ``test/neg`` sub-folders holding one UTF-8 text file per
    review. Entries that are not regular files are ignored.

    Args:
        base_dir: Path of the dataset root directory.

    Returns:
        A 4-tuple ``(train_texts, train_labels, test_texts, test_labels)``.
        In each split the ``pos`` reviews (label 1) come first, followed by the
        ``neg`` reviews (label 0); within a folder, reviews are ordered by
        file name.

    Raises:
        FileNotFoundError: If one of the four expected folders does not exist.
    """
    def read_folder(path, label):
        """Return the contents of every file in ``path`` plus matching labels."""
        texts = []
        # os.listdir returns entries in arbitrary, filesystem-dependent order.
        # Sorting keeps the sample order (and everything downstream that
        # depends on it, such as unshuffled CV folds) identical across runs.
        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if os.path.isfile(fpath):
                with open(fpath, "r", encoding="utf-8") as f:
                    texts.append(f.read())
        return texts, [label] * len(texts)

    train_pos, y_train_pos = read_folder(os.path.join(base_dir, "train", "pos"), 1)
    train_neg, y_train_neg = read_folder(os.path.join(base_dir, "train", "neg"), 0)
    test_pos, y_test_pos = read_folder(os.path.join(base_dir, "test", "pos"), 1)
    test_neg, y_test_neg = read_folder(os.path.join(base_dir, "test", "neg"), 0)

    return train_pos+train_neg, y_train_pos+y_train_neg, test_pos+test_neg, y_test_pos+y_test_neg

# Read all four splits from disk in one call, then report the split sizes.
_loaded = read_reviews(DATA_DIR)
X_train_raw, y_train = _loaded[0], _loaded[1]
X_test_raw, y_test = _loaded[2], _loaded[3]
print(f"Loaded train={len(X_train_raw)}, test={len(X_test_raw)}")


In [6]:
# Shortest run of characters between "<" and ">" (never spans a newline,
# because "." does not match one here).
HTML_RE = re.compile(r"<.*?>")
# Translation table mapping every character of string.punctuation to None,
# i.e. str.translate deletes them.
PUNCT_TABLE = dict.fromkeys(map(ord, string.punctuation))

def clean_text(text):
    """Normalise one review.

    Tag-like ``<...>`` spans are replaced by a single space, the text is
    lower-cased, and ASCII punctuation is deleted (not replaced by a space).

    Args:
        text: Raw review string.

    Returns:
        The cleaned string.
    """
    without_tags = HTML_RE.sub(" ", text)
    return without_tags.lower().translate(PUNCT_TABLE)

# Run the same cleaning over both splits; the *_raw lists are left untouched.
X_train = list(map(clean_text, X_train_raw))
X_test = list(map(clean_text, X_test_raw))


In [14]:
def evaluate_and_log(name, y_true, y_pred, results):
    """Print classification metrics for one run and record them.

    Args:
        name: Label of the feature representation being evaluated.
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        results: List that receives one dict per call with the keys
            ``Embedding``, ``Accuracy``, ``Precision``, ``Recall`` and ``F1``.
            It is mutated in place.

    Returns:
        None.
    """
    acc = accuracy_score(y_true, y_pred)
    # The fourth value returned by the helper is discarded.
    prec, rec, f1 = precision_recall_fscore_support(
        y_true, y_pred, average="binary"
    )[:3]
    cm = confusion_matrix(y_true, y_pred)

    record = {
        "Embedding": name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1": f1,
    }

    print(f"\n[{name}]")
    print(f"Accuracy={acc:.4f}, Precision={prec:.4f}, Recall={rec:.4f}, F1={f1:.4f}")
    print("Confusion matrix:\n", cm)
    results.append(record)

def tune_adaboost(X_train_emb, y_train, name, param_grid=None, cv=3):
    """Grid-search an AdaBoost classifier built on decision-tree base learners.

    Args:
        X_train_emb: Training feature matrix, passed unchanged to
            ``GridSearchCV.fit``.
        y_train: Training labels for the rows of ``X_train_emb``.
        name: Label used in the progress messages.
        param_grid: Optional ``GridSearchCV`` parameter grid. Keys prefixed
            with ``estimator__`` address the base decision tree. When omitted,
            a single-point grid is used (50 estimators, learning rate 0.5,
            depth-1 trees).
        cv: Forwarded to ``GridSearchCV``'s ``cv`` argument (default 3).

    Returns:
        Tuple ``(best_estimator, best_params)`` taken from the fitted search.
    """
    if param_grid is None:
        param_grid = {
            "n_estimators": [50],          # kept small for speed
            "learning_rate": [0.5],
            "estimator__max_depth": [1],
        }

    base_tree = DecisionTreeClassifier(random_state=42)

    ada = AdaBoostClassifier(
        # Most important part: the tree has to be passed as `estimator` so the
        # "estimator__*" keys of the grid can reach its parameters.
        estimator=base_tree,
        random_state=42
    )

    print(f"\n[{name}] Starting GridSearchCV...")
    grid = GridSearchCV(
        ada,
        param_grid,
        scoring="f1",
        cv=cv,
        n_jobs=1,        # single process: stable on Colab
        verbose=2
    )

    grid.fit(X_train_emb, y_train)

    print(f"[{name}] Best params: {grid.best_params_}, Best CV F1={grid.best_score_:.4f}")
    return grid.best_estimator_, grid.best_params_


In [None]:
# Shared accumulator: each evaluate_and_log call appends one row to it.
results = []

# Term-frequency features: raw token counts, L1-normalised per review.
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)  # vocabulary comes from the training split
X_test_counts = count_vec.transform(X_test)
X_train_tf, X_test_tf = (
    normalize(counts, norm="l1") for counts in (X_train_counts, X_test_counts)
)

ada_tf, params_tf = tune_adaboost(X_train_tf, y_train, "TF")
tf_pred = ada_tf.predict(X_test_tf)
evaluate_and_log("TF", y_test, tf_pred, results)


In [None]:
# IDF-only features: binary term presence (vocabulary capped at 10000 terms)
# re-weighted by IDF, with no row normalisation.
count_vec_bin = CountVectorizer(binary=True, max_features=10000)
X_train_bin = count_vec_bin.fit_transform(X_train)
X_test_bin = count_vec_bin.transform(X_test)

# Learn the IDF weights on the training split, then apply them to both splits.
idf = TfidfTransformer(use_idf=True, norm=None)
idf.fit(X_train_bin)
X_train_idf, X_test_idf = (idf.transform(m) for m in (X_train_bin, X_test_bin))

ada_idf, params_idf = tune_adaboost(X_train_idf, y_train, "IDF-only")
idf_pred = ada_idf.predict(X_test_idf)
evaluate_and_log("IDF-only", y_test, idf_pred, results)


In [None]:
# TF-IDF features over single words only (ngram_range restricted to unigrams).
UNIGRAM_RANGE = (1, 1)
tfidf_uni = TfidfVectorizer(ngram_range=UNIGRAM_RANGE)
X_train_uni = tfidf_uni.fit_transform(X_train)  # fitted on the training split
X_test_uni = tfidf_uni.transform(X_test)

ada_uni, params_uni = tune_adaboost(X_train_uni, y_train, "TF-IDF unigram")
uni_pred = ada_uni.predict(X_test_uni)
evaluate_and_log("TF-IDF unigram", y_test, uni_pred, results)


In [None]:
# Whitespace tokenisation of the already-cleaned reviews.
train_tokens = [t.split() for t in X_train]
test_tokens = [t.split() for t in X_test]

# CBOW (sg=0) word vectors learned from the training reviews only.
# seed + workers=1 make training repeatable, in line with the random_state=42
# used for the classifiers: with several worker threads the order of updates
# depends on OS thread scheduling, so the vectors (and the scores derived
# from them) change between runs.
# NOTE(review): the gensim docs say reproducibility across interpreter
# launches may additionally require PYTHONHASHSEED — confirm for the
# installed version.
w2v = Word2Vec(
    sentences=train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    sg=0,
    epochs=5,
    seed=42,
    workers=1,
)
def avg_vec(tokens, model):
    """Average the word vectors of the tokens that the model knows.

    Args:
        tokens: Iterable of token strings for one document.
        model: Object whose ``wv`` attribute supports ``in``, item lookup by
            token, and exposes ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``, or a zero vector of length ``model.wv.vector_size`` when
        none of the tokens is found.
    """
    keyed = model.wv
    known = [keyed[tok] for tok in tokens if tok in keyed]
    if not known:
        return np.zeros(keyed.vector_size)
    return np.mean(known, axis=0)

# One row per review: the mean word vector of its tokens.
X_train_w2v = np.vstack([avg_vec(doc, w2v) for doc in train_tokens])
X_test_w2v = np.vstack([avg_vec(doc, w2v) for doc in test_tokens])

ada_w2v, params_w2v = tune_adaboost(X_train_w2v, y_train, "Word2Vec CBOW")
w2v_pred = ada_w2v.predict(X_test_w2v)
evaluate_and_log("Word2Vec CBOW", y_test, w2v_pred, results)


In [None]:
# Sentence-level embeddings from the "all-MiniLM-L6-v2" checkpoint; the run is
# reported under the label "BERT embeddings". Both splits are encoded with
# identical settings.
st_model = SentenceTransformer("all-MiniLM-L6-v2")
encode_options = dict(batch_size=32, convert_to_numpy=True)
X_train_st = st_model.encode(X_train, **encode_options)
X_test_st = st_model.encode(X_test, **encode_options)

ada_st, params_st = tune_adaboost(X_train_st, y_train, "BERT embeddings")
st_pred = ada_st.predict(X_test_st)
evaluate_and_log("BERT embeddings", y_test, st_pred, results)


In [21]:
# Collect every recorded run into one table, best F1 first, and print the
# headline columns.
df = pd.DataFrame(results)
df = df.sort_values(by="F1", ascending=False)
summary_columns = ["Embedding", "Accuracy", "F1"]
print("\n=== Final Comparison ===")
print(df[summary_columns].to_string(index=False))



=== Final Comparison ===
      Embedding  Accuracy       F1
BERT embeddings   0.75688 0.754801
             TF   0.71440 0.749086
 TF-IDF unigram   0.70456 0.748827
       IDF-only   0.71124 0.734781
  Word2Vec CBOW   0.73676 0.726419
