In [None]:
from torchtext import data
import torch

In [None]:
TEXT = data.Field(include_lengths=True)
LABEL = data.LabelField(dtype=torch.float)

In [None]:
from torchtext.datasets import IMDB
import os

train_path = "imdb_train.pt"
test_path  = "imdb_test.pt"

if os.path.exists(train_path) and os.path.exists(test_path):
    train_dump = torch.load(train_path, weights_only=False)
    test_dump  = torch.load(test_path,  weights_only=False)
else:
    train_data, test_data = IMDB.splits(TEXT, LABEL)
    train_dump = [(ex.text, ex.label) for ex in train_data.examples]
    test_dump  = [(ex.text, ex.label) for ex in test_data.examples]

    torch.save(train_dump, train_path)
    torch.save(test_dump,  test_path)
    print("saved dumps")

X_train = [t for (t, y) in train_dump]
y_train = [y for (t, y) in train_dump]
X_test  = [t for (t, y) in test_dump]
y_test  = [y for (t, y) in test_dump]


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

y_val_bin = [1 if y == 'pos' else 0 for y in y_val]
print(f"\nValidation set class distribution:")
print(pd.Series(y_val_bin).value_counts())

Training set size: 20000
Validation set size: 5000
Test set size: 25000

Validation set class distribution:
1    2500
0    2500
Name: count, dtype: int64


In [None]:
# Count how many documents contain <br /> tags
def count_br_tags(documents):
    """Count how many documents contain <br /> tags."""
    count = 0
    for doc in documents:
        # Convert to string if it's a list of tokens
        if isinstance(doc, list):
            doc_str = ' '.join(doc)
        else:
            doc_str = str(doc)
        if '<br />' in doc_str or '<br/>' in doc_str or '<br>' in doc_str.lower():
            count += 1
    return count

# Check all datasets
train_br_count = count_br_tags(X_train)
val_br_count = count_br_tags(X_val)
test_br_count = count_br_tags(X_test)
total_br_count = train_br_count + val_br_count + test_br_count

print(f"Documents with <br /> tags:")
print(f"  Training set: {train_br_count} / {len(X_train)} ({100*train_br_count/len(X_train):.2f}%)")
print(f"  Validation set: {val_br_count} / {len(X_val)} ({100*val_br_count/len(X_val):.2f}%)")
print(f"  Test set: {test_br_count} / {len(X_test)} ({100*test_br_count/len(X_test):.2f}%)")
print(f"  Total: {total_br_count} / {len(X_train) + len(X_val) + len(X_test)} ({100*total_br_count/(len(X_train) + len(X_val) + len(X_test)):.2f}%)")


In [5]:
import nltk

stopwords = nltk.corpus.stopwords.words('english')

In [None]:
import re
import string

TOKEN_PATTERN = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")

def custom_standardization(input_data):
    lowercase = input_data.lower()
    stripped_html = re.sub('<br />', ' ', lowercase)
    return re.sub('[%s]' % re.escape(string.punctuation), '', stripped_html)

def tokenize(text: str):
    text = text.lower()
    tokens = TOKEN_PATTERN.findall(text)
    return [t for t in tokens if t not in stopwords]


In [7]:
import pandas as pd

y_train_bin = [1 if y == 'pos' else 0 for y in y_train]
pd.Series(y_train_bin).value_counts()

1    10000
0    10000
Name: count, dtype: int64

In [None]:
def normalize_tokens(tokens):
    norm = set()
    for token in tokens:
        # Apply custom standardization
        token = custom_standardization(token)
        if token not in stopwords and token.isalpha():
            norm.add(token)
    return norm

In [None]:
X_train_norm = [normalize_tokens(doc) for doc in X_train]
X_val_norm   = [normalize_tokens(doc) for doc in X_val]
X_test_norm  = [normalize_tokens(doc) for doc in X_test]

In [10]:
from collections import Counter

def compute_document_frequency(X_tokens):
    df = Counter()
    docs_norm = []
    for tokens in X_tokens:
        norm_tokens = normalize_tokens(tokens)
        docs_norm.append(norm_tokens)
        df.update(set(norm_tokens))
    return df, docs_norm

df, docs_norm = compute_document_frequency(X_train_norm)
print("Number of docs:", len(docs_norm))
print("Unique tokens after normalization:", len(df))


Number of docs: 20000
Unique tokens after normalization: 55953


In [11]:
def select_candidate_words(df, n, k):
    items = sorted(df.items(), key = lambda x: x[1])
    rare = {w for w,_ in items[:k]}
    common = {w for w,_ in items[-n:]}
    to_drop = rare | common
    candidates = {w for w,_ in items if w not in to_drop}
    return candidates

n = 100
k = 100

candidate_words = select_candidate_words(df, n, k)
print("Candidate words:", len(candidate_words))

Candidate words: 55753


In [12]:
import numpy as np

def entropy(p1):
    if p1 <= 0 or p1>= 1:
        return 0.0
    p0 = 1.0 - p1
    return -(p1 * np.log2(p1) + p0 * np.log2(p0))

In [13]:
from collections import defaultdict

inv = defaultdict(list)
for i, doc in enumerate(docs_norm):
    for w in set(doc):
        if w in candidate_words:
            inv[w].append(i)
print("Inv words:", len(inv))


Inv words: 55753


In [14]:
def information_gain(word, docs_norm, y):
    y = np.asanyarray(y)
    N = len(docs_norm)
    assert N == len(y)

    N1 = np.sum(y == 1)
    N0 = np.sum(y == 0)

    if N1 == 0 or N0 == 0:
        return 0.0

    doc_ids = inv.get(word, [])
    if not doc_ids:
        return 0.0
    doc_ids = np.asarray(doc_ids, dtype=np.int32)


    N11 = np.sum(y[doc_ids] == 1)
    N10 = np.sum(y[doc_ids] == 0)

    N01 = N1 -N11
    N00 = N0 - N10
    pY1 = N1 / N
    H_Y = entropy(pY1)
    pX1 = (N11 + N10) / N
    pX0 = 1.0 - pX1

    def cond_entropy(n_pos, n_neg):

        total = n_pos + n_neg
        if total == 0:
            return 0.0
        p1 = n_pos / total
        return entropy(p1)
    H_Y_given_X1 = cond_entropy(N11, N10)
    H_Y_given_X0 = cond_entropy(N01, N00)
    H_Y_given_X = pX1 * H_Y_given_X1 + pX0 * H_Y_given_X0
    IG = H_Y - H_Y_given_X
    return IG

In [15]:
def select_top_m_by_ig(candidate_words, docs_norm, y, m_features):
    ig_scores = []
    y = np.asarray(y)

    for i, w in enumerate(candidate_words):
        ig = information_gain(w, docs_norm, y)
        ig_scores.append((w, ig))


    ig_scores.sort(key=lambda x: x[1], reverse=True)

    top = ig_scores[:m_features]
    vocab = [w for w, _ in top]
    return vocab, ig_scores


In [16]:
m = 5000

vocab, ig_scores = select_top_m_by_ig(candidate_words, docs_norm, y_train_bin, m)
print("Final vocabulary size:", len(vocab))

Final vocabulary size: 5000


In [17]:
vocab = list(vocab)
word2idx = {w: i for i,w in enumerate(vocab)}
V = len(vocab)
V

5000

In [18]:
def vectorize_document(tokens, word2idx, V):
    vec = np.zeros(V, dtype = np.int8)
    for t in tokens:
        if t in word2idx:
            j = word2idx[t]
            vec[j] = 1
    return vec


In [19]:
def vectorize_corpus(list_of_token_lists, word2idx):
    V = len(word2idx)
    X_bin = np.zeros((len(list_of_token_lists), V), dtype=np.int8)
    for i, tokens in enumerate(list_of_token_lists):
        X_bin[i] = vectorize_document(tokens, word2idx, V)
    return X_bin

In [20]:
X_train_bin = vectorize_corpus(X_train_norm, word2idx)
X_val_bin   = vectorize_corpus(X_val_norm, word2idx)
X_test_bin  = vectorize_corpus(X_test_norm, word2idx)

In [21]:
y_test_bin = [1 if label == 'pos' else 0 for label in y_test]
pd.Series(y_test_bin).value_counts()

1    12500
0    12500
Name: count, dtype: int64

In [22]:
from scipy.sparse import csr_matrix

X_train_bin_sp = csr_matrix(X_train_bin)
X_val_bin_sp    = csr_matrix(X_val_bin)
X_test_bin_sp   = csr_matrix(X_test_bin)

In [23]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

base_stump =  DecisionTreeClassifier(max_depth=1, random_state=42)

ada = AdaBoostClassifier(
    estimator=base_stump,
    random_state=42
)

In [24]:
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from scipy.sparse import vstack



param_grid = {
    "n_estimators": [100, 200, 400],
    "learning_rate": [0.005, 0.01, 0.05],
}

X_gs = vstack([X_train_bin_sp, X_val_bin_sp])
y_gs = np.hstack([y_train_bin, y_val_bin])

n_train = X_train_bin_sp.shape[0]
n_val   = X_val_bin_sp.shape[0]

test_fold = np.concatenate([
    -1 * np.ones(n_train, dtype=int),
     np.zeros(n_val, dtype=int)
])

ps = PredefinedSplit(test_fold)

gs_ada = GridSearchCV(
    ada,
    param_grid,
    cv=ps,
    scoring="f1",
    n_jobs=-1
)

gs_ada.fit(X_gs, y_gs)


In [25]:
best_ada = gs_ada.best_estimator_
print(gs_ada.best_params_)

{'learning_rate': 0.05, 'n_estimators': 200}


In [26]:
from sklearn.metrics import accuracy_score, classification_report

pred_ada = best_ada.predict(X_test_bin_sp)
accuracy_ada = accuracy_score(y_test_bin, pred_ada)

print(classification_report(y_test_bin, pred_ada, target_names=["neg", "pos"]))

              precision    recall  f1-score   support

         neg       0.93      0.22      0.35     12500
         pos       0.56      0.98      0.71     12500

    accuracy                           0.60     25000
   macro avg       0.74      0.60      0.53     25000
weighted avg       0.74      0.60      0.53     25000



In [27]:
y_pred = best_ada.predict(X_test_bin_sp)

In [28]:
y_pred
pd.Series(y_pred).value_counts()

1    22085
0     2915
Name: count, dtype: int64

In [29]:
print("type(y_pred):", type(y_pred))
print("len(y_pred):", len(y_pred))

type(y_pred): <class 'numpy.ndarray'>
len(y_pred): 25000


In [30]:
from sklearn.metrics import classification_report


print(classification_report(y_test_bin, y_pred, target_names=["neg","pos"]))

              precision    recall  f1-score   support

         neg       0.93      0.22      0.35     12500
         pos       0.56      0.98      0.71     12500

    accuracy                           0.60     25000
   macro avg       0.74      0.60      0.53     25000
weighted avg       0.74      0.60      0.53     25000



In [31]:
y_pred_scores = best_ada.predict_proba(X_test_bin_sp)

In [32]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    criterion="entropy",
    max_features="sqrt",
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    n_jobs=-1,
    random_state=42
)

In [None]:


param_grid = {
    "n_estimators": [100, 200, 400],
    "max_depth": [10, 20, None],
}

gs = GridSearchCV(
    rf,
    param_grid,
    cv=ps,
    scoring="f1",
    n_jobs=-1
)

gs.fit(X_gs, y_gs)

In [None]:
best_rf = gs.best_estimator_
print("Best params:", gs.best_params_)
print("Best val score:", gs.best_score_)

Best params: {'max_depth': None, 'n_estimators': 200}
Best val score: 0.8210611256808553


In [None]:
y_pred = best_rf.predict(X_test_bin_sp)
print("pred distribution:", np.unique(y_pred, return_counts=True))
print(classification_report(y_test_bin, y_pred, target_names=["neg","pos"]))

pred distribution: (array([0, 1]), array([12848, 12152]))
              precision    recall  f1-score   support

         neg       0.81      0.83      0.82     12500
         pos       0.83      0.81      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000

