## Data Loading

In [2]:
import pandas as pd

# Folder that holds the IMDB CSV splits.
data_pth = "./data/imdb"

# Read the three splits in one pass; every file is named after its split.
train, val, test = (
    pd.read_csv(f"{data_pth}/{split_name}.csv")
    for split_name in ("train", "val", "test")
)

# Preview the first rows of the training split.
train.head(n=5)

Unnamed: 0,text,label
0,Everyone involved (and the audience) should se...,0
1,The Williams family live on a ranch located in...,1
2,This movie surprised me in a good way. From th...,1
3,Forget Neo and Bourne and all those half-baked...,1
4,I figured that any horror film with Orson Well...,0


### Cleaning

In [3]:
import os

from preprocess import clean_text


def _ensure_text_cleaned(split_df, split_name, display_name):
    """Return the split with a ``text_cleaned`` column, reusing the CSV cache.

    The cleaned frame is cached as ``{data_pth}/{split_name}_cleaned.csv``.
    Previously that file was written but never read back, so the cleaning was
    recomputed on every run and the "Loaded ..." branch could not be reached
    with frames freshly loaded from the raw CSVs.

    Args:
        split_df: DataFrame of one split; must contain a ``text`` column.
        split_name: File stem of the split ("train", "test" or "val").
        display_name: Capitalised split name used in the status message.

    Returns:
        A DataFrame that contains a ``text_cleaned`` column: ``split_df``
        itself (already cleaned, or cleaned here in place), or the cached
        frame read from disk.
    """
    if "text_cleaned" in split_df.columns:
        print(f"Loaded {display_name} Cleaned Dataset")
        return split_df

    cleaned_path = f"{data_pth}/{split_name}_cleaned.csv"
    if os.path.exists(cleaned_path):
        cached = pd.read_csv(cleaned_path)
        # Only trust the cache when it has the column and matches the split
        # row-for-row; otherwise fall through and rebuild it.
        if "text_cleaned" in cached.columns and len(cached) == len(split_df):
            # An empty cleaned text is stored as an empty CSV field, which
            # read_csv parses as NaN; restore it to an empty string.
            cached["text_cleaned"] = cached["text_cleaned"].fillna("")
            print(f"Loaded {display_name} Cleaned Dataset")
            return cached

    # No usable cache: clean every review and persist the result.
    split_df["text_cleaned"] = split_df["text"].map(clean_text)
    split_df.to_csv(cleaned_path, index=False)
    return split_df


train = _ensure_text_cleaned(train, "train", "Train")
test = _ensure_text_cleaned(test, "test", "Test")
val = _ensure_text_cleaned(val, "val", "Val")

In [3]:
# Show the first review before and after cleaning. Columns are selected by
# name rather than by position so the "Raw"/"Cleaned" labels stay correct even
# if the column order of the frame changes (e.g. an extra index column).
print("Raw: ", train["text"].iloc[0])
print("Cleaned: ", train["text_cleaned"].iloc[0])

Raw:  Everyone involved (and the audience) should seek out "The Candidate" to see how good this movie could have been. What happened the South American story? What were Julie Christie and Kate Capshaw thinking to allow their roles to be cardboard cut-outs. Up to now I have liked every Gene Hackman performance and/or movie. He was either disinterested (which I can hardly believe) or dreadfully miscast. I have also liked and defended Richard Gere (and been vilified for it). But here he had no "power". He was never intimidating and only occasionally persuasive. All in all I was very disappointed. I really expected much more from this director and cast. If you can't find "The Candidate" watch "Wag the dog" again or even "Bulworth".
Cleaned:  everyone involved and the audience should seek out the candidate to see how good this movie could have been what happened the south american story ? what were julie christie and kate capshaw thinking to allow their roles to be cardboard cutouts up to n

In [4]:
# Cleaned review texts of each split as plain Python lists.
train_texts = train["text_cleaned"].tolist()
test_texts = test["text_cleaned"].tolist()
val_texts = val["text_cleaned"].tolist()

# Matching labels of each split, in the same row order as the texts.
train_labels = train["label"].tolist()
test_labels = test["label"].tolist()
val_labels = val["label"].tolist()

# Bundle the splits by name for the feature helpers used further down.
splitted_texts = {"train": train_texts, "test": test_texts, "val": val_texts}
splitted_labels = {"train": train_labels, "test": test_labels, "val": val_labels}

## Model Variation

### Hand-crafting Features

In [5]:
from features import craft_features, vectorize_labels, FEAT_ARG

# Settings handed to FEAT_ARG for the n-gram TF-IDF features.
NGRAM_RANGE = (1, 3)
MAX_TFIDF_FEATS = 2000
MIN_DF = 3      # presumably a document-frequency lower bound -- confirm in features.py
MAX_DF = 0.7    # presumably a document-frequency upper bound -- confirm in features.py

# FEAT_ARG is filled positionally: n-gram range, min df, max df, max features.
args = FEAT_ARG(
    NGRAM_RANGE,
    MIN_DF,
    MAX_DF,
    MAX_TFIDF_FEATS,
)

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


In [6]:
FEATURESET = "tfidf"

# Build the feature matrices for the three splits, then the label vectors.
X_train, X_val, X_test = craft_features(
    featset=FEATURESET,
    text_splits=splitted_texts,
    feat_args=args,
)
y_train, y_val, y_test = vectorize_labels(splitted_labels)

# Report the shape of every feature matrix.
print(
    "Features:  Train {} , Val {} , Test {}".format(
        *(matrix.shape for matrix in (X_train, X_val, X_test))
    )
)

Fitting ngram tfidf over training set...
Fitted! Saving into ./models
Features:  Train (20000, 2000) , Val (5000, 2000) , Test (25000, 2000)


### Modeling (Logistic Regression)

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [9]:
# Fixed seed for the solver: liblinear shuffles the data internally, so without
# a random_state repeated runs of this grid are not guaranteed to print the
# same metrics.
LR_SEED = 42

# Grid over the penalty type and the regularisation strength lambda.
for p in ["l1", "l2"]:
    for lambda_ in [0.1, 1.0, 5.0]:
        print(f"Model: {p} {lambda_}")
        # scikit-learn takes the inverse regularisation strength, C = 1 / lambda.
        lr = LogisticRegression(
            C=1/lambda_,
            penalty=p,
            solver="liblinear",
            max_iter=5000,
            random_state=LR_SEED,
        )
        lr.fit(X_train, y_train)

        train_pred, val_pred, test_pred = (
            lr.predict(features) for features in (X_train, X_val, X_test)
        )

        # NOTE(review): test metrics are printed for every configuration; choose
        # the configuration on the validation split, not on these test numbers.
        print("Trainset", classification_report(y_train, train_pred), sep="\n")
        print("Valset", classification_report(y_val, val_pred), sep="\n")
        print("Testset", classification_report(y_test, test_pred), sep="\n")

Model: l1 0.1
Trainset
              precision    recall  f1-score   support

           0       0.91      0.90      0.91     10000
           1       0.90      0.91      0.91     10000

    accuracy                           0.91     20000
   macro avg       0.91      0.91      0.91     20000
weighted avg       0.91      0.91      0.91     20000

Valset
              precision    recall  f1-score   support

           0       0.85      0.84      0.85      2500
           1       0.84      0.85      0.85      2500

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000

Testset
              precision    recall  f1-score   support

           0       0.87      0.85      0.86     12500
           1       0.86      0.87      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86   

In [9]:
# from sklearn.ensemble import GradientBoostingClassifier

# clf = GradientBoostingClassifier(n_estimators=100, 
#                                  learning_rate=1.,
#                                  max_depth=3, 
#                                  random_state=42,
#                                  verbose=1)
# clf.fit(X_train, y_train)
# train_pred = clf.predict(X_train)
# val_pred = clf.predict(X_val)
# test_pred = clf.predict(X_test)

# print("Trainset")
# print(classification_report(y_train, train_pred))
# print("Valset")
# print(classification_report(y_val, val_pred))
# print("Testset")
# print(classification_report(y_test, test_pred))

# import xgboost as xgb

# clf = xgb.XGBClassifier(n_estimators=1000,
#                         learning_rate=0.1,
#                         max_depth=3,
#                         random_state=42,
#                         verbosity=1,
#                         use_label_encoder=False,
#                         eval_metric='logloss')

# clf.fit(X_train, y_train)
# train_pred = clf.predict(X_train)
# val_pred = clf.predict(X_val)
# test_pred = clf.predict(X_test)

# print("Trainset")
# print(classification_report(y_train, train_pred))
# print("Valset")
# print(classification_report(y_val, val_pred))
# print("Testset")
# print(classification_report(y_test, test_pred))

## Feature Variation

In [7]:
# TF-IDF settings held fixed while the feature set and feature count vary.
NGRAM_RANGE = (1, 3)
MIN_DF = 3
MAX_DF = 0.7

In [8]:
# Compare feature sets and TF-IDF vocabulary sizes with one fixed classifier.
for FEATURESET in ("tfidf", "tfidf+lexicon"):
    for MAX_TFIDF_FEATS in (200, 2000, 5000):

        print(f"Feature set: {FEATURESET} N={MAX_TFIDF_FEATS}")

        # Rebuild the features and labels for this configuration.
        args = FEAT_ARG(NGRAM_RANGE, MIN_DF, MAX_DF, MAX_TFIDF_FEATS)
        X_train, X_val, X_test = craft_features(
            featset=FEATURESET,
            text_splits=splitted_texts,
            feat_args=args,
        )
        y_train, y_val, y_test = vectorize_labels(splitted_labels)
        print(
            "Features:  Train {} , Val {} , Test {}".format(
                *(matrix.shape for matrix in (X_train, X_val, X_test))
            )
        )

        # Same L2-regularised logistic regression for every configuration.
        lr = LogisticRegression(C=1., penalty="l2", solver="liblinear", max_iter=5000)
        lr.fit(X_train, y_train)
        train_pred, val_pred, test_pred = (
            lr.predict(features) for features in (X_train, X_val, X_test)
        )

        # Each report is printed under its split title, one per line.
        print("Trainset", classification_report(y_train, train_pred), sep="\n")
        print("Valset", classification_report(y_val, val_pred), sep="\n")
        print("Testset", classification_report(y_test, test_pred), sep="\n")

Feature set: tfidf N=200
Load a pre-trained vectorizer: tfidf_vectorizer_ngram(1, 3)_max_200_dfminmax_3_0.7.pickle
Features:  Train (20000, 200) , Val (5000, 200) , Test (25000, 200)
Trainset
              precision    recall  f1-score   support

           0       0.77      0.76      0.77     10000
           1       0.76      0.78      0.77     10000

    accuracy                           0.77     20000
   macro avg       0.77      0.77      0.77     20000
weighted avg       0.77      0.77      0.77     20000

Valset
              precision    recall  f1-score   support

           0       0.76      0.75      0.75      2500
           1       0.75      0.76      0.76      2500

    accuracy                           0.76      5000
   macro avg       0.76      0.76      0.76      5000
weighted avg       0.76      0.76      0.76      5000

Testset
              precision    recall  f1-score   support

           0       0.76      0.76      0.76     12500
           1       0.76      0