In [97]:
import os
import math
import re
import json

import random

import gensim
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer

# from sklearn.decomposition import PCA, KernelPCA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

import matplotlib.pyplot as plt

import numpy as np

In [98]:
%matplotlib inline
random.seed(1)
np.random.seed(1)

In [137]:
w2v_model_file = 'imdb_review_w2v.model' # change each time
d2vdm_model_file = 'imdb_review_d2vdm.model' # change each time
d2vdbow_model_file = 'imdb_review_d2vdbow.model' # change each time
train_csv = 'train_df.csv'
test_csv = 'test_df.csv'
df_csv = 'df.csv'
df_pkl = 'df.pkl'

neg_bound = 4
pos_bound = 7

train_size = 0.80

num_reviews = 10000

min_occ = 5 # The minimum number of occurrences for a word to be considered

In [138]:
def my_train_test_split(*args):
    return train_test_split(*args, train_size=train_size, random_state=1)

In [139]:
def get_w2v_vector(word):
    """Get the vector for a word"""
    try:
        return model.wv[word]
    except:
        print(word)
        raise
        
def filter_tokens(tokens, vocab=None):
    if vocab is None:
        vocab = w2v_vocab
    return [token for token in tokens if token in vocab]

In [140]:
en_stop = set(nltk.corpus.stopwords.words('english'))
stemmer = WordNetLemmatizer()

def tokenize(text):
    text = text.lower()
    # Remove non-word characters
    text = re.sub(r'[^a-z]', ' ', text)
    # Remove single letters
    text = re.sub(r'\b[a-z]{0,3}\b', ' ', text)
    # Merge multiple spaces
    text = re.sub(r'\s+', ' ', text)
    
    # Lemmatization
    tokens = text.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    tokens = [word for word in tokens if word not in en_stop]
    
    return tokens

# Run once

In [141]:
def load_train_or_test(dir):
    """
    Return the negative and positive train or test data
    """
    def load_neg_or_pos(sub):
        res = []
        for file_name in os.listdir(sub):
            with open(sub + file_name, encoding='utf8') as file:
                underscore_ind = file_name.index('_')
                period_ind = file_name.index('.')
                id = int(file_name[:underscore_ind])
                rating = int(file_name[underscore_ind + 1:period_ind])
                text = next(file)
                res.append([id, rating, text])
        return res
    # Only choose more polar ratings
    neg = [[id, rating, text] for id, rating, text in load_neg_or_pos(dir + '/neg/') if rating <= neg_bound]
    pos = [[id, rating, text] for id, rating, text in load_neg_or_pos(dir + '/pos/') if rating >= pos_bound]
    random.shuffle(neg)
    random.shuffle(pos)
    both = neg[:num_reviews // 2] + pos[:num_reviews // 2]
    random.shuffle(both)
    return pd.DataFrame(both, columns=['Id', 'Rating', 'Text'])

In [142]:
df = load_train_or_test('./train') #.append(load_train_or_test('./test'))

In [143]:
df

Unnamed: 0,Id,Rating,Text
0,10252,3,"""A scientist discovers signals from space that..."
1,1387,8,A bit slow (somehow like a Sofia Coppola movie...
2,4368,2,This has been put out on the DVD market by Alp...
3,5485,10,One of the most timely and engrossing document...
4,9059,3,I'll just be vague about my potential spoiling...
...,...,...,...
9995,6706,1,I saw this move several years ago at the Centr...
9996,9580,3,Former brat pack actor and all round pretty bo...
9997,4522,1,The most misogynistic movie of all time? Not t...
9998,6740,2,Too Much of Something Borrowed Grade B-<br /><...


In [144]:
df['Tokens'] = df['Text'].apply(tokenize)
# Clean up the text too
df['Text'] = df['Tokens'].apply(" ".join)

In [145]:
# # Train and save model
# model = Word2Vec(sentences=train_df['Tokens'])
# model.save(w2v_model_file)

In [146]:
# w2v_vocab = set(model.wv.key_to_index.keys())
# w2v_vocab_ord = np.array(list(model.wv.key_to_index.keys()))

In [147]:
# # Keep only tokens that showed up the required number of times
# train_df['Tokens'] = train_df['Tokens'].apply(filter_tokens)

# test_df['Tokens'] = test_df['Text'].apply(lambda text: filter_tokens(tokenize(text)))
# # Process test text too
# test_df['Text'] = test_df['Tokens'].apply(" ".join)

In [148]:
# # The vectors corresponding to each reviews' words
# df['Vectors'] = df['Tokens'].apply(get_vector)

In [149]:
# Train Doc2Vec model
import collections
from gensim.models.callbacks import CallbackAny2Vec

tagged_docs = [TaggedDocument(words=tokens, tags=[id]) for id, tokens in zip(df['Id'], df['Tokens'])]
assert type(tagged_docs[0].words) == list
# print(len(tagged_docs), type(tagged_docs[0].words), tagged_docs[0])

class PrintLoss(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 0
    
    def on_epoch_begin(self, model):
        model.running_training_loss = 0.0
        print(f'Starting epoch {self.epoch}', end=' | ')
    
    def on_epoch_end(self, model):
        print(f'Finished epoch {self.epoch}, loss = {model.get_latest_training_loss()}')
        self.epoch += 1

def train_d2v(model):
    model.random.seed(1)
    model.build_vocab(tagged_docs)
    model.running_training_loss = 0.0
#     print(model.__dict__.keys())
    model.train(tagged_docs, total_examples=model.corpus_count, epochs=model.epochs, compute_loss=True, callbacks=[PrintLoss()])
#     for epoch in range(model.epochs):
#         print('Starting epoch', epoch, model.compute_loss)
#         model.train(tagged_docs, total_examples=model.corpus_count, epochs=2, compute_loss=True, callbacks=[])
#         print('Loss:', model.get_latest_training_loss())
#         break
    return model

d2vdm_model = train_d2v(Doc2Vec(dm=1, vector_size=5, min_count=min_occ, epochs=5, compute_loss=True))
d2vdm_model.save(d2vdm_model_file)
print('Trained d2vdm')

d2vdbow_model = train_d2v(Doc2Vec(dm=0, vector_size=50, min_count=min_occ, epochs=5, compute_loss=True))
d2vdbow_model.save(d2vdbow_model_file)
print('Trained d2vdbow')

Starting epoch 0 | Finished epoch 0, loss = 0.0
Starting epoch 1 | Finished epoch 1, loss = 0.0
Starting epoch 2 | Finished epoch 2, loss = 0.0
Starting epoch 3 | Finished epoch 3, loss = 0.0
Starting epoch 4 | Finished epoch 4, loss = 0.0
Trained d2vdm
Starting epoch 0 | Finished epoch 0, loss = 0.0
Starting epoch 1 | Finished epoch 1, loss = 0.0
Starting epoch 2 | Finished epoch 2, loss = 0.0
Starting epoch 3 | Finished epoch 3, loss = 0.0
Starting epoch 4 | Finished epoch 4, loss = 0.0
Trained d2vdbow


In [110]:
def test_d2v(model):
    ranks = []
    second_ranks = []
    for ind in range(len(tagged_docs)):
        inferred_vector = model.infer_vector(tagged_docs[ind].words)
        sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
        rank = [docid for docid, sim in sims].index(ind)
        ranks.append(rank)

        second_ranks.append(sims[1])

    counter = collections.Counter(ranks)
    print(len(counter))
    return counter

c1 = test_d2v(d2vdm_model)
c2 = test_d2v(d2vdbow_model)

In [150]:
# Save processed data
df.to_csv(df_csv)
df.to_pickle(df_pkl)

# Load stuff done already

In [151]:
# df = pd.read_csv(df_csv)
df = pd.read_pickle(df_pkl)

In [86]:
# Load pretrained Word2Vec model
# model = Word2Vec.load(w2v_model_file)

In [8]:
# Load pretrained Doc2Vec models
d2vdm_model = Doc2Vec.load(d2vdm_model_file)
d2vdbow_model = Doc2Vec.load(d2vdbow_model_file)

# Common stuff

In [9]:
# vocab = set(model.wv.key_to_index.keys())
# vocab_ord = np.array(list(model.wv.key_to_index.keys()))

In [11]:
y_bi = df['Rating'] > 5
y_train_bi, y_test_bi = my_train_test_split(y_bi)

y_bin2 = df['Rating'] // 2
y_train_bin2, y_test_bin2 = my_train_test_split(y_bin2)

# Classifiers

In [12]:
classifier_results = dict()

def score_classifier(classifier, X_test, y_test=y_test_bi, big_table=True):
    score = classifier.score(X_test, y_test)
    predicted = classifier.predict(X_test)
    cm = confusion_matrix(predicted, y_test)
    report = classification_report(predicted, y_test)
    
    classifier_results[str(classifier)] = (score, cm)
    
    print_results(score, cm, report, big_table)
    
def print_results(score, cm, class_report, big_table=True):
    print('Mean accuracy:', score)
    print(f"TP: {cm[0][0]}, FN: {cm[0][1]}\nFP: {cm[1][0]}, TN: {cm[1][1]}")
    if big_table:
        print(class_report)

In [22]:
def try_classifier(classifier, X_train, X_test, big_table=True):
    classifier.fit(X_train, y_train_bi)
    score_classifier(classifier, X_test, big_table=big_table)

def try_tfidf_classifier(classifier, big_table=True):
    classifier.fit(X_train_tfidf, y_train_bi)
    score_classifier(classifier, X_test_tfidf, big_table=big_table)

def try_d2v_classifier(classifier, big_table=True):
    classifier.fit(X_train_d2v, y_train_bi)
    score_classifier(classifier, X_test_d2v, big_table=big_table)

In [15]:
# Make different train-test splits for tf-idf
def make_tfidf(**kwargs):
    # en_stop because the default apparently has problems
    tfidf_vectorizer = TfidfVectorizer(stop_words=en_stop, min_df=min_occ, **kwargs)
    X_tfidf = tfidf_vectorizer.fit_transform(df['Text'])
    X_train_tfidf, X_test_tfidf = my_train_test_split(X_tfidf)
    
    print(X_train_tfidf.shape, len(tfidf_vectorizer.vocabulary_.keys()))
    
    return X_train_tfidf, X_test_tfidf

X_train_tfidf_1, X_test_tfidf_1 = make_tfidf(ngram_range=(1, 1))
X_train_tfidf_1_2, X_test_tfidf_1_2 = make_tfidf(ngram_range=(1, 2))
X_train_tfidf_2, X_test_tfidf_2 = make_tfidf(ngram_range=(2, 2))

# The "best" tf-idf model
X_train_tfidf = X_train_tfidf_1_2

(12800, 18596) 18596
(12800, 52817) 52817
(12800, 34221) 34221


In [16]:
# Make different train-test splits for Doc2Vec
def split_d2v(model):
    train_ids, test_ids = my_train_test_split(df['Id'])
    X_train = model.dv[train_ids]
    X_test = model.dv[test_ids]
    return X_train, X_test

X_train_d2vdm, X_test_d2vdm = split_d2v(d2vdm_model)
X_train_d2vdbow, X_test_d2vdbow = split_d2v(d2vdbow_model)

# Logistic Regression + Bag of Words

In [17]:
def try_bow(**kwargs):
    cnt_vectorizer = CountVectorizer(stop_words=en_stop, **kwargs) # en_stop because the default has problems
    X_bow = cnt_vectorizer.fit_transform(df['Text'])
    X_train_bow, X_test_bow = train_test_split(X_bow, train_size=train_size, random_state=1)

    # Scale data
    scaler_bow = StandardScaler(with_mean=False).fit(X_train_bow)
    X_train_bow_scaled = scaler_bow.transform(X_train_bow)
    X_test_bow_scaled = scaler_bow.transform(X_test_bow)
    print(X_train_bow_scaled.shape, len(cnt_vectorizer.vocabulary_.keys()))
    
    lr_bow = LogisticRegression()
    lr_bow.fit(X_train_bow_scaled, y_train_bi)
    
    score_classifier(lr_bow, X_test_bow_scaled)

In [18]:
try_bow(min_df=5, ngram_range=(1, 1)) # Just unigrams

(12800, 18596) 18596
Mean accuracy: 0.84375
TP: 1352, FN: 251
FP: 249, TN: 1348
              precision    recall  f1-score   support

       False       0.84      0.84      0.84      1603
        True       0.84      0.84      0.84      1597

    accuracy                           0.84      3200
   macro avg       0.84      0.84      0.84      3200
weighted avg       0.84      0.84      0.84      3200



In [19]:
try_bow(min_df=5, ngram_range=(1, 2)) # Unigrams and bigrams

(12800, 52817) 52817
Mean accuracy: 0.8575
TP: 1352, FN: 207
FP: 249, TN: 1392
              precision    recall  f1-score   support

       False       0.84      0.87      0.86      1559
        True       0.87      0.85      0.86      1641

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200



In [20]:
try_bow(min_df=5, ngram_range=(2, 2)) # Just bigrams

(12800, 34221) 34221
Mean accuracy: 0.7759375
TP: 1216, FN: 332
FP: 385, TN: 1267
              precision    recall  f1-score   support

       False       0.76      0.79      0.77      1548
        True       0.79      0.77      0.78      1652

    accuracy                           0.78      3200
   macro avg       0.78      0.78      0.78      3200
weighted avg       0.78      0.78      0.78      3200



In [21]:
try_bow(min_df=5, ngram_range=(1, 3)) # Unigrams, bigrams, and trigrams

(12800, 55215) 55215
Mean accuracy: 0.8571875
TP: 1352, FN: 208
FP: 249, TN: 1391
              precision    recall  f1-score   support

       False       0.84      0.87      0.86      1560
        True       0.87      0.85      0.86      1640

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200



# Logistic Regression + TFIDF

In [23]:
def try_tfidf(X_train_tfidf, X_test_tfidf):    
    lr_tfidf = LogisticRegression()
    lr_tfidf.fit(X_train_tfidf, y_train_bi)
    
    score_classifier(lr_tfidf, X_test_tfidf)

In [24]:
X_train_tfidf_1, X_test_tfidf_1 = make_tfidf(ngram_range=(1, 1))
X_train_tfidf_1_2, X_test_tfidf_1_2 = make_tfidf(ngram_range=(1, 2))
X_train_tfidf_2, X_test_tfidf_2 = make_tfidf(ngram_range=(2, 2))

(12800, 18596) 18596
(12800, 52817) 52817
(12800, 34221) 34221


In [25]:
try_tfidf(X_train_tfidf_1, X_test_tfidf_1)

Mean accuracy: 0.8778125
TP: 1385, FN: 175
FP: 216, TN: 1424
              precision    recall  f1-score   support

       False       0.87      0.89      0.88      1560
        True       0.89      0.87      0.88      1640

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [26]:
try_tfidf(X_train_tfidf_1_2, X_test_tfidf_1_2)

Mean accuracy: 0.88125
TP: 1385, FN: 164
FP: 216, TN: 1435
              precision    recall  f1-score   support

       False       0.87      0.89      0.88      1549
        True       0.90      0.87      0.88      1651

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [27]:
try_tfidf(X_train_tfidf_2, X_test_tfidf_2)

Mean accuracy: 0.8215625
TP: 1265, FN: 235
FP: 336, TN: 1364
              precision    recall  f1-score   support

       False       0.79      0.84      0.82      1500
        True       0.85      0.80      0.83      1700

    accuracy                           0.82      3200
   macro avg       0.82      0.82      0.82      3200
weighted avg       0.82      0.82      0.82      3200



In [37]:
# Choose the best tf-idf model
X_train_tfidf = X_train_tfidf_1_2
X_test_tfidf = X_test_tfidf_1_2

# Logistic Regression + Doc2Vec

In [29]:
score_classifier(LogisticRegression().fit(X_train_d2vdm, y_train_bi), X_test_d2vdm)

Mean accuracy: 0.5125
TP: 845, FN: 804
FP: 756, TN: 795
              precision    recall  f1-score   support

       False       0.53      0.51      0.52      1649
        True       0.50      0.51      0.50      1551

    accuracy                           0.51      3200
   macro avg       0.51      0.51      0.51      3200
weighted avg       0.51      0.51      0.51      3200



In [30]:
score_classifier(LogisticRegression().fit(X_train_d2vdbow, y_train_bi), X_test_d2vdbow)

Mean accuracy: 0.5203125
TP: 818, FN: 752
FP: 783, TN: 847
              precision    recall  f1-score   support

       False       0.51      0.52      0.52      1570
        True       0.53      0.52      0.52      1630

    accuracy                           0.52      3200
   macro avg       0.52      0.52      0.52      3200
weighted avg       0.52      0.52      0.52      3200



# Random forests + tf-idf

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
try_tfidf_classifier(RandomForestClassifier(max_depth=2, random_state=1))

Mean accuracy: 0.78625
TP: 1215, FN: 298
FP: 386, TN: 1301
              precision    recall  f1-score   support

       False       0.76      0.80      0.78      1513
        True       0.81      0.77      0.79      1687

    accuracy                           0.79      3200
   macro avg       0.79      0.79      0.79      3200
weighted avg       0.79      0.79      0.79      3200



In [39]:
try_tfidf_classifier(RandomForestClassifier(max_depth=3, random_state=1))

Mean accuracy: 0.795
TP: 1212, FN: 267
FP: 389, TN: 1332
              precision    recall  f1-score   support

       False       0.76      0.82      0.79      1479
        True       0.83      0.77      0.80      1721

    accuracy                           0.80      3200
   macro avg       0.80      0.80      0.79      3200
weighted avg       0.80      0.80      0.80      3200



In [40]:
try_tfidf_classifier(RandomForestClassifier(max_depth=5, random_state=1))

Mean accuracy: 0.806875
TP: 1231, FN: 248
FP: 370, TN: 1351
              precision    recall  f1-score   support

       False       0.77      0.83      0.80      1479
        True       0.84      0.79      0.81      1721

    accuracy                           0.81      3200
   macro avg       0.81      0.81      0.81      3200
weighted avg       0.81      0.81      0.81      3200



In [41]:
try_tfidf_classifier(RandomForestClassifier(max_depth=5, min_samples_leaf=5, random_state=1))

Mean accuracy: 0.8025
TP: 1214, FN: 245
FP: 387, TN: 1354
              precision    recall  f1-score   support

       False       0.76      0.83      0.79      1459
        True       0.85      0.78      0.81      1741

    accuracy                           0.80      3200
   macro avg       0.80      0.80      0.80      3200
weighted avg       0.81      0.80      0.80      3200



In [42]:
try_tfidf_classifier(RandomForestClassifier(max_depth=7, min_samples_leaf=5, n_estimators=200, random_state=1))

Mean accuracy: 0.82625
TP: 1281, FN: 236
FP: 320, TN: 1363
              precision    recall  f1-score   support

       False       0.80      0.84      0.82      1517
        True       0.85      0.81      0.83      1683

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [43]:
try_tfidf_classifier(RandomForestClassifier(max_depth=9, min_samples_leaf=5, n_estimators=200, random_state=1))

Mean accuracy: 0.8278125
TP: 1277, FN: 227
FP: 324, TN: 1372
              precision    recall  f1-score   support

       False       0.80      0.85      0.82      1504
        True       0.86      0.81      0.83      1696

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [44]:
try_tfidf_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=5, n_estimators=300, random_state=1))

Mean accuracy: 0.8375
TP: 1283, FN: 202
FP: 318, TN: 1397
              precision    recall  f1-score   support

       False       0.80      0.86      0.83      1485
        True       0.87      0.81      0.84      1715

    accuracy                           0.84      3200
   macro avg       0.84      0.84      0.84      3200
weighted avg       0.84      0.84      0.84      3200



In [45]:
try_tfidf_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=10, n_estimators=300, random_state=2))

Mean accuracy: 0.8425
TP: 1298, FN: 201
FP: 303, TN: 1398
              precision    recall  f1-score   support

       False       0.81      0.87      0.84      1499
        True       0.87      0.82      0.85      1701

    accuracy                           0.84      3200
   macro avg       0.84      0.84      0.84      3200
weighted avg       0.84      0.84      0.84      3200



In [46]:
try_tfidf_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=15, n_estimators=300, random_state=2))

Mean accuracy: 0.834375
TP: 1280, FN: 209
FP: 321, TN: 1390
              precision    recall  f1-score   support

       False       0.80      0.86      0.83      1489
        True       0.87      0.81      0.84      1711

    accuracy                           0.83      3200
   macro avg       0.83      0.84      0.83      3200
weighted avg       0.84      0.83      0.83      3200



In [47]:
try_tfidf_classifier(RandomForestClassifier(max_depth=16, n_estimators=400, random_state=2), big_table=False)

Mean accuracy: 0.8425
TP: 1304, FN: 207
FP: 297, TN: 1392


In [48]:
try_tfidf_classifier(RandomForestClassifier(max_depth=20, n_estimators=600, random_state=1), big_table=False)

Mean accuracy: 0.8428125
TP: 1314, FN: 216
FP: 287, TN: 1383


In [49]:
try_tfidf_classifier(RandomForestClassifier(max_depth=20, n_estimators=800, random_state=1), big_table=False)

Mean accuracy: 0.846875
TP: 1318, FN: 207
FP: 283, TN: 1392


# Random Forest + Doc2Vec (DM)

In [51]:
try_classifier(RandomForestClassifier(max_depth=2, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.47125
TP: 838, FN: 929
FP: 763, TN: 670
              precision    recall  f1-score   support

       False       0.52      0.47      0.50      1767
        True       0.42      0.47      0.44      1433

    accuracy                           0.47      3200
   macro avg       0.47      0.47      0.47      3200
weighted avg       0.48      0.47      0.47      3200



In [52]:
try_classifier(RandomForestClassifier(max_depth=3, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.463125
TP: 805, FN: 922
FP: 796, TN: 677
              precision    recall  f1-score   support

       False       0.50      0.47      0.48      1727
        True       0.42      0.46      0.44      1473

    accuracy                           0.46      3200
   macro avg       0.46      0.46      0.46      3200
weighted avg       0.47      0.46      0.46      3200



In [53]:
try_classifier(RandomForestClassifier(max_depth=5, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.4203125
TP: 755, FN: 1009
FP: 846, TN: 590
              precision    recall  f1-score   support

       False       0.47      0.43      0.45      1764
        True       0.37      0.41      0.39      1436

    accuracy                           0.42      3200
   macro avg       0.42      0.42      0.42      3200
weighted avg       0.43      0.42      0.42      3200



In [54]:
try_classifier(RandomForestClassifier(max_depth=5, min_samples_leaf=5, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.4184375
TP: 767, FN: 1027
FP: 834, TN: 572
              precision    recall  f1-score   support

       False       0.48      0.43      0.45      1794
        True       0.36      0.41      0.38      1406

    accuracy                           0.42      3200
   macro avg       0.42      0.42      0.42      3200
weighted avg       0.43      0.42      0.42      3200



In [55]:
try_classifier(RandomForestClassifier(max_depth=7, min_samples_leaf=5, n_estimators=200, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.3515625
TP: 619, FN: 1093
FP: 982, TN: 506
              precision    recall  f1-score   support

       False       0.39      0.36      0.37      1712
        True       0.32      0.34      0.33      1488

    accuracy                           0.35      3200
   macro avg       0.35      0.35      0.35      3200
weighted avg       0.35      0.35      0.35      3200



In [56]:
try_classifier(RandomForestClassifier(max_depth=9, min_samples_leaf=5, n_estimators=200, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.3028125
TP: 528, FN: 1158
FP: 1073, TN: 441
              precision    recall  f1-score   support

       False       0.33      0.31      0.32      1686
        True       0.28      0.29      0.28      1514

    accuracy                           0.30      3200
   macro avg       0.30      0.30      0.30      3200
weighted avg       0.30      0.30      0.30      3200



In [57]:
try_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=5, n_estimators=300, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.269375
TP: 437, FN: 1174
FP: 1164, TN: 425
              precision    recall  f1-score   support

       False       0.27      0.27      0.27      1611
        True       0.27      0.27      0.27      1589

    accuracy                           0.27      3200
   macro avg       0.27      0.27      0.27      3200
weighted avg       0.27      0.27      0.27      3200



In [58]:
try_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=10, n_estimators=300, random_state=2), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.26875
TP: 455, FN: 1194
FP: 1146, TN: 405
              precision    recall  f1-score   support

       False       0.28      0.28      0.28      1649
        True       0.25      0.26      0.26      1551

    accuracy                           0.27      3200
   macro avg       0.27      0.27      0.27      3200
weighted avg       0.27      0.27      0.27      3200



In [59]:
try_classifier(RandomForestClassifier(max_depth=15, min_samples_leaf=15, n_estimators=300, random_state=2), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.2771875
TP: 455, FN: 1167
FP: 1146, TN: 432
              precision    recall  f1-score   support

       False       0.28      0.28      0.28      1622
        True       0.27      0.27      0.27      1578

    accuracy                           0.28      3200
   macro avg       0.28      0.28      0.28      3200
weighted avg       0.28      0.28      0.28      3200



In [60]:
try_classifier(RandomForestClassifier(max_depth=16, n_estimators=400, random_state=2), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.2690625
TP: 446, FN: 1184
FP: 1155, TN: 415
              precision    recall  f1-score   support

       False       0.28      0.27      0.28      1630
        True       0.26      0.26      0.26      1570

    accuracy                           0.27      3200
   macro avg       0.27      0.27      0.27      3200
weighted avg       0.27      0.27      0.27      3200



In [61]:
try_classifier(RandomForestClassifier(max_depth=20, n_estimators=600, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.2728125
TP: 452, FN: 1178
FP: 1149, TN: 421
              precision    recall  f1-score   support

       False       0.28      0.28      0.28      1630
        True       0.26      0.27      0.27      1570

    accuracy                           0.27      3200
   macro avg       0.27      0.27      0.27      3200
weighted avg       0.27      0.27      0.27      3200



In [62]:
try_classifier(RandomForestClassifier(max_depth=20, n_estimators=800, random_state=1), X_train_d2vdm, X_test_d2vdm)

Mean accuracy: 0.27625
TP: 461, FN: 1176
FP: 1140, TN: 423
              precision    recall  f1-score   support

       False       0.29      0.28      0.28      1637
        True       0.26      0.27      0.27      1563

    accuracy                           0.28      3200
   macro avg       0.28      0.28      0.28      3200
weighted avg       0.28      0.28      0.28      3200



# SVM + tf-idf

In [86]:
try_classifier(SVC(kernel='linear', random_state=1))

TP: 1370, FN: 209
FP: 183, TN: 1438
              precision    recall  f1-score   support

       False       0.88      0.87      0.87      1579
        True       0.87      0.89      0.88      1621

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [87]:
try_classifier(SVC(kernel='rbf', random_state=1))

TP: 1368, FN: 189
FP: 185, TN: 1458
              precision    recall  f1-score   support

       False       0.88      0.88      0.88      1557
        True       0.89      0.89      0.89      1643

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [111]:
try_classifier(SVC(kernel='poly', degree=2, random_state=1))

TP: 1348, FN: 188
FP: 205, TN: 1459
              precision    recall  f1-score   support

       False       0.87      0.88      0.87      1536
        True       0.89      0.88      0.88      1664

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [147]:
try_classifier(SVC(kernel='poly', degree=3, random_state=1))

TP: 1339, FN: 257
FP: 214, TN: 1390
              precision    recall  f1-score   support

       False       0.86      0.84      0.85      1596
        True       0.84      0.87      0.86      1604

    accuracy                           0.85      3200
   macro avg       0.85      0.85      0.85      3200
weighted avg       0.85      0.85      0.85      3200



In [112]:
try_classifier(SVC(kernel='sigmoid', random_state=1))

TP: 1371, FN: 211
FP: 182, TN: 1436
              precision    recall  f1-score   support

       False       0.88      0.87      0.87      1582
        True       0.87      0.89      0.88      1618

    accuracy                           0.88      3200
   macro avg       0.88      0.88      0.88      3200
weighted avg       0.88      0.88      0.88      3200



In [105]:
try_classifier(SVC(kernel='linear', max_iter=1000, random_state=1))



TP: 1272, FN: 277
FP: 281, TN: 1370
              precision    recall  f1-score   support

       False       0.82      0.82      0.82      1549
        True       0.83      0.83      0.83      1651

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [106]:
try_classifier(SVC(kernel='rbf', max_iter=1000, random_state=1))



TP: 1301, FN: 318
FP: 252, TN: 1329
              precision    recall  f1-score   support

       False       0.84      0.80      0.82      1619
        True       0.81      0.84      0.82      1581

    accuracy                           0.82      3200
   macro avg       0.82      0.82      0.82      3200
weighted avg       0.82      0.82      0.82      3200



In [107]:
try_classifier(SVC(kernel='poly', degree=2, max_iter=1000, random_state=1))



TP: 1302, FN: 334
FP: 251, TN: 1313
              precision    recall  f1-score   support

       False       0.84      0.80      0.82      1636
        True       0.80      0.84      0.82      1564

    accuracy                           0.82      3200
   macro avg       0.82      0.82      0.82      3200
weighted avg       0.82      0.82      0.82      3200



In [108]:
try_classifier(SVC(kernel='poly', degree=3, max_iter=1000, random_state=1))



TP: 928, FN: 162
FP: 625, TN: 1485
              precision    recall  f1-score   support

       False       0.60      0.85      0.70      1090
        True       0.90      0.70      0.79      2110

    accuracy                           0.75      3200
   macro avg       0.75      0.78      0.75      3200
weighted avg       0.80      0.75      0.76      3200



In [148]:
try_classifier(SVC(kernel='poly', degree=4, max_iter=1000, random_state=1))



TP: 440, FN: 97
FP: 1113, TN: 1550
              precision    recall  f1-score   support

       False       0.28      0.82      0.42       537
        True       0.94      0.58      0.72      2663

    accuracy                           0.62      3200
   macro avg       0.61      0.70      0.57      3200
weighted avg       0.83      0.62      0.67      3200



In [109]:
try_classifier(SVC(kernel='sigmoid', max_iter=1000, random_state=1))



TP: 1264, FN: 267
FP: 289, TN: 1380
              precision    recall  f1-score   support

       False       0.81      0.83      0.82      1531
        True       0.84      0.83      0.83      1669

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



# Extra Trees + tf-idf

In [130]:
from sklearn.ensemble import ExtraTreesClassifier

In [131]:
try_classifier(ExtraTreesClassifier(max_depth=5, n_estimators=100, random_state=1))

0.78125
TP: 1355, FN: 502
FP: 198, TN: 1145
              precision    recall  f1-score   support

       False       0.87      0.73      0.79      1857
        True       0.70      0.85      0.77      1343

    accuracy                           0.78      3200
   macro avg       0.78      0.79      0.78      3200
weighted avg       0.80      0.78      0.78      3200



In [132]:
try_classifier(ExtraTreesClassifier(max_depth=10, n_estimators=100, random_state=1))

0.825
TP: 1301, FN: 308
FP: 252, TN: 1339
              precision    recall  f1-score   support

       False       0.84      0.81      0.82      1609
        True       0.81      0.84      0.83      1591

    accuracy                           0.82      3200
   macro avg       0.83      0.83      0.82      3200
weighted avg       0.83      0.82      0.82      3200



In [133]:
try_classifier(ExtraTreesClassifier(max_depth=15, n_estimators=100, random_state=1))

0.829375
TP: 1284, FN: 277
FP: 269, TN: 1370
              precision    recall  f1-score   support

       False       0.83      0.82      0.82      1561
        True       0.83      0.84      0.83      1639

    accuracy                           0.83      3200
   macro avg       0.83      0.83      0.83      3200
weighted avg       0.83      0.83      0.83      3200



In [134]:
try_classifier(ExtraTreesClassifier(max_depth=5, n_estimators=200, random_state=1))

0.805625
TP: 1394, FN: 463
FP: 159, TN: 1184
              precision    recall  f1-score   support

       False       0.90      0.75      0.82      1857
        True       0.72      0.88      0.79      1343

    accuracy                           0.81      3200
   macro avg       0.81      0.82      0.80      3200
weighted avg       0.82      0.81      0.81      3200



In [135]:
try_classifier(ExtraTreesClassifier(max_depth=5, n_estimators=500, random_state=1))

0.8178125
TP: 1409, FN: 439
FP: 144, TN: 1208
              precision    recall  f1-score   support

       False       0.91      0.76      0.83      1848
        True       0.73      0.89      0.81      1352

    accuracy                           0.82      3200
   macro avg       0.82      0.83      0.82      3200
weighted avg       0.83      0.82      0.82      3200



In [137]:
try_classifier(ExtraTreesClassifier(max_depth=10, n_estimators=200, random_state=1))

0.8375
TP: 1339, FN: 306
FP: 214, TN: 1341
              precision    recall  f1-score   support

       False       0.86      0.81      0.84      1645
        True       0.81      0.86      0.84      1555

    accuracy                           0.84      3200
   macro avg       0.84      0.84      0.84      3200
weighted avg       0.84      0.84      0.84      3200



In [138]:
try_classifier(ExtraTreesClassifier(max_depth=15, n_estimators=400, random_state=1))

0.8521875
TP: 1333, FN: 253
FP: 220, TN: 1394
              precision    recall  f1-score   support

       False       0.86      0.84      0.85      1586
        True       0.85      0.86      0.85      1614

    accuracy                           0.85      3200
   macro avg       0.85      0.85      0.85      3200
weighted avg       0.85      0.85      0.85      3200



In [139]:
try_classifier(ExtraTreesClassifier(max_depth=15, n_estimators=500, random_state=1))

0.8559375
TP: 1339, FN: 247
FP: 214, TN: 1400
              precision    recall  f1-score   support

       False       0.86      0.84      0.85      1586
        True       0.85      0.87      0.86      1614

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200



In [140]:
try_classifier(ExtraTreesClassifier(max_depth=20, n_estimators=800, random_state=1))

0.861875
TP: 1335, FN: 224
FP: 218, TN: 1423
              precision    recall  f1-score   support

       False       0.86      0.86      0.86      1559
        True       0.86      0.87      0.87      1641

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200



In [144]:
try_classifier(ExtraTreesClassifier(max_depth=25, n_estimators=1000, random_state=1))

0.8628125
TP: 1335, FN: 221
FP: 218, TN: 1426
              precision    recall  f1-score   support

       False       0.86      0.86      0.86      1556
        True       0.87      0.87      0.87      1644

    accuracy                           0.86      3200
   macro avg       0.86      0.86      0.86      3200
weighted avg       0.86      0.86      0.86      3200



In [145]:
try_classifier(ExtraTreesClassifier(max_depth=30, n_estimators=1500, random_state=1))

0.86875
TP: 1344, FN: 211
FP: 209, TN: 1436
              precision    recall  f1-score   support

       False       0.87      0.86      0.86      1555
        True       0.87      0.87      0.87      1645

    accuracy                           0.87      3200
   macro avg       0.87      0.87      0.87      3200
weighted avg       0.87      0.87      0.87      3200



In [14]:
classifier_results

{}