In [29]:
import os
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer,HashingVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
import re
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

In [30]:
# read vocabulary
# English stop-word list from NLTK; read_vocab() below filters these words
# out of the vocabulary it builds.
sw = stopwords.words('english')

def read_vocab(path, stop_words=None, encoding='utf-8'):
    """Build a word -> column-index mapping from a one-word-per-line file.

    Parameters
    ----------
    path : str
        Vocabulary file containing one token per line.
    stop_words : iterable of str, optional
        Words to leave out of the vocabulary. When omitted, the module-level
        ``sw`` list is used.
    encoding : str, optional
        Text encoding used to read the file.

    Returns
    -------
    dict
        Maps every kept word to a consecutive integer starting at 0, in the
        order the words first appear in the file.
    """
    # A set gives constant-time membership tests instead of scanning a list
    # once per vocabulary line.
    excluded = set(sw if stop_words is None else stop_words)
    vocab = {}
    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            word = line.rstrip('\n')
            # Skip blank lines (a trailing newline used to add '' as a word),
            # stop words, and repeated words, so indices stay contiguous.
            if not word or word in excluded or word in vocab:
                continue
            vocab[word] = len(vocab)
    return vocab

# Word -> index mapping built from the vocabulary file in the aclImdb folder.
vocab = read_vocab(path='./aclImdb/imdb.vocab')

In [31]:
# read train directory
def read_all_file(path, encoding='utf-8'):
    """Return the whole content of a text file as a single string.

    Parameters
    ----------
    path : str
        File to read.
    encoding : str, optional
        Text encoding used to decode the file. Passing it explicitly keeps the
        result independent of the platform's default locale encoding.

    Returns
    -------
    str
        The decoded file content.
    """
    with open(path, 'r', encoding=encoding) as f:
        return f.read()
    
def path_to_category(path):
    """Derive a class label from a file name of the form ``<id>_<rating>.<ext>``.

    Parameters
    ----------
    path : str
        File name (or path) containing ``_<rating>.`` with a 1-2 digit rating.

    Returns
    -------
    int
        0 when rating <= 3, 2 when rating >= 7, and 1 otherwise.

    Raises
    ------
    ValueError
        If no ``_<rating>.`` fragment is found in ``path``.
    """
    # The quantifier has to sit inside the group: with ``(\d){1,2}`` the group
    # keeps only the last digit matched, so a rating of 10 was read as 0.
    match = re.search(r'_(\d{1,2})\.', path)
    if match is None:
        raise ValueError('no rating found in file name: %r' % (path,))
    rating = int(match.group(1))
    if rating <= 3:
        return 0
    if rating >= 7:
        return 2
    return 1
    

def read_texts_in_folder(path, category=None):
    """Load every file of a directory into a DataFrame.

    Parameters
    ----------
    path : str
        Directory whose entries are read.
    category : optional
        Label assigned to every row. When ``None``, each row's label is
        derived from its file name with ``path_to_category``.

    Returns
    -------
    pandas.DataFrame
        One row per directory entry with columns ``path`` (the entry name),
        ``category`` and ``text`` (the file content).
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order the same from run to run.
    files = sorted(os.listdir(path))
    df = pd.DataFrame(files, columns=['path'])
    if category is None:
        df['category'] = [path_to_category(name) for name in files]
    else:
        df['category'] = category
    # Plain list comprehensions also behave sensibly for an empty directory,
    # where a row-wise DataFrame.apply does not return a single column.
    df['text'] = [read_all_file(os.path.join(path, name)) for name in files]
    return df

# Training reviews: files under 'pos' are labelled 1, files under 'neg' 0.
df_train_pos = read_texts_in_folder(path='./aclImdb/train/pos', category=1)
df_train_neg = read_texts_in_folder(path='./aclImdb/train/neg', category=0)

In [32]:
# Test reviews, labelled the same way as the training folders
# ('pos' -> 1, 'neg' -> 0).
df_test_pos = read_texts_in_folder(path='./aclImdb/test/pos', category=1)
df_test_neg = read_texts_in_folder(path='./aclImdb/test/neg', category=0)

In [33]:
# concat all
# ignore_index=True gives each combined frame a fresh 0..n-1 index; without it
# the row labels of the 'pos' frame are repeated by the 'neg' frame.
df_train = pd.concat([df_train_pos, df_train_neg], ignore_index=True)
df_test = pd.concat([df_test_pos, df_test_neg], ignore_index=True)

In [34]:
# encode bag of words
# The vocabulary is supplied up front, so this vectorizer is only used to
# transform the texts; nothing is fitted in this cell.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    vocabulary=vocab,
)
X_train, y_train = vectorizer.transform(df_train['text']), np.array(df_train['category'])
X_test, y_test = vectorizer.transform(df_test['text']), np.array(df_test['category'])

In [35]:
# model
# Logistic regression with default settings, trained on the current
# X_train / y_train.
model = LogisticRegression().fit(X_train, y_train)
# Class probabilities for every test row.
y_pred = model.predict_proba(X_test)

In [36]:
# ROC AUC on the test set, scored with column 1 of the predicted probabilities.
roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

0.93352531199999988

In [37]:
# encode bag of words with bigrams
# No fixed vocabulary here: the unigram + bigram vocabulary is learned from the
# training texts and then applied unchanged to the test texts.
# Note that X_train / X_test from the earlier cell are overwritten.
vectorizer2 = CountVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    ngram_range=(1, 2),
)
X_train = vectorizer2.fit_transform(df_train['text'])
X_test = vectorizer2.transform(df_test['text'])

In [38]:
# Default logistic regression on the current X_train; the cell's output is
# the test ROC AUC computed from column 1 of the predicted probabilities.
model2 = LogisticRegression().fit(X_train, y_train)
y_pred = model2.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

0.95861868160000019

In [41]:
# Turn the column-1 probabilities into hard 0/1 predictions: strictly above
# the threshold -> 1, otherwise 0.
threshold = 0.5
y_pred_binary = [int(score > threshold) for score in y_pred[:, 1]]

In [42]:
# Precision of the thresholded predictions on the test labels.
precision_score(y_true=y_test, y_pred=y_pred_binary)

0.89784989209495647

In [43]:
# Recall of the thresholded predictions on the test labels.
recall_score(y_true=y_test, y_pred=y_pred_binary)

0.89863999999999999

In [44]:
# F1 score of the thresholded predictions on the test labels.
f1_score(y_true=y_test, y_pred=y_pred_binary)

0.89824477230018795

In [None]:
# Word n-grams of length 1 to 3; the vocabulary is learned from the training
# texts and reused for the test texts. Overwrites X_train / X_test.
vectorizer3 = CountVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    ngram_range=(1, 3),
)
X_train = vectorizer3.fit_transform(df_train['text'])
X_test = vectorizer3.transform(df_test['text'])

In [None]:
# Default logistic regression on the current X_train; the cell's output is
# the test ROC AUC computed from column 1 of the predicted probabilities.
model3 = LogisticRegression().fit(X_train, y_train)
y_pred = model3.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

In [23]:
# Character n-grams of length 1 to 3. No tokenizer is passed: scikit-learn
# only applies `tokenizer` when analyzer == 'word', so the word_tokenize
# argument had no effect on these features.
vectorizer4 = CountVectorizer(analyzer='char', ngram_range=(1, 3))
X_train = vectorizer4.fit_transform(df_train['text'])
X_test = vectorizer4.transform(df_test['text'])

In [24]:
# Default logistic regression on the current X_train; the cell's output is
# the test ROC AUC computed from column 1 of the predicted probabilities.
model4 = LogisticRegression().fit(X_train, y_train)
y_pred = model4.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

0.90829010239999997

In [19]:
# Hashed unigram + bigram features with the vectorizer's default
# normalisation. Overwrites X_train / X_test.
vectorizer5 = HashingVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    ngram_range=(1, 2),
)
X_train = vectorizer5.fit_transform(df_train['text'])
X_test = vectorizer5.transform(df_test['text'])

In [20]:
# Default logistic regression on the current X_train; the cell's output is
# the test ROC AUC computed from column 1 of the predicted probabilities.
model5 = LogisticRegression().fit(X_train, y_train)
y_pred = model5.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

0.93109686400000014

In [21]:
# Same hashed unigram + bigram setup as vectorizer5, but with norm='l1'
# requested explicitly. Overwrites X_train / X_test.
vectorizer6 = HashingVectorizer(
    analyzer='word',
    tokenizer=word_tokenize,
    ngram_range=(1, 2),
    norm='l1',
)
X_train = vectorizer6.fit_transform(df_train['text'])
X_test = vectorizer6.transform(df_test['text'])

In [22]:
# Default logistic regression on the current X_train; the cell's output is
# the test ROC AUC computed from column 1 of the predicted probabilities.
model6 = LogisticRegression().fit(X_train, y_train)
y_pred = model6.predict_proba(X_test)
roc_auc_score(y_true=y_test, y_score=y_pred[:, 1])

0.73723795199999997