### set up dependencies

In [1]:
import numpy as np
import pandas as pd

import re
import nltk
nltk.download('stopwords')

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/teodorastereciu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### get data

In [2]:
# load dataset into memory
def load_data (filename):
    df = pd.read_csv(filename)
    corpus = df["text"]
    target = df["label"]
    return corpus, target

corpus_valid, target_valid = load_data("IMDB/Valid.csv")
corpus_train, target_train = load_data("IMDB/Train.csv")
corpus_test, target_test = load_data("IMDB/Test.csv")

In [3]:
print("Possible sentiments are", np.unique(target_train))
print("The number of reviews for training is", len(corpus_train))
size = len(corpus_train) + len(corpus_valid) + len(corpus_test)
print("The train / valid / test split is", str(len(corpus_train)/size) + " / " + str(len(corpus_valid)/size) + " / " + str(len(corpus_test)/size))
info = pd.DataFrame([corpus_train[5]], columns=["raw text example"]) # use to track progress
info

Possible sentiments are [0 1]
The number of reviews for training is 40000
The train / valid / test split is 0.8 / 0.1 / 0.1


Unnamed: 0,raw text example
0,A terrible movie as everyone has said. What ma...


### clean up data

In [4]:
stopwords = nltk.corpus.stopwords.words("english")
# turn the dataset into clean tokens
def clean_data(doc, vocab):
    #doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc)
    doc = re.sub(r'<[^>]+>', '', doc)  # remove HTML tags
    doc = re.sub(r'\W+', ' ', doc) # remove special characters
    
    # convert to lower case tokens excluding stopwords
    tokens = [word.lower() for word in doc.split() if word.lower() not in stopwords]

    # vocab += [word for word in tokens if word not in vocab]
    clean_doc = " ".join(tokens)

    return clean_doc

In [5]:
vocab_train = []
X_train = []
y_train = target_train
line = None
clean_line = None
for line in corpus_train:
    clean_line = clean_data(line, vocab_train)
    X_train.append(clean_line)

In [6]:
vocab_valid = []
X_valid = []
y_valid = target_valid
line = None
clean_line = None
for line in corpus_valid:
    clean_line = clean_data(line, vocab_valid)
    X_valid.append(clean_line)

In [7]:
vocab_test = []
X_test = []
y_test = target_test
line = None
clean_line = None
for line in corpus_test:
    clean_line = clean_data(line, vocab_test)
    X_test.append(clean_line)

In [8]:
info["clean text example"] = X_train[5]
info["sentiment"] = y_train[5]
info["% pos reviews"] = int(100*np.sum(y_train)/len(y_train))
review_len = [sum(1 for word in review.split()) for review in X_train]
info["num words in corpus"] = np.sum(review_len)
info["avg num words per review"] = np.mean(review_len)
info["max num words per review"] = np.max(review_len)
info["min num words per review"] = np.min(review_len)
info

Unnamed: 0,raw text example,clean text example,sentiment,% pos reviews,num words in corpus,avg num words per review,max num words per review,min num words per review
0,A terrible movie as everyone has said. What ma...,terrible movie everyone said made laugh cameo ...,0,49,4786521,119.663025,1429,3


## TF-IDF

In [12]:
pipeline_tfidf = Pipeline([
        ("vect", TfidfVectorizer()),
        ("clf", MLPClassifier()),
    ])
pipeline_tfidf

### Hyperparameter search

Since we'll be using k-fold cross-validation for this, we'll merge the original training set with the original validation set. 

In [13]:
parameter_grid_tfidf = {
    'vect__min_df' : [0.0, 0.2],
    'clf__hidden_layer_sizes': [(100,), (200,)],
    'clf__alpha': [1e-4, 1e-3],
}

grid_search_tfidf = GridSearchCV(
    estimator = pipeline_tfidf,
    param_grid = parameter_grid,
    cv = 5,
    verbose = 1,
)

In [11]:
#X_train_90 = np.concatenate([X_train, X_valid])
#y_train_90 = np.concatenate([y_train, y_valid])

In [15]:
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

grid_search_tfidf.fit(X_train[0:100], y_train[0:100])

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [16]:
print("Best parameters combination found:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in parameter_grid.keys():
    print(f"{param_name}: {best_parameters[param_name]}")

Best parameters combination found:
vect__min_df: 0.0
clf__hidden_layer_sizes: (200,)
clf__alpha: 0.0001


# Word2Vec

In [1]:
from gensim.models import Word2Vec