In [1]:
%load_ext autoreload
%autoreload

from lib.dataset import download_tfds_imdb_as_text, download_tfds_imdb_as_text_tiny
from lib.classical_ml import run_pipeline



In [2]:
# Full IMDB text dataset consumed by every experiment in this notebook.
dataset = download_tfds_imdb_as_text()
# Presumably a small subset for quick smoke runs -- confirm in lib.dataset.
tiny_dataset = download_tfds_imdb_as_text_tiny()

# Experiment 1 Tokenizer

In [3]:
# approximate running time: 42 mins

# Each entry pairs the label printed before a run with the keyword
# arguments forwarded to run_pipeline for that run.
tokenizer_runs = [
    ("Simple SpaCy tokenizer", {}),
    (
        "Simple SpaCy tokenizer and ignore stop",
        {"lower": True, "ignore": ["like_num", "is_stop"]},
    ),
    (
        "Simple SpaCy tokenizer, lowercase, lemma",
        {"lower": True, "lemma": True},
    ),
]

for label, options in tokenizer_runs:
    print(label)
    # The two returned values are discarded; the scores shown in the cell
    # output are printed while run_pipeline executes.
    _, _ = run_pipeline(dataset, **options)
    


Simple SpaCy tokenizer
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.89 2
F1 on test set: 0.89
Simple SpaCy tokenizer and ignore stop
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.88 2
F1 on test set: 0.87
Simple SpaCy tokenizer, lowercase, lemma
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.89 2
F1 on test set: 0.88


```
Simple SpaCy tokenizer
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.89 2
F1 on test set: 0.89
Simple SpaCy tokenizer and ignore stop
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.88 2
F1 on test set: 0.87
Simple SpaCy tokenizer, lowercase, lemma
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.89 2
F1 on test set: 0.88
```

# Experiment 2 Vectorizer

Prerequisite: If you are not familiar with TFIDF, read [this chapter](https://nlp.stanford.edu/IR-book/pdf/06vect.pdf) first.

In this experiment, we will try different vectorization techniques: Bigrams, TFIDF and Binary. Although all of them are based on one-hot encoding, they capture slightly different information from the text.

- Bigrams:

- TFIDF: TFIDF is a very common technique in Information Retrieval (IR) and has long been shown to improve IR performance. However, text classification and IR are two different problems, so this is not necessarily the case for classification. For text classification, we can expect that whatever classification model we use can learn the same thing that TFIDF captures, namely how important words are, or how much information they contain.

- Binary:


In [4]:
# approximate running time: 82 mins

# All three runs share the lowercase + lemma tokenizer settings and differ
# only in the vectorizer option, so each label names the option actually
# used (the previous labels were copied from Experiment 1 and did not match
# the configurations being run).
vectorizer_runs = [
    ("Simple SpaCy tokenizer, lowercase, lemma, bigram", {"bigram": True}),
    ("Simple SpaCy tokenizer, lowercase, lemma, tfidf", {"tfidf": True}),
    ("Simple SpaCy tokenizer, lowercase, lemma, binary", {"binary": True}),
]

for label, vectorizer_option in vectorizer_runs:
    print(label)
    # The two returned values are discarded; the scores shown in the cell
    # output are printed while run_pipeline executes.
    _, _ = run_pipeline(dataset, lower=True, lemma=True, **vectorizer_option)
    


Simple SpaCy tokenizer
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.90 2
F1 on test set: 0.90
Simple SpaCy tokenizer and ignore stop
Best parameters set found on development set:  {'model__C': 10}
Best F1 on development set: 0.89 2
F1 on test set: 0.88
Simple SpaCy tokenizer, lowercase, lemma
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.89 2
F1 on test set: 0.88


```
Simple SpaCy tokenizer
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.90 2
F1 on test set: 0.90
Simple SpaCy tokenizer and ignore stop
Best parameters set found on development set:  {'model__C': 10}
Best F1 on development set: 0.89 2
F1 on test set: 0.88
Simple SpaCy tokenizer, lowercase, lemma
Best parameters set found on development set:  {'model__C': 0.1}
Best F1 on development set: 0.89 2
F1 on test set: 0.88
```

# Experiment 3 Model

- Naive Bayes
- Logistic regression

In [5]:
# approximate running time: 3 mins

import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() follows the system clock and can jump if that
# clock is adjusted while the run is in progress.
start = time.perf_counter()
print("Simple NB")
# use_nb=True selects the Naive Bayes run; the two returned values are
# discarded because the scores are printed during the call.
_, _ = run_pipeline(dataset, use_nb=True)

# Elapsed seconds for the run above.
print(time.perf_counter() - start)

Simple NB
Best parameters set found on development set:  {}
Best F1 on development set: 0.84 2
F1 on test set: 0.80
167.90781021118164


```
Simple SpaCy tokenizer, lowercase, lemma
Load tokenized document from disk
Load tokenized document from disk
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True) {}
Best parameters set found on development set:  {}
Best F1 on development set: 0.84
F1 on test set: 0.80
164.55402326583862
```