In [12]:
import tarfile
import os
import shutil
from urllib.request import urlopen
from contextlib import closing

# Data downloading
#
# Fetches the IMDB review archive, unpacks it into ./aclImdb and prunes the
# files that are not needed. The whole step is skipped when ./aclImdb exists.

# Relative paths of the train/test data folders inside the extracted archive.
imdb_train_data_folder = "./aclImdb/train"
imdb_test_data_folder = "./aclImdb/test"

URL="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# NOTE: despite the ".tar" suffix this temporary file is gzip-compressed;
# it is opened with mode "r:gz" below and deleted once extraction is done.
ARCHIVE_NAME = "aclImdb_v1.tar"

if not os.path.exists("aclImdb"):
    # Stream the download straight to disk instead of buffering the whole
    # archive in memory; the HTTP response and the file are both closed when
    # the `with` block exits, even if the transfer fails part-way.
    with urlopen(URL) as response, open(ARCHIVE_NAME, 'wb') as archive:
        shutil.copyfileobj(response, archive)

    # Extract all files into the current directory.
    # NOTE(review): extractall() trusts the member paths stored in the archive,
    # and the archive is fetched over plain HTTP. Consider an extraction filter
    # (where the running Python version supports one) or validating member
    # names before extracting.
    with tarfile.open(ARCHIVE_NAME, "r:gz") as archive:
        archive.extractall(path='.')

    # Remove the loose .feat/.txt files at the top level of both split folders.
    for split_folder in (imdb_train_data_folder, imdb_test_data_folder):
        for item in os.listdir(split_folder):
            if item.endswith((".feat", ".txt")):
                os.remove(os.path.join(split_folder, item))

    # Remove the "unsup" folder, which exists only in the train split.
    shutil.rmtree(os.path.join(imdb_train_data_folder, "unsup"))

    # Remove the downloaded archive; it is no longer needed after extraction.
    os.remove(ARCHIVE_NAME)



In [13]:
import sys
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Read the training split from its folder (file order kept: shuffle=False)
# and report how many documents were found.
imdb_train_data = load_files(imdb_train_data_folder, shuffle=False)
n_train_docs = len(imdb_train_data.data)
print("train_samples: %d" % n_train_docs)

# Same for the test split.
imdb_test_data = load_files(imdb_test_data_folder, shuffle=False)
n_test_docs = len(imdb_test_data.data)
print("test_samples: %d" % n_test_docs)


train_samples: 25000
test_samples: 25000


In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Two bag-of-words vectorizers are fitted on the training documents:
#   count_vect  - default settings
#   count_vect2 - additionally excludes English stop words ('the', 'of', ...)
count_vect = CountVectorizer()
count_vect2 = CountVectorizer(stop_words='english')

# Build each feature dictionary and transform the documents to count vectors.
train_docs = imdb_train_data.data
X_train_counts = count_vect.fit_transform(train_docs)
X_train_counts2 = count_vect2.fit_transform(train_docs)

# Show the dimensions of both count matrices.
for counts in (X_train_counts, X_train_counts2):
    print(counts.shape)

(25000, 74849)
(25000, 74538)


In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

# Convert both count matrices to a tf-idf representation.
# As before, the suffix 2 marks the variant with stop words removed.
tfidf_transformer, tfidf_transformer2 = TfidfTransformer(), TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)

# Show the dimensions of both tf-idf matrices.
for tfidf_matrix in (X_train_tfidf, X_train_tfidf2):
    print(tfidf_matrix.shape)


(25000, 74849)
(25000, 74538)


In [16]:
# Pull one row out of each tf-idf matrix for inspection.
# NOTE: despite the "first_vector" names, index 10 selects the eleventh
# training document, not the first one.
sample_row = 10
first_vector = X_train_tfidf[sample_row]
first_vector2 = X_train_tfidf2[sample_row]
 

The decision tree classifier section starts below. Its hyperparameters are tuned with RandomizedSearchCV.

In [17]:
# Chain the vectorizer, the tf-idf weighting and the decision tree into a
# single pipeline that behaves like one compound classifier.
from sklearn import tree
from sklearn.pipeline import Pipeline
# The vectorizer starts out with English stop words removed; the randomized
# search in the next cell also tries stop_words=None.
pipeline_steps = [
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('dt', tree.DecisionTreeClassifier()),
]
text_clf = Pipeline(pipeline_steps)

In [18]:
# Hyperparameter tuning using Randomized search
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
# Training documents and their labels.
X = imdb_train_data.data
Y = imdb_train_data.target

# Candidate values for the CountVectorizer step.
ngram_range = [(1, 1), (1, 2), (2, 2)]
stop_words = ['english', None]
# Candidate values for the TfidfTransformer step.
use_idf = (True, False)
# Candidate values for the DecisionTreeClassifier step.
criterion = ['gini', 'entropy']
max_depth = [45, 65, 95, 125]

# One fixed 80/20 split over the row indices is passed as the only CV fold.
n = len(X)
idx1 = np.arange(n, dtype=int)
idx1_train, idx1_test = train_test_split(idx1, test_size=0.2, shuffle=True, random_state=123)
custom_cv = [(idx1_train, idx1_test)]

# Search space. A key of the form "<step>__<param>" addresses a parameter of
# the named pipeline step.
parameters = {
    'vect__ngram_range': ngram_range,
    'vect__stop_words': stop_words,
    'tfidf__use_idf': use_idf,
    'dt__criterion': criterion,
    'dt__max_depth': max_depth,
}

# The lists above give 3 * 2 * 2 * 2 * 4 = 96 combinations, the same number
# as n_iter. NOTE(review): presumably the search therefore visits the whole
# grid - confirm against the RandomizedSearchCV documentation.
rs_clf = RandomizedSearchCV(
    text_clf,
    parameters,
    n_iter=96,
    cv=custom_cv,
    random_state=199,
    return_train_score=False,
    n_jobs=-1,
)
rs_clf.fit(X, Y)

# Report the parameters of the best estimator found by the search.
best_params = rs_clf.best_estimator_.get_params()
report_rows = (
    ('Best n-gram range:', 'vect__ngram_range'),
    ('Best stop_words:', 'vect__stop_words'),
    ('Best use_idf:', 'tfidf__use_idf'),
    ('Best criterion:', 'dt__criterion'),
    ('Best max_depth:', 'dt__max_depth'),
)
for label, key in report_rows:
    print(label, best_params[key])
print()
print(best_params['dt'])

Best n-gram range: (1, 2)
Best stop_words: english
Best use_idf: False
Best criterion: gini
Best max_depth: 45

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=45, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


In [19]:
# Evaluate the tuned search object on the test set.

docs_test = imdb_test_data.data
import numpy as np
from sklearn import metrics
print("Decision trees:")
predicted_dt1 = rs_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label equals the target.
y_test = imdb_test_data.target
accuracy_dt1 = np.mean(predicted_dt1 == y_test)
print(accuracy_dt1)

# Per-class report, labelled with the test set's target names.
report_dt1 = metrics.classification_report(y_test, predicted_dt1, target_names=imdb_test_data.target_names)
print(report_dt1)


Decision trees:
0.72392
              precision    recall  f1-score   support

         neg       0.74      0.69      0.71     12500
         pos       0.71      0.76      0.73     12500

    accuracy                           0.72     25000
   macro avg       0.72      0.72      0.72     25000
weighted avg       0.72      0.72      0.72     25000

