In [0]:
import tarfile
import os
import shutil
from urllib.request import urlopen
from contextlib import closing

# Data downloading
#
# Fetches the IMDB review archive once, unpacks it into ./aclImdb and prunes
# everything except the labelled review folders used by the cells below.

# Relative paths of the train/test data folders created by the extraction.
imdb_train_data_folder = "./aclImdb/train"
imdb_test_data_folder = "./aclImdb/test"

URL="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
# NOTE: the file is gzip-compressed despite the ".tar" suffix, hence the
# explicit "r:gz" mode when it is opened below.
ARCHIVE_NAME = "aclImdb_v1.tar"

# The presence of the extracted folder is what marks the dataset as available,
# so a re-run of this cell does not download anything again.
if not os.path.exists("aclImdb"):
    try:
        # Stream the response to disk in chunks instead of holding the whole
        # archive in memory, and make sure the HTTP connection gets closed.
        with closing(urlopen(URL)) as response, open(ARCHIVE_NAME, 'wb') as archive:
            shutil.copyfileobj(response, archive)

        # The archive was fetched over plain HTTP: use tarfile's "data"
        # extraction filter when this Python version provides it, so members
        # cannot be written outside the destination folder.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
        with closing(tarfile.open(ARCHIVE_NAME, "r:gz")) as archive:
            archive.extractall(path='.', **extract_kwargs)
    except BaseException:
        # A half-extracted folder would make the existence check above skip
        # the download forever; drop it so the next run starts from scratch.
        shutil.rmtree("aclImdb", ignore_errors=True)
        raise
    finally:
        # Remove the archive whether or not the download/extraction succeeded.
        if os.path.exists(ARCHIVE_NAME):
            os.remove(ARCHIVE_NAME)

    # Remove the top-level .txt/.feat files of both splits and the "unsup"
    # folder of the training split, leaving only the review sub-folders.
    for data_folder in (imdb_train_data_folder, imdb_test_data_folder):
        for item in os.listdir(data_folder):
            if item.endswith((".feat", ".txt")):
                os.remove(os.path.join(data_folder, item))
    shutil.rmtree(os.path.join(imdb_train_data_folder, "unsup"))



In [9]:
import sys
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Read the training split from its folder (shuffle=False: no shuffling on load)
# and report how many documents it contains.
imdb_train_data = load_files(container_path=imdb_train_data_folder, shuffle=False)
print("train_samples: %d" % (len(imdb_train_data.data),))

# Same for the held-out test split.
imdb_test_data = load_files(container_path=imdb_test_data_folder, shuffle=False)
print("test_samples: %d" % (len(imdb_test_data.data),))


train_samples: 25000
test_samples: 25000


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: each vectorizer learns a vocabulary from the training
# documents and converts every document into a row of token counts.

# Variant 1: default settings.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(imdb_train_data.data)

# Variant 2 (suffix "2"): English stop words such as 'the', 'of' are excluded.
count_vect2 = CountVectorizer(stop_words='english')
X_train_counts2 = count_vect2.fit_transform(imdb_train_data.data)

# (documents, vocabulary size) of each count matrix.
print(X_train_counts.shape)
print(X_train_counts2.shape)

(25000, 74849)
(25000, 74538)


In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw count matrices into tf-idf representations.
# As before, the suffix "2" marks the variant built without stop words.

# With stop words.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Without stop words.
tfidf_transformer2 = TfidfTransformer()
X_train_tfidf2 = tfidf_transformer2.fit_transform(X_train_counts2)

print(X_train_tfidf.shape)
print(X_train_tfidf2.shape)



(25000, 74849)
(25000, 74538)


In [0]:
import pandas as pd

def _feature_names(vectorizer):
    """Return the vectorizer's vocabulary terms in column order.

    ``get_feature_names`` was removed in scikit-learn 1.2 in favour of
    ``get_feature_names_out``; older releases only have the former, so use
    whichever one the installed version provides.
    """
    if hasattr(vectorizer, "get_feature_names_out"):
        return vectorizer.get_feature_names_out()
    return vectorizer.get_feature_names()


# Pick one training document (row index 10) from each tf-idf matrix.
first_vector=X_train_tfidf[10]
first_vector2=X_train_tfidf2[10]

# One row per vocabulary term holding that term's tf-idf score in the chosen
# document, to compare the representation with/without stop words.
# toarray() yields a plain ndarray column instead of the legacy np.matrix.
df = pd.DataFrame(first_vector.T.toarray(), index=_feature_names(count_vect), columns=["tfidf"])
df2 = pd.DataFrame(first_vector2.T.toarray(), index=_feature_names(count_vect2), columns=["tfidf_stopwords"])


In [13]:
# Highest-scoring terms first (stop words included in the vocabulary).
df.sort_values("tfidf", ascending=False)

Unnamed: 0,tfidf
the,0.252033
he,0.192326
accent,0.172488
of,0.167375
three,0.164270
...,...
flaw,0.000000
flavourless,0.000000
flavouring,0.000000
flavoured,0.000000


In [14]:
# Highest-scoring terms first (stop words excluded from the vocabulary).
df2.sort_values("tfidf_stopwords", ascending=False)

Unnamed: 0,tfidf_stopwords
accent,0.214215
spite,0.169063
actresses,0.149836
decrescendos,0.147218
bleibtreau,0.147218
...,...
flavin,0.000000
flavia,0.000000
flava,0.000000
flav,0.000000


# Models

## Logistic Regression

In [0]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with default settings on the tf-idf features
# that keep stop words; the labels come from the training folder names.
clf_lr = LogisticRegression().fit(
    X=X_train_tfidf,
    y=imdb_train_data.target,
)

## SVM


In [0]:
from sklearn.linear_model import SGDClassifier
# Linear SVM trained with stochastic gradient descent on the tf-idf features
# that keep stop words. loss="hinge" is SGDClassifier's default and is what
# makes this a linear SVM; it is spelled out to match the section title.
# SGD shuffles the training data, so the seed is fixed to make the fitted
# model (and the scores reported below) reproducible across runs.
SVM_RANDOM_STATE = 42
clf_svm = SGDClassifier(loss="hinge", random_state=SVM_RANDOM_STATE).fit(X_train_tfidf, imdb_train_data.target)

## Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
# Random forest trained on the tf-idf features built WITHOUT stop words
# (the "2" variants), so it must be evaluated with count_vect2 /
# tfidf_transformer2 as well.
# Tree construction is randomized; fixing the seed makes the fitted forest
# (and the scores reported below) reproducible across runs.
RF_RANDOM_STATE = 42
clf_rf = RandomForestClassifier(random_state=RF_RANDOM_STATE).fit(X_train_tfidf2, imdb_train_data.target)

## Evaluate

In [36]:
import numpy as np

def report_test_performance(title, classifier, X_test, y_true, target_names):
    """Print a fitted model's test accuracy and per-class report.

    Parameters
    ----------
    title : str
        Heading printed before the scores.
    classifier : fitted estimator
        Any object exposing ``predict(X_test)``.
    X_test : matrix
        Test features, produced by the same vectorizer/transformer pair the
        classifier was trained with.
    y_true : array-like
        Reference labels of the test documents.
    target_names : list of str
        Class names shown in the classification report.

    Returns
    -------
    The labels predicted for ``X_test``.
    """
    print(title)
    predicted = classifier.predict(X_test)
    # Fraction of test documents whose predicted label equals the reference.
    print(np.mean(predicted == y_true))
    print(metrics.classification_report(y_true, predicted, target_names=target_names))
    return predicted


# The vectorizer and tf-idf transformer are only applied here (transform, not
# fit_transform), so the test set does not influence vocabulary or weights.
X_test_counts = count_vect.transform(imdb_test_data.data)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

predicted_lr = report_test_performance(
    "Logistic Regresssion:", clf_lr, X_test_tfidf,
    imdb_test_data.target, imdb_test_data.target_names)

predicted_svm = report_test_performance(
    "SVM:", clf_svm, X_test_tfidf,
    imdb_test_data.target, imdb_test_data.target_names)

# The random forest was trained on the stop-word-free features, so its test
# features must come from the matching "2" vectorizer/transformer.
X_test_counts_rf = count_vect2.transform(imdb_test_data.data)
X_test_tfidf_rf = tfidf_transformer2.transform(X_test_counts_rf)

predicted_rf = report_test_performance(
    "Random Forest:", clf_rf, X_test_tfidf_rf,
    imdb_test_data.target, imdb_test_data.target_names)

Logistic Regresssion:
0.88316
              precision    recall  f1-score   support

         neg       0.88      0.88      0.88     12500
         pos       0.88      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000

SVM:
0.88532
              precision    recall  f1-score   support

         neg       0.89      0.88      0.88     12500
         pos       0.88      0.89      0.89     12500

    accuracy                           0.89     25000
   macro avg       0.89      0.89      0.89     25000
weighted avg       0.89      0.89      0.89     25000

Random Forest:
0.84976
              precision    recall  f1-score   support

         neg       0.85      0.86      0.85     12500
         pos       0.85      0.84      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted

In [0]:
 #   Training a classifier example:
 #   from sklearn.naive_bayes import MultinomialNB
 #   clf = MultinomialNB().fit(X_train_tfidf, imdb_train_data.target)
    


In [0]:
#  Predict an outcome on new document:
#  Doc_examples = ['God is love', 'GPU is fast']

#  convert documents into tf-idf.  
#  X_new_counts = count_vect.transform(Doc_examples) 
#  X_new_tfidf = tfidf_transformer.transform(X_new_counts)


#  predicted = clf.predict(X_new_tfidf)   # replace clf with your fitted model object, e.g. clf_lr
#  predicted then holds one predicted label per entry of Doc_examples.


