In [1]:
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

# Load training and test data

In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving train.tsv to train.tsv
User uploaded file "train.tsv" with length 13049217 bytes


In [3]:
dataframe = pd.read_csv('./train.tsv', sep = '\t')
print(dataframe)

       label                                             review
0          0  Leaks: Liss seems to be totally incompetent: m...
1          1  Replacement Peeler: Loved my old one. Loaned i...
2          0  Not what I was expecting: I chose to rate this...
3          1  Watch face is hard to read: Although I don't o...
4          0  Disappointing: I was eager to read this book s...
...      ...                                                ...
29991      1  Love EW: I must admit that I am a total TV afi...
29992      1  Easy to follow and delicious recipes!: I compl...
29993      1  The Beauty and Mystery of Veronique: Perhaps t...
29994      1  I love it.: Brilliant, hilarious, quick and ea...
29995      0  broken...: bad choice...2d film would not play...

[29996 rows x 2 columns]


In [4]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving test.tsv to test.tsv
User uploaded file "test.tsv" with length 2618508 bytes


In [6]:
testdataframe = pd.read_csv('./test.tsv', sep = '\t')
print(testdataframe)

        id                                             review
0        1  Human Hurricane!: Would you like to sleep in t...
1        2  A Mom: I bought this with all kinds of expecta...
2        3  Good Read: I judge all books that I read by a ...
3        4  It's awesome: DVD set is exactly what you'd bu...
4        5  Great Movie!!!: This definatly the best Godzil...
...    ...                                                ...
5995  5996  Beautiful and Spiritual: This is a very beauti...
5996  5997  Another Cash In: This cd is pure dreck and it'...
5997  5998  Concept drawings-very good: The concept drawin...
5998  5999  I hear i all the time is awsome: this is great...
5999  6000  Not so great Performance: This mouse is very s...

[6000 rows x 2 columns]


# Feature Extraction method: Count Vectors & TF-IDF Vector

In [7]:
# create a dataframe using texts and lables
trainDF = pd.DataFrame()
trainDF['text'] = dataframe['review']
trainDF['label'] = dataframe['label']
testDF = pd.DataFrame()
testDF['text'] = testdataframe['review']

In [11]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y =sklearn.model_selection.train_test_split(trainDF['text'], trainDF['label'])

# label encode the target variable 
encoder = sklearn.preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

Count Vectors as features:

Count Vector is a matrix notation of the dataset in which every row represents a document from the corpus, every column represents a term from the corpus, and every cell represents the frequency count of a particular term in a particular document.


In [12]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

TF-IDF Vectors as features:

TF-IDF score represents the relative importance of a term in the document and the entire corpus. TF-IDF score is composed by two terms: the first computes the normalized Term Frequency (TF), the second term is the Inverse Document Frequency (IDF), computed as the logarithm of the number of the documents in the corpus divided by the number of documents where the specific term appears.

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

IDF(t) = log_e(Total number of documents / Number of documents with term t in it)

TF-IDF Vectors can be generated at different levels of input tokens:

a. Word Level TF-IDF : Matrix representing tf-idf scores of every term in different documents

b. N-gram Level TF-IDF : N-grams are the combination of N terms together. This Matrix representing tf-idf scores of N-grams

c. Character Level TF-IDF : Matrix representing tf-idf scores of character level n-grams in the corpus

In [15]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=15000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=15000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=15000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x) 



# Train and evaluate model

In [13]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return sklearn.metrics.accuracy_score(predictions, valid_y)

In [16]:
# Linear Classifier on Count Vectors
accuracy = train_model(sklearn.linear_model.LogisticRegression(C = 0.1, solver='liblinear'), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(sklearn.linear_model.LogisticRegression(C = 0.1, solver='liblinear'), xtrain_tfidf, train_y, xvalid_tfidf)
print( "LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(sklearn.linear_model.LogisticRegression(C = 0.1, solver='liblinear'), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(sklearn.linear_model.LogisticRegression(C = 0.1, solver='liblinear'), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.8762501666888919
LR, WordLevel TF-IDF:  0.8463795172689692
LR, N-Gram Vectors:  0.8357114281904254
LR, CharLevel Vectors:  0.8226430190692092
