In [2]:
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

# Load training and test data

In [3]:
dataframe = pd.read_csv('./train.tsv', sep = '\t')
print(dataframe)

       label                                             review
0          0  Leaks: Liss seems to be totally incompetent: m...
1          1  Replacement Peeler: Loved my old one. Loaned i...
2          0  Not what I was expecting: I chose to rate this...
3          1  Watch face is hard to read: Although I don't o...
4          0  Disappointing: I was eager to read this book s...
...      ...                                                ...
29991      1  Love EW: I must admit that I am a total TV afi...
29992      1  Easy to follow and delicious recipes!: I compl...
29993      1  The Beauty and Mystery of Veronique: Perhaps t...
29994      1  I love it.: Brilliant, hilarious, quick and ea...
29995      0  broken...: bad choice...2d film would not play...

[29996 rows x 2 columns]


In [4]:
train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = dataframe.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

training set size: 23997
validation set size: 5999


In [5]:
test_dataframe = pd.read_csv('./test.tsv', sep = '\t')
print (test_dataframe)

        id                                             review
0        1  Human Hurricane!: Would you like to sleep in t...
1        2  A Mom: I bought this with all kinds of expecta...
2        3  Good Read: I judge all books that I read by a ...
3        4  It's awesome: DVD set is exactly what you'd bu...
4        5  Great Movie!!!: This definatly the best Godzil...
...    ...                                                ...
5995  5996  Beautiful and Spiritual: This is a very beauti...
5996  5997  Another Cash In: This cd is pure dreck and it'...
5997  5998  Concept drawings-very good: The concept drawin...
5998  5999  I hear i all the time is awsome: this is great...
5999  6000  Not so great Performance: This mouse is very s...

[6000 rows x 2 columns]


# Feature selection method: the chi-square

In [6]:
import re

def process_text(text):
    for punctuations in [',', '.', '"', '!', '?', ':', ';', '-', '(', ')', '[', ']']:
        text = text.replace(punctuations, ' ')
    text = re.sub('\s+', ' ', text)
    text = text.lower().strip()
    return text

def get_single_word_frequency(filepath):
    word_freq = {}
    with open(filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            review_text = process_text(line.split('\t')[1])
            for word in review_text.split():
                if word not in word_freq:
                    word_freq[word] = 1
                else:
                    word_freq[word] += 1
    return word_freq

def get_single_word_doc_frequency_per_label(filepath, label):
    word_freq_per_label = {}
    with open(filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            sentiment_label = line.split('\t')[0].strip()
            if sentiment_label == label:
                review_text = process_text(line.split('\t')[1])
                for word in set(review_text.split()):
                    if word not in word_freq_per_label:
                        word_freq_per_label[word] = 1
                    else:
                        word_freq_per_label[word] += 1
    return word_freq_per_label

def feature_selection_chi2(review_filepath, num_features_to_select):
    num_reviews = 0
    num_positive_reviews = 0
    num_negative_reviews = 0
    with open(review_filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            num_reviews += 1
            if line.strip().split()[0] == '1':
                num_positive_reviews += 1
            else:
                num_negative_reviews += 1
    word_freq = get_single_word_frequency(review_filepath)
    positive_word_freq = get_single_word_doc_frequency_per_label(review_filepath, '1')
    negative_word_freq = get_single_word_doc_frequency_per_label(review_filepath, '0')
    
    chi2_dict = {}
    for word, freq in word_freq.items():
        if word in positive_word_freq and word in negative_word_freq:        
            contingency_table = np.zeros((2,2))
            contingency_table[0][0] = positive_word_freq[word]
            contingency_table[0][1] = negative_word_freq[word]
            contingency_table[1][0] = num_positive_reviews - positive_word_freq[word]
            contingency_table[1][1] = num_negative_reviews - negative_word_freq[word]

            chi2 = 0.0
            for i in range(2):
                for j in range(2):
                    expected_count = sum(contingency_table[i,:])*sum(contingency_table[:,j])/float(num_reviews)
                    chi2 += (contingency_table[i][j] - expected_count)**2 / expected_count

            chi2_dict[word] = chi2
    feature_set = set([])
    for word, chi2 in sorted(chi2_dict.items(), key = lambda x: x[1], reverse = True)[:num_features_to_select]:
        feature_set.add(word)
    return feature_set

In [7]:
num_features = 15000
feature_set = feature_selection_chi2('./train.tsv', num_features)
vectorizer = CountVectorizer(vocabulary = feature_set)
vectorizer.fit(train_dataframe['review'])

CountVectorizer(vocabulary={'#', '#1', '#10', '#15', '#2', '#4', '#5', '#6',
                            '#7', '#8', '#9', '$', '$$', '$$$', '$0', '$1',
                            '$10', '$100', '$11', '$12', '$13', '$14', '$15',
                            '$150', '$18', '$19', '$2', '$20', '$200', '$22', ...})

# Extract feature vectors for training, validation, and test data 

In [8]:
train_X = vectorizer.transform(train_dataframe['review'])
valid_X = vectorizer.transform(valid_dataframe['review'])
test_X = vectorizer.transform(test_dataframe['review'])
print (train_X.shape)
print (valid_X.shape)
print (test_X.shape)

(23997, 15000)
(5999, 15000)
(6000, 15000)


In [9]:
train_Y = train_dataframe['label']
model = LogisticRegression(C = 0.1, solver='liblinear')
model.fit(train_X, train_Y)

LogisticRegression(C=0.1, solver='liblinear')

# Train and evaluate model

In [10]:
# Train model on training set
train_Y = train_dataframe['label']
model = LogisticRegression(C = 0.1, solver='liblinear')
model.fit(train_X, train_Y)

LogisticRegression(C=0.1, solver='liblinear')

In [11]:
# Evaluate model on training set
train_Y_hat = model.predict(train_X)
train_Y = train_dataframe['label'].to_numpy()
accuracy = accuracy_score(train_Y, train_Y_hat)
print ('Logistic regression, accuracy on training set:', accuracy)

Logistic regression, accuracy on training set: 0.9343667958494812


In [12]:
# Evaluate model on validation set
valid_Y_hat = model.predict(valid_X)
valid_Y = valid_dataframe['label'].to_numpy()
accuracy = accuracy_score(valid_Y, valid_Y_hat)
print ('Logistic regression, accuracy on validation set:', accuracy)

Logistic regression, accuracy on validation set: 0.8826471078513085


# Prediction

In [13]:
# helper function: write out prediction values into a csv format file
# params:
#     df: dataframe, where each row is a test example, with column 'id' as data id
#     pred: a list or 1-d array of prediction values
#     filepath: the output file path
# return:
#     None

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'label'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['id'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

In [14]:
# After experimentation on the validation set: retrain the final model on all training data, and predict labels for test data
all_train_Y = dataframe['label']
all_train_X = vectorizer.transform(dataframe['review'])
model.fit(all_train_X, all_train_Y)
test_Y_hat = model.predict(test_X)
write_test_prediction(test_dataframe, test_Y_hat, './logistic_regression_chisqr.csv')

6000 predictions are written to ./logistic_regression_chisqr.csv
