In [1]:
import scipy
import sklearn
import json
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from collections import Counter

# Download files, set up folder, put files into folder

In [2]:
# training data: ./train.tsv
# test data:     ./test.tsv

# Load training and test data

In [3]:
dataframe = pd.read_csv('./train.tsv', sep = '\t')
print(dataframe)

       label                                             review
0          0  Leaks: Liss seems to be totally incompetent: m...
1          1  Replacement Peeler: Loved my old one. Loaned i...
2          0  Not what I was expecting: I chose to rate this...
3          1  Watch face is hard to read: Although I don't o...
4          0  Disappointing: I was eager to read this book s...
...      ...                                                ...
29991      1  Love EW: I must admit that I am a total TV afi...
29992      1  Easy to follow and delicious recipes!: I compl...
29993      1  The Beauty and Mystery of Veronique: Perhaps t...
29994      1  I love it.: Brilliant, hilarious, quick and ea...
29995      0  broken...: bad choice...2d film would not play...

[29996 rows x 2 columns]


In [4]:
train_ratio = 0.8 # 80% for training, 20% for validation
random_seed = 100

train_dataframe = dataframe.sample(frac=train_ratio, random_state=random_seed)
valid_dataframe = dataframe.drop(train_dataframe.index)
print('training set size:', len(train_dataframe))
print('validation set size:', len(valid_dataframe))

training set size: 23997
validation set size: 5999


In [6]:
test_dataframe = pd.read_csv('./test.tsv', sep = '\t')
print (test_dataframe)

        id                                             review
0        1  Human Hurricane!: Would you like to sleep in t...
1        2  A Mom: I bought this with all kinds of expecta...
2        3  Good Read: I judge all books that I read by a ...
3        4  It's awesome: DVD set is exactly what you'd bu...
4        5  Great Movie!!!: This definatly the best Godzil...
...    ...                                                ...
5995  5996  Beautiful and Spiritual: This is a very beauti...
5996  5997  Another Cash In: This cd is pure dreck and it'...
5997  5998  Concept drawings-very good: The concept drawin...
5998  5999  I hear i all the time is awsome: this is great...
5999  6000  Not so great Performance: This mouse is very s...

[6000 rows x 2 columns]


# Try the trivial baseline: predict the majority label of the training set

In [7]:
Counter(train_dataframe['label'])

Counter({0: 11965, 1: 12032})

In [8]:
# Looks like label 1 has slightly more counts than label 0 in training data
# So the 'majority guess' prediction is an array filled with 1s
majority_guess_pred = [1 for i in range(len(valid_dataframe))]
accuracy = accuracy_score(valid_dataframe['label'], majority_guess_pred)
print ('Majority guess accuracy:', accuracy)

Majority guess accuracy: 0.5099183197199533


In [9]:
# helper function: write out prediction values into a csv format file
# params:
#     df: dataframe, where each row is a test example, with column 'id' as data id
#     pred: a list or 1-d array of prediction values
#     filepath: the output file path
# return:
#     None

def write_test_prediction(df, pred, filepath):
    with open(filepath, 'w') as outfile:
        outfile.write('{},{}\n'.format('id', 'label'))
        for index, row in df.iterrows():
            outfile.write('{},{}\n'.format(row['id'], pred[index]))
    print (len(df), 'predictions are written to', filepath)

In [10]:
majority_guess_pred_test = [1 for i in range(len(test_dataframe))]
write_test_prediction(test_dataframe, majority_guess_pred_test, './majority_guess.csv')

6000 predictions are written to ./majority_guess.csv


# Build feature extractor

## use all unigrams from training data as features

In [11]:
vectorizer = CountVectorizer()
vectorizer.fit(train_dataframe['review'])

CountVectorizer()

## Or: reuse the chi-square feature selection method from HW1 

In [12]:
import re

def process_text(text):
    for punctuations in [',', '.', '"', '!', '?', ':', ';', '-', '(', ')', '[', ']']:
        text = text.replace(punctuations, ' ')
    text = re.sub('\s+', ' ', text)
    text = text.lower().strip()
    return text

def get_single_word_frequency(filepath):
    word_freq = {}
    with open(filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            review_text = process_text(line.split('\t')[1])
            for word in review_text.split():
                if word not in word_freq:
                    word_freq[word] = 1
                else:
                    word_freq[word] += 1
    return word_freq

def get_single_word_doc_frequency_per_label(filepath, label):
    word_freq_per_label = {}
    with open(filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            sentiment_label = line.split('\t')[0].strip()
            if sentiment_label == label:
                review_text = process_text(line.split('\t')[1])
                for word in set(review_text.split()):
                    if word not in word_freq_per_label:
                        word_freq_per_label[word] = 1
                    else:
                        word_freq_per_label[word] += 1
    return word_freq_per_label

def feature_selection_chi2(review_filepath, num_features_to_select):
    num_reviews = 0
    num_positive_reviews = 0
    num_negative_reviews = 0
    with open(review_filepath) as f:
        f.readline() # skip header (the first line) 
        for line in f:
            num_reviews += 1
            if line.strip().split()[0] == '1':
                num_positive_reviews += 1
            else:
                num_negative_reviews += 1
    word_freq = get_single_word_frequency(review_filepath)
    positive_word_freq = get_single_word_doc_frequency_per_label(review_filepath, '1')
    negative_word_freq = get_single_word_doc_frequency_per_label(review_filepath, '0')
    
    chi2_dict = {}
    for word, freq in word_freq.items():
        if word in positive_word_freq and word in negative_word_freq:        
            contingency_table = np.zeros((2,2))
            contingency_table[0][0] = positive_word_freq[word]
            contingency_table[0][1] = negative_word_freq[word]
            contingency_table[1][0] = num_positive_reviews - positive_word_freq[word]
            contingency_table[1][1] = num_negative_reviews - negative_word_freq[word]

            chi2 = 0.0
            for i in range(2):
                for j in range(2):
                    expected_count = sum(contingency_table[i,:])*sum(contingency_table[:,j])/float(num_reviews)
                    chi2 += (contingency_table[i][j] - expected_count)**2 / expected_count

            chi2_dict[word] = chi2
    feature_set = set([])
    for word, chi2 in sorted(chi2_dict.items(), key = lambda x: x[1], reverse = True)[:num_features_to_select]:
        feature_set.add(word)
    return feature_set

In [20]:
num_features = 5000
feature_set = feature_selection_chi2('./train.tsv', num_features)
vectorizer = CountVectorizer(vocabulary = feature_set)
vectorizer.fit(train_dataframe['review'])

CountVectorizer(vocabulary={'#', '#1', '$', '$$$', '$1', '$10', '$13', '$14',
                            '$15', '$18', '$19', '$2', '$20', '$200', '$25',
                            '$250', '$29', '$3', '$30', '$300', '$39', '$4',
                            '$5', '$50', '$6', '$60', '$7', '$75', '$8', '$80', ...})

# Extract feature vectors for training, validation, and test data 

In [22]:
train_X = vectorizer.transform(train_dataframe['review'])
valid_X = vectorizer.transform(valid_dataframe['review'])
test_X = vectorizer.transform(test_dataframe['review'])
print (train_X.shape)
print (valid_X.shape)
print (test_X.shape)

(23997, 5000)
(5999, 5000)
(6000, 5000)


# Train model on training set

In [23]:
train_Y = train_dataframe['label']
model = LogisticRegression(C = 1, solver='liblinear')
model.fit(train_X, train_Y)

LogisticRegression(C=1, solver='liblinear')

# Evaluate model on training set

In [24]:
train_Y_hat = model.predict(train_X)
train_Y = train_dataframe['label'].to_numpy()
accuracy = accuracy_score(train_Y, train_Y_hat)
print ('Logistic regression, accuracy on training set:', accuracy)

Logistic regression, accuracy on training set: 0.9446597491353086


# Evaluate model on validation set

In [25]:
valid_Y_hat = model.predict(valid_X)
valid_Y = valid_dataframe['label'].to_numpy()
accuracy = accuracy_score(valid_Y, valid_Y_hat)
print ('Logistic regression, accuracy on validation set:', accuracy)

Logistic regression, accuracy on validation set: 0.8836472745457576


# After experimentation on the validation set: retrain the final model on all training data, and predict labels for test data

In [26]:
all_train_Y = dataframe['label']
all_train_X = vectorizer.transform(dataframe['review'])
model.fit(all_train_X, all_train_Y)
test_Y_hat = model.predict(test_X)
write_test_prediction(test_dataframe, test_Y_hat, './logistic_regression.csv')

6000 predictions are written to ./logistic_regression.csv


# Investigate what the model has learned and where it failed (A.K.A. error analysis)

## Look at learned parameters (for linear model: weight of each dimension)

In [27]:
# construct a mapping: word -> learned weight of this word
feature_weight = {}
for word, idx in vectorizer.vocabulary_.items():
    feature_weight[word] = model.coef_[0][idx]

In [28]:
# words correlated with positive sentiment (top ones)
for k, v in sorted(feature_weight.items(), key = lambda x: x[1], reverse = True)[:10]:
     print (k, v)

pleasantly 2.279402940495796
refreshing 2.205919739088226
worried 1.95732094393739
mars 1.9460025065988507
finest 1.8565518345759777
neat 1.7924083430328102
ch 1.7359182932270139
adorable 1.7190764107097123
gem 1.6675652240766863
pleased 1.6588436574745422


In [29]:
# words correlated with negative sentiments (top ones)
for k, v in sorted(feature_weight.items(), key = lambda x: x[1], reverse = False)[:10]:
     print (k, v)

disappointing -2.722746977631435
worthless -2.675360261781088
waste -2.541643888760075
worst -2.4806099150298264
overrated -2.40984309457248
alas -2.3494191985941586
eh -2.181271452099388
hopes -2.1732474653070297
disappointment -2.1527476608010305
disappointed -2.0185290153854245


## Look at how the model makes predictions on individual examples

In [30]:
# We pick a set of examples from the validation set (we predicted scores for those).
# We usually we don't pick from training data (since the good performance may be unrealistic).
# We cannot do error analysis on test data （because no true target value is provided）.

In [31]:
def explain_linear_prediction(df, model, idx2feature, X, Y, Y_hat, idx_list):
    print('indices:', idx_list)
    for idx in idx_list:
        print ('==============', idx, '================')
        print ('document:', df.iloc[idx]['review'])
        print ('TRUE label:', df.iloc[idx]['label'])
        print ('PRED label:', Y_hat[idx])
        
        print ('\nPRED breakdown:')
        print ('\tINTERCEPT', model.intercept_)
        if X[idx, :].nnz == 0:
            print ('\tFEATURE', '[EMPTY]')
        else:
            sp_row = X[idx, :]
            for i in range(sp_row.getnnz()): # looping over a row in sparse matrix 
                feature_value = sp_row.data[i]
                feature_dim = sp_row.indices[i]
                print ('\tFEATURE', idx2feature[feature_dim], ':', feature_value, '*', model.coef_[0][feature_dim])

In [32]:
# construct a dictionary mapping: feature index -> word
idx2feature = dict([(v,k) for k,v in vectorizer.vocabulary_.items()])

# look at data with prediction error
error_indices  = [i for i in range(len(valid_Y_hat)) if valid_Y_hat[i] != valid_Y[i]]
explain_linear_prediction(valid_dataframe, model, idx2feature, valid_X, valid_Y, valid_Y_hat, np.random.choice(error_indices, size = 1))

indices: [2245]
document: Was not working...: The product did recieve power, but after buying a XD card not an SD card, we found it did not work. If it had worked it would have been a perfect for our child who wanted his first camera.The person we purchased from was excellent though. Shipping was very fast, and he had no problems processing our return. He even gave us a goodwill credit for our troubles.
TRUE label: 0
PRED label: 1

PRED breakdown:
	INTERCEPT [0.09476012]
	FEATURE after : 1 * -0.1540410232075748
	FEATURE an : 1 * 0.024432241729830828
	FEATURE and : 1 * 0.1287635087116256
	FEATURE been : 1 * 0.11693673654496411
	FEATURE but : 1 * -0.058176839160452465
	FEATURE buying : 1 * -0.3735138160237207
	FEATURE card : 2 * 0.27507295795772446
	FEATURE child : 1 * -0.25639585399833775
	FEATURE credit : 1 * 0.13721045885374877
	FEATURE did : 2 * -0.03096315402592158
	FEATURE even : 1 * -0.3052928491885844
	FEATURE excellent : 1 * 1.5769780275912673
	FEATURE fast : 1 * 0.4778014922677