In [3]:
#! Python3
# useful libs
import numpy as np
import operator
import re
import collections
import nltk
from difflib import ndiff
import seaborn as sns
import numpy.linalg as la
import six

# Preprocessing
from gensim.utils import lemmatize

# vectorizers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import normalize

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

# (a) Data Import

In [4]:
# yelp_sentance, yelp_score = [], []
# for line in open('./data/yelp_labelled.txt'):
#     st, sc = line.split("\t")
#     yelp_sentance.append(unicode(st, "utf-8"))
#     yelp_score.append(sc[0:-1])
    
# amazon_sentance, amazon_score = [], []
# for line in open('./data/amazon_cells_labelled.txt'):
#     st, sc = line.split("\t")
#     amazon_sentance.append(st)
#     amazon_score.append(sc[0:-1])

# imdb_sentance, imdb_score = [], []
# for line in open('./data/imdb_labelled.txt'):
#     st, sc = line.split("\t")
#     imdb_sentance.append(st)
#     imdb_score.append(sc[0:-1])

sentances, scores = [], []
for file_name in ['yelp_labelled', 'amazon_cells_labelled', 'imdb_labelled']:    
    for line in open('./data/{}.txt'.format(file_name), 'r', encoding='utf-8'):
        st, sc = line.split("\t")
#         sentances.append(unicode(st, "utf-8"))
        sentances.append(st)
#         sentances.append(str(st.encode()))
        scores.append(sc[0:-1])

TypeError: 'encoding' is an invalid keyword argument for this function

In [6]:
counter = collections.Counter(scores)
# amazon_counter = collections.Counter(amazon_score)
# imdb_counter = collections.Counter(imdb_score)

# print("Yelp count(0): {}, count(1): {}".format(yelp_counter["0"], yelp_counter["1"]))
# print("Amazon count(0): {}, count(1): {}".format(amazon_counter["0"], amazon_counter["1"]))
# print("Imdb count(0): {}, count(1): {}".format(imdb_counter["0"], imdb_counter["1"]))
print("count(0): {}, count(1): {}".format(counter["0"], counter["1"]))

count(0): 1500, count(1): 1500


### Explanation
Yes the labels are balanced. 
By reading each line of the training txt files, we got lists of sentences and scores. 
Using collection.Counter, we get to know the number of each label in the score lists. 


# (b) Preprocessing

In [14]:
# Lowercase + strip punctuation + strip stopwords + lemmatization 
# yelp_np = [re.sub(r'[^\w\s]','',s) for s in yelp_nsw]
sentences_processed = []
for v in sentances:
    _v = lemmatize(v)    
    _v = [t.decode("utf-8").split('/')[0] for t in _v]
    sentences_processed.append(" ".join(_v))
# imdb_processed = [" ".join(lemmatize(v)) for v in imdb_sentance]
# amazon_processed = [" ".join(lemmatize(v)) for v in amazon_sentance]

### Preprocessing explanations
1) We should lowercase all of the words, because capitalized letters with make same words be treated as different ones.

2) We should strip punctuations because they do not contribute to sentiment.

3) Stop words are the most commonly occuring words which are not relevant in the context of the data and do not contribute any deeper meaning to the phrase. In this case contain no sentiment.

4) We should do lemmatization.This process finds the base or dictionary form of the word known as the lemma. This is done through the use of vocabulary (dictionary importance of words) and morphological analysis (word structure and grammar relations). This normalization is similar to stemming but takes into account the context of the word.

# (c) Split Training and Testing Set

In [15]:
# yelp_train_x = yelp_processed[:400]
# yelp_test_x = yelp_processed[400:]
# amazon_train_x = amazon_processed[:400]
# amazon_test_x = amazon_processed[400:]
# imdb_train_x = imdb_processed[:400]
# imdb_test_x = imdb_processed[400:]
train_x = sentences_processed[:400] + sentences_processed[500:900] + sentences_processed[1000:1400]
train_y = scores[:400] + scores[500:900] + scores[1000:1400]
test_x = sentences_processed[400:500] + sentences_processed[900:1000] + sentences_processed[1400:1500]
test_y = scores[400:500] + scores[900:1000] + scores[1400:1500]

## (d) Bag of Words
Why should we vectorize training set first and then go through testing set?<br/>
1) Here we should vectorize the training set standalone because testing set could contain words that are not contained in training set. <br/>
2) We will vectorize testing set based on the feature vector generated by training set. 

In [16]:
train_vectorizer = CountVectorizer()
# d.1. build a dictionary of unique words for training set
train_x_bag = train_vectorizer.fit_transform(train_x).todense()
test_vectorizer = CountVectorizer(vocabulary=train_vectorizer.get_feature_names())
test_x_bag = test_vectorizer.fit_transform(test_x).todense()
# d.2. Report feature vectors of 2 reviews
print(train_x[10])
print(train_x_bag[10])
print(train_x[0])
print(train_x_bag[0])

service be very prompt
[[0 0 0 ... 0 0 0]]
love place
[[0 0 0 ... 0 0 0]]


## (e) Postprocessing strategy
We choose L2 normalization as post-processing method, because:<br/>
1) L2 presents the inner product of a vector on itself, representing the length of the vector<br/>
2) The similarity between 2 vectors are calculated by their inner product, which is the format of L2<br/>
3) So L2 would be an ideal way to constrain the value range of each feature into (0, 1)

In [17]:
# post-processing
train_x_bag_normal = normalize(train_x_bag)
test_x_bag_normal = normalize(test_x_bag)

## (f) Sentiment prediction

In [18]:
def sentiment_prediction(Train_X, Train_Y, Test_X, Test_Y):
    # f.1 Logistic regression
    lr_clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(Train_X, Train_Y)
    lr_clf_score = lr_clf.score(Test_X, Test_Y)
    print("Logistic regression accuracy: {}".format(lr_clf_score))
    
    # f.2 Naive Bayes classifier
    # Gaussian
    gaussian_nb = GaussianNB()
    gaussian_nb.fit(Train_X, Train_Y)
    gaussian_nb_score = gaussian_nb.score(Test_X, Test_Y)
    print("Accuracy of Naive Bayes Classifier with Gaussian prior: {}".format(gaussian_nb_score))

    # Bernoulli
    b_nb = BernoulliNB()
    b_nb.fit(Train_X, Train_Y)
    b_nb_score = b_nb.score(Test_X, Test_Y)
    print("Accuracy of Naive Bayes Classifier with Bernoulli prior: {}".format(b_nb_score))
    
    return lr_clf_score, gaussian_nb_score, b_nb_score

In [19]:
sentiment_prediction(train_x_bag_normal, train_y, test_x_bag_normal, test_y)

Logistic regression accuracy: 0.78
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.7233333333333334
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.7233333333333334


(0.78, 0.7233333333333334, 0.7233333333333334)

### Comparison of classifiers:
Logistic regression model is slightly better than Naive Bayes classifiers. 

### Words playing the most important roles

In [20]:
vocabulary = train_vectorizer.vocabulary_
sorted_vocabulary = sorted(vocabulary.items(), key=operator.itemgetter(1), reverse=True)
print("The top 10 most important words: ")
for word in sorted_vocabulary[:10]:
    print("count({}) = {}".format(word[0], word[1]))

The top 10 most important words: 
count(yummy) = 1851
count(yum) = 1850
count(yukon) = 1849
count(yucky) = 1848
count(yet) = 1847
count(yelper) = 1846
count(yellowtail) = 1845
count(yellow) = 1844
count(year) = 1843
count(yama) = 1842


## (g) N-gram model

In [21]:
# Vectorize with 2-gram model
train_vectorizer_2gram = CountVectorizer(ngram_range=(2, 2))
# build a dictionary of unique words for training set
train_x_2gram = train_vectorizer_2gram.fit_transform(train_x).todense()
test_vectorizer_2gram = CountVectorizer(ngram_range=(2, 2), vocabulary=train_vectorizer_2gram.get_feature_names())
test_x_2gram = test_vectorizer_2gram.fit_transform(test_x).todense()
# Report feature vectors of 2 reviews
print(train_x[10])
print(train_x_2gram[10])
print(train_x[0])
print(train_x_2gram[0])

# post-processing
train_x_2gram_normal = normalize(train_x_2gram)
test_x_2gram_normal = normalize(test_x_2gram)

sentiment_prediction(train_x_2gram_normal, train_y, test_x_2gram_normal, test_y)
# # Logistic regression
# lr_clf_2gram = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(train_x_2gram_normal, train_y)
# lr_clf_2gram_score = lr_clf_2gram.score(test_x_2gram_normal, test_y)
# print("Logistic regression accuracy: {}".format(lr_clf_2gram_score))

# # Naive Bayes classifier
# # Gaussian
# gaussian_nb_2gram = GaussianNB()
# gaussian_nb_2gram.fit(train_x_2gram_normal, train_y)
# gaussian_nb_2gram_score = gaussian_nb_2gram.score(test_x_2gram_normal, test_y)
# print("Accuracy of Naive Bayes Classifier with Gaussian prior: {}".format(gaussian_nb_2gram_score))

# # Bernoulli
# b_nb_2gram = BernoulliNB()
# b_nb_2gram.fit(train_x_2gram_normal, train_y)
# b_nb_2gram_score = b_nb_2gram.score(test_x_2gram_normal, test_y)
# print("Accuracy of Naive Bayes Classifier with Bernoulli prior: {}".format(b_nb_2gram_score))

# Most important 2-gram words
vocabulary_2gram = train_vectorizer_2gram.vocabulary_
sorted_vocabulary_2gram = sorted(vocabulary_2gram.items(), key=operator.itemgetter(1), reverse=True)

important_words = []
for word in sorted_vocabulary_2gram[:10]:
    important_words.append(word[0])
    print("count({}) = {}".format(word[0], word[1]))

print("The top 10 most important 2-gram words: ")
print(important_words)

service be very prompt
[[0 0 0 ... 0 0 0]]
love place
[[0 0 0 ... 0 0 0]]
Logistic regression accuracy: 0.5833333333333334
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.7233333333333334
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.45
count(yummy tummy) = 5177
count(yummy have) = 5176
count(yum yum) = 5175
count(yum sauce) = 5174
count(yukon gold) = 5173
count(yet say) = 5172
count(yet run) = 5171
count(yet delicious) = 5170
count(yelper husband) = 5169
count(yellowtail carpaccio) = 5168
The top 10 most important 2-gram words: 
['yummy tummy', 'yummy have', 'yum yum', 'yum sauce', 'yukon gold', 'yet say', 'yet run', 'yet delicious', 'yelper husband', 'yellowtail carpaccio']


## (h) PCA for bag of words model

In [73]:
#Use SVD to peform PCA
p,n = np.shape(train_x_bag_normal)
cov_Mat = np.dot(train_x_bag_normal.T, train_x_bag_normal)/(p-1)
u, s, vh = np.linalg.svd(cov_Mat, full_matrices=True)

In [79]:
train_x_10 = np.dot(train_x_bag_normal, u[:,:10])
test_y_10 = np.dot(test_x_bag_normal, u[:,:10])
sentiment_prediction(train_x_10, train_y, test_x_10, test_y)

Logistic regression accuracy: 0.5633333333333334
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5633333333333334
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.39


(0.5633333333333334, 0.5633333333333334, 0.39)

In [80]:
train_x_50 = np.dot(train_x_bag_normal, u[:,:50])
test_y_50 = np.dot(test_x_bag_normal, u[:,:50])
sentiment_prediction(train_x_50, train_y, test_x_50, test_y)

Logistic regression accuracy: 0.5
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5333333333333333
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.43


(0.5, 0.5333333333333333, 0.43)

In [81]:
train_x_100 = np.dot(train_x_bag_normal, u[:,:100])
test_y_100 = np.dot(test_x_bag_normal, u[:,:100])
sentiment_prediction(train_x_100, train_y, test_x_100, test_y)

Logistic regression accuracy: 0.5033333333333333
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.44


(0.5033333333333333, 0.5, 0.44)

In [58]:
def meanX(dataX):
    return np.mean(dataX, axis=0)
def pca(XMat, k):
    average = meanX(XMat) 
    m, n = np.shape(XMat)
    data_adjust = []
    avgs = np.tile(average, (m, 1))
    data_adjust = XMat - avgs
    covX = np.cov(data_adjust.T)
    featValue, featVec=  np.linalg.eig(covX)
    index = np.argsort(-featValue)
    finalData = []
    if k > n:
        print("k must lower than feature number")
        return
    else:
        selectVec = np.matrix(featVec.T[index[:k]])
        finalData = data_adjust * selectVec.T 
        reconData = (finalData * selectVec) + average  
        finalData = finalData.astype('float64')
    return finalData, reconData

In [27]:
# Use pca of sklearn first, then switch back to our own PCA function
# from sklearn.decomposition import PCA
# pca_10 = PCA(n_components=10)
# train_x_10 = pca_10.fit_transform(train_x_bag_normal)
# test_x_10 = pca_10.fit_transform(test_x_bag_normal)

train_x_10, _recon_train = pca(train_x_bag_normal, 10)
test_x_10, _recon_test = pca(test_x_bag_normal, 10)
sentiment_prediction(train_x_10, train_y, test_x_10, test_y)



Logistic regression accuracy: 0.4666666666666667
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.42333333333333334


(0.4666666666666667, 0.5, 0.42333333333333334)

In [28]:
# PCA with 50 components
# pca_50 = PCA(n_components=50)
# train_x_50 = pca_50.fit_transform(train_x_bag_normal)
# test_x_50 = pca_50.fit_transform(test_x_bag_normal)
train_x_50, _recon_train = pca(train_x_bag_normal, 50)
test_x_50, _recon_test = pca(test_x_bag_normal, 50)
sentiment_prediction(train_x_50, train_y, test_x_50, test_y)



Logistic regression accuracy: 0.43666666666666665
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.47
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.44666666666666666


(0.43666666666666665, 0.47, 0.44666666666666666)

In [29]:
# PCA with 100 components
# pca_100 = PCA(n_components=100)
# train_x_100 = pca_100.fit_transform(train_x_bag_normal)
# test_x_100 = pca_100.fit_transform(test_x_bag_normal)
train_x_100, _recon_train = pca(train_x_bag_normal, 100)
test_x_100, _recon_test = pca(test_x_bag_normal, 100)
sentiment_prediction(train_x_100, train_y, test_x_100, test_y)



Logistic regression accuracy: 0.44666666666666666
Accuracy of Naive Bayes Classifier with Gaussian prior: 0.5266666666666666
Accuracy of Naive Bayes Classifier with Bernoulli prior: 0.43666666666666665


(0.44666666666666666, 0.5266666666666666, 0.43666666666666665)

## (i) Algorithm comparison and analysis
1) Bag of words using logistic regression performs best. The accuracy is 0.78. It might be because bag of words reserved all features of words and single word could represent features better. <br/>
2) The words such as "yummy", "delicious" plays important role in representing sentiment users' attitudes in comments. <br/>
3) The reason that PCA did not work well for this dataset might be because:
  * The word features are relatively evenly distributed in all features, which means the directions with highest variance cannot represent most information of the original dataset. So reducing dimensions will lose considerable part of original information. 
  * Originally the dataset has around 2000 features. Reducing them to ~100 features reduced too much information.
  

----------------------------

#### Test code

In [105]:
#lowercase
yelp_sentance = [x.lower() for x in yelp_sentance]
amazon_sentance = [x.lower() for x in amazon_sentance]
imdb_sentance = [x.lower() for x in imdb_sentance]

In [107]:
# Strip punctuation
yelp_word_vectors = [re.compile('\w+').findall(x) for x in yelp_sentance]
amazon_word_vectors = [re.compile('\w+').findall(x) for x in amazon_sentance]
imdb_word_vectors = [re.compile('\w+').findall(x) for x in imdb_sentance]

In [109]:
#strip the stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
# stop_words.remove('not')
yelp_no_stopwords = [list(set(vector) - stop_words) for vector in yelp_word_vectors]
amazon_no_stopwords = [list(set(vector) - stop_words) for vector in amazon_word_vectors]
imdb_no_stopwords = [list(set(vector) - stop_words) for vector in imdb_word_vectors]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/wrymax/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
