# Sentiment analysis of online reviews

##a Parse dataset

In [72]:
import numpy as np
import pandas as pd

def read_data(file_path):
    res = []
    with open(file_path, 'r') as f:
        for line in f.readlines():
            now = line.strip().split('\t')
            if len(now) == 2:
                res.append({'sentence' : now[0], 'label' : int(now[1])})
                 
    return pd.DataFrame(res)
    
amazon_data = read_data('./sentiment labelled sentences/amazon_cells_labelled.txt')
imdb_data = read_data('./sentiment labelled sentences/imdb_labelled.txt')
yelp_data = read_data('./sentiment labelled sentences/yelp_labelled.txt')

In [2]:
print amazon_data.head()
def label_info(data):
    l_0, l_1 = len(data[data.label == 0]), len(data[data.label == 1])
    print "label 0 count: " + str(l_0)
    print "label 1 count: " + str(l_1)
    print "ratio: " + str((l_0 * 1.0 / l_1))
    
label_info(amazon_data)

   label                                           sentence
0      0  So there is no way for me to plug it in here i...
1      1                        Good case, Excellent value.
2      1                             Great for the jawbone.
3      0  Tied to charger for conversations lasting more...
4      1                                  The mic is great.
label 0 count: 500
label 1 count: 500
ratio: 1.0


In [3]:
label_info(imdb_data)

label 0 count: 500
label 1 count: 500
ratio: 1.0


In [4]:
label_info(yelp_data)

label 0 count: 500
label 1 count: 500
ratio: 1.0


So all three datasets' labels are well balanced.

## b Data Preprocessing

In [5]:
print amazon_data['sentence'].head()

0    So there is no way for me to plug it in here i...
1                          Good case, Excellent value.
2                               Great for the jawbone.
3    Tied to charger for conversations lasting more...
4                                    The mic is great.
Name: sentence, dtype: object


The sentences contain both uppercase letters and lowercase letter, we need to convert uppcase letters to lowercase because 'Good' has the same meaning as 'good'.

In [6]:
def lowercase(sentence):
    return sentence.lower()

Removing punctuation also helps denoising data.

In [7]:
import re
punct = re.compile(r'([^A-Za-z0-9 ])')
def remove_punc(sentence):
    return punct.sub("", sentence)


Stemming and lemmatization is also necessary, because 'has' and 'have' have same meaning. Here I am using nltk's Porter stemmer to do stem.
Removing stop words help us better concentrate on features that 'matters', since words like "and", "or" make no contribution to classify a sentence.
P.S YOU should download corpus by running `nltk.download()` first.

In [8]:
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
#nltk.download()
    

stop = stopwords.words('english')
stemmer = PorterStemmer()
def stemming_and_remove_stop(sentence):
    return ' '.join([stemmer.stem(word) for word in sentence.split() if word not in stop])


Now make all the functions together.

In [9]:
def preprocess(data):
    strs = list(data['sentence'])
    def preprocess_one_line(sent):
        return stemming_and_remove_stop(remove_punc(lowercase(sent)))
    return [preprocess_one_line(line) for line in strs]

amazon_data['sentence'] = preprocess(amazon_data)
yelp_data['sentence'] = preprocess(yelp_data)
imdb_data['sentence'] = preprocess(imdb_data)

## c Split training and testing data set

In [10]:
def extract_train(data):
    data_label_1, data_label_0 = data[data.label == 1], data[data.label == 0]
    
    train_dt_1, test_dt_1 = data_label_1[:400], data_label_1[400:]
    train_dt_0, test_dt_0 = data_label_0[:400], data_label_0[400:]
    return train_dt_1.append(train_dt_0, ignore_index = True), test_dt_1.append(test_dt_0, ignore_index = True)

amz_train, amz_test = extract_train(amazon_data)
yp_train, yp_test = extract_train(yelp_data)
imdb_train, imdb_test = extract_train(imdb_data)

train = amz_train.append(yp_train, ignore_index=True).append(imdb_train, ignore_index=True)
test = amz_test.append(yp_test, ignore_index=True).append(imdb_test, ignore_index=True)
print len(train), len(test)

2400 600


## d Bag of Words model

In [11]:
word_dict, word_list = {}, []
#print train['sentence']

def extract_word_pos(data):
    pos = 0
    for sentence in data['sentence']:
        for word in sentence.split(' '):
            if not word in word_dict:
                word_dict[word] = pos
                pos += 1
                word_list.append(word)

extract_word_pos(train)

The reason why we can't use testing set is that we can not use any information from testing set as it is served as a standalone testing criteria. If we are using word from testing set, then we might overfit the model.

In [12]:
def build_feature(data):
    rows, cols = len(data['sentence']), len(word_dict)
    res = np.zeros((rows,cols), dtype = np.int)
    
    for index, sentence in enumerate(data['sentence']):
        for word in sentence.split(' '):
            if word in word_dict:
                res[index][word_dict[word]] += 1
    
    return res

In [13]:
train_feat = build_feature(train)
test_feat = build_feature(test)
train_label, test_label = train['label'], test['label']

In [14]:
np.set_printoptions(threshold='nan', linewidth='100')
print train_feat[0]

[1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [15]:
print train_feat[1]

[0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

From the printing we can see, the feature vectors are very sparse, only a few entries have positive values while most of the entries are filled with 0.

## e Postprocessing strategy

In [16]:
from sklearn import preprocessing

def normalize(data, norm = 'l2'):
    return preprocessing.normalize(data.astype(np.float64), norm = norm)

train_feat = normalize(train_feat)
test_feat = normalize(test_feat)


Log-normalization is useful when converting large data range to small data range while here data's range is quite small, so log-norm is not suitable. Standardizing data is also not useful because the underlying assumption of standardization is the data has a gaussian distribution, which is clearly not true for short texts. L1 normalization only keep the information of relative proportion, for example, after l1 norm, (10, 5) and (2,1) both become (2/3, 1/3), so we lose the information on absolute numbers, which could be useful.

## f Training set clustering

In [17]:
def find_k_centroids(seeds, features, K = 2):
    def isEnd(prev_centers, now_centers):
        for idx in range(len(prev_centers)):
            prev_center, now_center = prev_centers[idx], now_centers[idx]
            if not np.allclose(prev_center, now_center,rtol=1e-4,atol=1e-6):
                return False
        return True
    
    center_matrices = assign(seeds, features)
    
    #print center_matrices[0].shape
    #print center_matrices[1].shape
    prev_centers, now_centers = seeds, [np.mean(matrix, axis=0) for matrix in center_matrices]
    
    time = 1
    while not isEnd(prev_centers, now_centers):
        print "iter " + str(time)
        time += 1
        center_matrices = assign(now_centers, features)
        prev_centers = now_centers
        now_centers = [matrix.mean(0) for matrix in center_matrices]
        
    labels = np.zeros((features.shape[0],1), dtype=np.int)
    
    for index, point in enumerate(features):
        idx = find_closest_center(now_centers, point)
        labels[index, 0] = idx
        
    return now_centers, labels
        
def assign(centers, points):
    
    res = [None for _ in range(len(centers))]
    
    for point in points:
        idx = find_closest_center(centers, point)
        if res[idx] is None:
            res[idx] = point
        else:
            res[idx] = np.vstack((res[idx], point))
            
    return res

def find_closest_center(centers, arr):
    res_index, dist = -1, float('inf')
    
    for index,center in enumerate(centers):
        now_dist = l2_distance(center, arr)
        if now_dist < dist:
            res_index, dist = index, now_dist
            
    return res_index

def l2_distance(a, b):
    return np.linalg.norm(a - b)

In [18]:
from sklearn.cluster import k_means
np.random.seed(50)

i1,i2 = np.random.choice(train_feat.shape[0],2, replace=False)
start_points = train_feat[[i1,i2]]
centers, labels = find_k_centroids(start_points, train_feat)

iter 1
iter 2
iter 3
iter 4
iter 5
iter 6
iter 7
iter 8
iter 9


In [20]:
np.set_printoptions(precision=2)
print centers[0]
#k_means(test_pointsz, 2)

[  3.42e-02   4.52e-03   8.76e-03   1.58e-03   1.09e-03   4.57e-04   2.57e-04   9.44e-04   2.57e-04   2.52e-03   6.52e-03   9.70e-03   3.58e-03   1.37e-02   1.40e-03   7.13e-03   2.98e-04   2.78e-03   3.64e-03   1.06e-02   1.66e-02   7.28e-04   3.06e-04   2.24e-02   2.82e-03   1.59e-02   1.73e-03   1.18e-02   1.82e-04   2.87e-04   3.06e-03   1.43e-02   8.61e-04   1.39e-03   4.16e-03   3.19e-03   1.60e-03   1.92e-03   1.19e-03   3.17e-03   1.80e-03   2.46e-03   4.11e-04   2.97e-04   3.09e-04   7.84e-03   5.53e-04   3.18e-04   1.70e-03   4.49e-03   1.15e-02   3.42e-04   7.75e-03   1.68e-04   3.36e-03   1.72e-02   1.68e-04   4.19e-03   3.03e-04   2.67e-03   2.41e-03   1.31e-02   1.19e-04   9.60e-04   7.35e-03   1.99e-03   7.02e-03   1.19e-04   1.33e-03   2.63e-03   0.00e+00   2.55e-03   0.00e+00   0.00e+00   7.41e-03   1.84e-03   1.41e-03   7.54e-03   7.24e-03   4.72e-03   2.51e-03   1.81e-03   5.36e-03   1.67e-03   6.97e-03   9.37e-04   1.99e-04   1.34e-04   6.81e-04   2.24e-03   2.01e-0

In [21]:
print centers[1]

[ 0.02  0.01  0.    0.01  0.51  0.    0.    0.    0.    0.    0.01  0.02  0.    0.    0.    0.01  0.    0.    0.01  0.01  0.    0.    0.    0.06  0.    0.08  0.    0.01  0.    0.    0.    0.01  0.    0.    0.01  0.    0.    0.01  0.    0.    0.    0.    0.    0.    0.    0.01  0.    0.    0.    0.    0.    0.    0.02  0.    0.01  0.    0.    0.    0.    0.    0.01  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.01  0.    0.    0.01  0.03  0.    0.    0.    0.    0.    0.01  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.01  0.    0.    0.    0.01  0.    0.    0.    0.    0.    0.01  0.    0.    0.    0.    0.01  0.01  0.    0.    0.01  0.    0.    0.    0.    0.01  0.    0.    0.    0.01  0.01  0.    0.    0.    0.    0.    0.01  0.    0.    0.    0.01  0.    0.01  0.    0.    0.    0.    0.01  0.01  0.    0.    0.    0.    0.    0.    0.01  0.    0.    0.    0.    0.    0.    0.    0.    0.    0.    0.

In [22]:
from sklearn.metrics import accuracy_score

accuracy_score(train_label, labels)

0.5591666666666667

The classification accuracy is 0.56, only slightly better than random guess(0.5).

## g Sentiment prediction

In [23]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(train_feat, train_label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [24]:
model.score(test_feat, test_label)

0.80666666666666664

The logistic regression classification accuracy is 0.807.

In [25]:
from sklearn.metrics import confusion_matrix
confusion_matrix(test_label, model.predict(test_feat))

array([[258,  42],
       [ 74, 226]])

Confusion matrix is shown above.

In [26]:
coef = model.coef_
for pos in coef.argsort()[0,-10:]:
    print word_list[pos]

beauti
fantast
best
good
amaz
delici
nice
excel
love
great


From the printing result we can see, strong positive emotional words play a great role in determining labels.

## h N-gram model

### build feature vectors

In [27]:
n_gram_dict, n_gram_list = {}, []

def n_gram_feature(data, n = 2):
    pos = 0
    
    for sentence in data['sentence']:
        tokens = sentence.split(' ')
        for index in range(len(tokens) - n + 1):
            now_gram = ' '.join(tokens[index : (index + n)])
            if not now_gram in n_gram_dict:
                n_gram_dict[now_gram] = pos
                n_gram_list.append(now_gram)
                pos += 1

n_gram_feature(train)

In [28]:
def build_n_gram_feature(data, n = 2):
    rows, cols = len(data['sentence']), len(n_gram_dict)
    res = np.zeros((rows,cols), dtype = np.int)
    
    for index, sentence in enumerate(data['sentence']):
        tokens = sentence.split(' ')
        for token_idx in range(len(tokens) - n + 1):
            now_gram = ' '.join(tokens[token_idx : (token_idx + n)])
            if now_gram in n_gram_dict:
                res[index][n_gram_dict[now_gram]] += 1
    
    return res
    

In [29]:
train_2_gram_feat = build_n_gram_feature(train)
test_2_gram_feat = build_n_gram_feature(test)

### Postprocessing data

In [30]:
train_2_gram_feat = normalize(train_2_gram_feat)
test_2_gram_feat = normalize(test_2_gram_feat)

### training set clustering

In [31]:
i1,i2 = np.random.choice(train_2_gram_feat.shape[0],2, replace=False)
start_points = train_2_gram_feat[[i1,i2]]
centers, labels = find_k_centroids(start_points, train_2_gram_feat)

iter 1


In [33]:
accuracy_score(train_label, labels)

0.50291666666666668

The clustering accuracy is bad, almost same like random guess.

### Sentiment prediction

In [34]:
model = LogisticRegression()
model.fit(train_2_gram_feat, train_label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [35]:
model.score(test_2_gram_feat, test_label)

0.64166666666666672

The logistic regression classification accuracy based on 2-gram is 0.642. Not as good as 1-gram, possible reason could be 2-gram is more sparse than 1-gram thus lack generalization.

In [36]:
confusion_matrix(test_label, model.predict(test_2_gram_feat))

array([[266,  34],
       [181, 119]])

In [37]:
coef = model.coef_
for pos in coef.argsort()[0,-10:]:
    print n_gram_list[pos]

good price
realli good
great food
easi use
food good
great product
great phone
one best
highli recommend
work great


From the printing result we can see, 2-grams having strong positive emotional word can be a good feature.

## PCA for bag of words model

In [38]:
def pca(data, n = 10):
    U, s, V = np.linalg.svd(data, full_matrices=True)
    delta = np.diag(s[:n])

    return np.dot(U[:,:n],delta)

In [39]:
test11 = np.array([[1.9,2.5,3.3], 
                   [2.2,3.1,4.8], 
                   [2.1,0.1,5.4]])
#train_pca_10_feat = pca(train_feat, 10)

In [40]:
def center(data):
    mean = np.mean(data, axis=0)
    return data - mean


In [56]:
train_pca_10_feat = pca(train_feat, 10)
test_pca_10_feat = pca(test_feat, 10)

In [57]:
def find_start_points(feat, n = 2):
    i1,i2 = np.random.choice(feat.shape[0],2, replace=False)
    start_points = feat[[i1,i2]]

    return start_points

start_points = find_start_points(train_pca_10_feat)
centers, labels = find_k_centroids(start_points, train_pca_10_feat)

iter 1
iter 2
iter 3
iter 4
iter 5
iter 6
iter 7
iter 8
iter 9
iter 10
iter 11
iter 12
iter 13
iter 14
iter 15
iter 16
iter 17
iter 18
iter 19


In [58]:
print centers

[array([ 0.09, -0.05, -0.07, -0.04,  0.2 , -0.06,  0.01, -0.03, -0.12,  0.09]), array([ 0.08, -0.03, -0.05,  0.01,  0.  , -0.01, -0.  , -0.  ,  0.02, -0.01])]


In [59]:
accuracy_score(train_label, labels)

0.52541666666666664

The k-means result on data reducing  dimensionality to 10 only slightly better than random guess. 

In [60]:
model = LogisticRegression()
model.fit(train_pca_10_feat, train_label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

In [61]:
model.score(test_pca_10_feat, test_label)

0.54166666666666663

In [62]:
train_pca_50_feat = pca(train_feat, 50)
test_pca_50_feat = pca(test_feat, 50)

In [63]:
start_points = find_start_points(train_pca_50_feat)
centers, labels = find_k_centroids(start_points, train_pca_50_feat)

iter 1
iter 2
iter 3
iter 4


In [64]:
print centers

[array([  4.65e-01,   2.12e-01,   8.69e-02,  -8.35e-02,  -2.55e-02,   1.77e-02,  -3.04e-02,  -2.47e-02,  -1.22e-02,   9.18e-03,  -8.76e-03,  -2.28e-03,   2.77e-03,  -7.11e-03,  -8.07e-03,  -9.17e-03,   4.17e-03,  -3.08e-03,   8.32e-04,  -1.30e-03,  -3.99e-03,  -1.40e-03,   4.30e-03,  -8.90e-03,  -1.66e-03,   5.20e-04,  -4.12e-04,   6.97e-04,  -2.09e-03,  -4.76e-04,  -7.15e-04,   2.16e-03,  -1.05e-03,   2.82e-03,   2.12e-03,   9.42e-04,   2.74e-03,   1.93e-03,  -1.51e-03,   1.32e-03,  -5.55e-04,   5.10e-04,   2.01e-03,   1.21e-03,   1.04e-03,   9.58e-04,   1.07e-03,  -1.05e-03,   1.57e-03,   1.08e-03]), array([  5.75e-02,  -4.48e-02,  -6.14e-02,   1.13e-02,   2.47e-02,  -2.15e-02,  -6.72e-04,  -2.12e-03,   4.88e-03,   2.14e-03,   1.61e-03,  -5.62e-03,   1.89e-03,   2.21e-03,   1.01e-03,  -7.19e-03,  -5.31e-03,   1.10e-02,   9.49e-04,  -5.20e-04,  -4.75e-03,   2.26e-03,  -4.77e-03,  -3.13e-03,   3.49e-03,  -6.55e-03,   9.33e-03,   4.13e-03,  -6.75e-04,  -1.21e-03,   3.94e-05,  -1.26e-03,

In [65]:
accuracy_score(train_label, labels)

0.44124999999999998

The accurancy is 0.56(1 - 0.44).

In [66]:
model = LogisticRegression()
model.fit(train_pca_50_feat, train_label)
model.score(test_pca_50_feat, test_label)

0.52666666666666662

In [68]:
train_pca_100_feat = pca(train_feat, 100)
test_pca_100_feat = pca(test_feat, 100)
start_points = find_start_points(train_pca_100_feat)
centers, labels = find_k_centroids(start_points, train_pca_100_feat)

iter 1


In [69]:
print centers

[array([  2.47e-01,   7.28e-02,  -5.90e-02,   1.06e-01,  -7.06e-02,  -1.55e-01,   3.03e-01,   8.47e-02,   3.10e-02,  -3.29e-02,   7.00e-03,   2.19e-02,  -1.61e-03,   3.42e-02,   8.29e-03,   5.78e-02,   5.63e-03,  -2.36e-02,   9.74e-03,   4.76e-04,   3.80e-02,  -2.05e-02,  -3.40e-02,   8.93e-03,   1.25e-02,   9.39e-04,   1.91e-03,  -3.18e-03,   2.29e-03,   1.98e-03,  -2.10e-03,  -1.25e-04,  -1.16e-03,  -5.07e-05,  -8.95e-03,   8.17e-04,  -1.41e-03,   3.13e-04,  -7.16e-04,   1.66e-03,  -2.31e-03,  -2.43e-03,   5.13e-05,   1.91e-03,  -2.11e-03,   2.10e-04,  -2.68e-03,  -9.99e-04,  -2.68e-03,   5.46e-04,  -1.65e-03,   1.01e-03,  -2.84e-03,  -3.33e-04,  -1.15e-04,   6.60e-04,   6.45e-04,   2.57e-03,  -2.36e-04,   9.60e-04,  -4.16e-04,   1.00e-03,  -2.09e-03,  -7.83e-04,   1.35e-03,  -1.63e-03,  -4.84e-04,   9.65e-04,  -6.26e-04,  -5.52e-04,   3.47e-04,   9.44e-04,  -1.33e-03,  -2.77e-03,  -6.34e-05,   1.68e-03,  -1.29e-03,  -8.26e-04,  -2.43e-04,   1.48e-03,  -1.94e-03,  -7.28e-04,  -2.47e-

In [71]:
sc = accuracy_score(train_label, labels)
print max(sc, 1 - sc)

0.51


## j Algorithms comparison and analysis

Bag of words representation features combined with using logistic regression achieves best performance. K-means result is around 50% whether using bag of words representation, 2-gram or pca for bag-of-words. The reason why pca for bag-of-words combined with logistic regression is not as good as raw bag-of-words features could be that pca requires centerized data, but the absolute number of each word could be the good indicator; and also, maybe 10,50,100 dimensions is relatively small compared to 3673 original dimensions.