In [None]:
import os

import numpy as np
import pandas as pd

from gensim.models.word2vec import Word2Vec, KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import EnglishStemmer
from bs4 import BeautifulSoup

In [None]:
# Length of each word vector; get_vectors() allocates accumulators of this size,
# so it has to match the dimensionality of the embeddings that get loaded.
MODEL_DIM = 300

# Snowball stemmer for English, shared by stem_analyzer().
stemmer = EnglishStemmer()

# Default scikit-learn analyzers (callables turning a document into tokens),
# built from vectorizers with all-default settings.
tfidf_analyzer = TfidfVectorizer().build_analyzer()
count_analyzer = CountVectorizer().build_analyzer()

def preprocessor(review):
    """Return the text content of `review` with its HTML markup removed.

    The review is parsed with BeautifulSoup using the 'html5lib' parser and
    only the text of the resulting document is returned.
    """
    soup = BeautifulSoup(review, 'html5lib')
    return soup.get_text()

def stem_analyzer(doc, analyzer=None):
    """Tokenize `doc` and yield the stem of every token.

    Parameters
    ----------
    doc : str
        Raw document text.
    analyzer : callable, optional
        Function mapping a document to an iterable of tokens. Defaults to the
        module-level `count_analyzer`. (The function used to call a global
        named `analyzer` that is not defined anywhere in this notebook, which
        raised NameError on every call.)

    Returns
    -------
    generator of str
        Stemmed tokens, in the order the analyzer produced them.
    """
    if analyzer is None:
        analyzer = count_analyzer
    return (stemmer.stem(w) for w in analyzer(doc))

def get_vectors(reviews, vocabulary, embeddings=None, dim=None):
    """Turn per-review term weights into one embedding vector per review.

    For every review, the vectors of the vocabulary words that have a non-zero
    weight AND are known to the embedding model are multiplied by their weight
    and summed; the sum is then divided by the number of words that
    contributed (not by the sum of the weights).

    Parameters
    ----------
    reviews : iterable of 1-D sequences
        One row of term weights per review; `review[i]` is the weight of
        `vocabulary[i]`.
    vocabulary : sequence of str
        Words, index-aligned with the columns of each review row.
    embeddings : mapping-like, optional
        Object supporting `word in embeddings` and `embeddings[word]`
        (e.g. gensim KeyedVectors). Defaults to the module-level `model`.
    dim : int, optional
        Length of the word vectors. Defaults to the module-level `MODEL_DIM`.

    Returns
    -------
    list of numpy.ndarray
        One vector of length `dim` per review. A review with no usable word
        yields an all-zero vector.
    """
    if embeddings is None:
        embeddings = model
    if dim is None:
        dim = MODEL_DIM

    X = []
    for review in reviews:
        weights = np.asarray(review)
        review_vector = np.zeros(dim)
        num = 0

        # Only visit the columns that actually carry a weight; rows are
        # typically almost entirely zero.
        for i in np.flatnonzero(weights):
            word = vocabulary[i]
            # `in` replaces the Python-2-only `model.vocab.has_key(word)` and
            # does not rely on the `.vocab` attribute.
            if word not in embeddings:
                continue

            review_vector += np.asarray(embeddings[word]) * weights[i]
            num += 1

        if num > 0:
            review_vector = review_vector / num

        X.append(review_vector)
    return X

In [None]:
# Word embeddings in binary word2vec format, read from the working directory.
model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

# Both TSV files are read the same way: first row is the header, columns are
# tab-separated, and quoting=3 (csv.QUOTE_NONE) disables quote handling.
tsv_options = dict(header=0, delimiter="\t", quoting=3)
train = pd.read_csv("labeledTrainData.tsv", **tsv_options)
test = pd.read_csv("testData.tsv", **tsv_options)

In [None]:
train_reviews = train.review.tolist()
test_reviews = test.review.tolist()

# Strip the HTML once, up front. Previously the vectorizers were given a
# callable `analyzer=` together with `preprocessor=` and `stop_words=`;
# scikit-learn ignores the latter two when the analyzer is a callable, so the
# markup was never removed and stop words were never filtered.
train_reviews_clean = [preprocessor(r) for r in train_reviews]
test_reviews_clean = [preprocessor(r) for r in test_reviews]

# Built-in word analyzer (lowercasing + default tokenization) so that the
# English stop-word list is actually applied.
count_v = CountVectorizer(stop_words='english', max_features=5000)
tfidf_v = TfidfVectorizer(stop_words='english', max_features=5000)

# fit_transform learns the vocabulary and vectorizes the training reviews in
# a single pass instead of analysing every training review twice.
train_reviews_count_trans = count_v.fit_transform(train_reviews_clean).toarray()
train_reviews_tfidf_trans = tfidf_v.fit_transform(train_reviews_clean).toarray()

test_reviews_count_trans = count_v.transform(test_reviews_clean).toarray()
test_reviews_tfidf_trans = tfidf_v.transform(test_reviews_clean).toarray()


def _feature_names(vectorizer):
    """Return the fitted vocabulary as a list, ordered by column index.

    `get_feature_names()` was removed in scikit-learn 1.2 in favour of
    `get_feature_names_out()`; use whichever the installed version provides.
    """
    getter = getattr(vectorizer, 'get_feature_names_out', None)
    if getter is None:
        getter = vectorizer.get_feature_names
    return list(getter())


count_vocabulary = _feature_names(count_v)
tfidf_vocabulary = _feature_names(tfidf_v)

# Weighted word-vector representation of every review, once per weighting scheme.
X_all_count = get_vectors(train_reviews_count_trans, count_vocabulary)
X_all_tfidf = get_vectors(train_reviews_tfidf_trans, tfidf_vocabulary)

X_test_count = get_vectors(test_reviews_count_trans, count_vocabulary)
X_test_tfidf = get_vectors(test_reviews_tfidf_trans, tfidf_vocabulary)

y_all = train.sentiment

In [None]:
# On-disk caches of the review vectors computed above. They are written only
# when missing, so a fresh "Restart & Run All" no longer fails at np.load()
# because the save step was commented out; an existing cache is left untouched.
COUNT_FEATURES_PATH = 'word2vec_count_vectorized_data.npz'
TFIDF_FEATURES_PATH = 'word2vec_tfidf_vectorized_data.npz'

if not os.path.exists(COUNT_FEATURES_PATH):
    np.savez_compressed(COUNT_FEATURES_PATH, X_all=X_all_count, X_test=X_test_count, y_all=y_all)
if not os.path.exists(TFIDF_FEATURES_PATH):
    np.savez_compressed(TFIDF_FEATURES_PATH, X_all=X_all_tfidf, X_test=X_test_tfidf, y_all=y_all)

# The classifiers below work on the count-weighted features. The archive is
# opened in a `with` block so its file handle is closed once the arrays are read.
with np.load(COUNT_FEATURES_PATH) as data:
    X_all, X_test, y_all = data['X_all'], data['X_test'], data['y_all']

# Classify

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
# With fairly high-dimensional features, a simple algorithm like LogisticRegression surprisingly performs best, while RandomForestClassifier performs rather poorly
#from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
#from xgboost import XGBClassifier
from sklearn.cluster import KMeans, MiniBatchKMeans

```
X_all_std = StandardScaler().fit_transform(X_all)
```
Standardization helps when bag-of-centroids is not used.

In [None]:
# Cluster the review vectors, then represent every review by the centroid of
# the cluster it was assigned to.
# NOTE: the former `n_jobs=8` argument is gone on purpose: it was deprecated in
# scikit-learn 0.23 and removed in 1.0, where passing it raises TypeError.
kmeans = KMeans(n_clusters=5000, max_iter=10)
kmeans.fit(X_all)

# Index the centroid matrix with the label vector: one centroid row per
# review, as a single (n_reviews, n_features) array.
X_all_km_centroids = kmeans.cluster_centers_[kmeans.labels_]

In [None]:
# Mini-batch variant of the clustering step: fit on the review vectors, then
# map every review to the centroid of its assigned cluster.
minibatchkmeans = MiniBatchKMeans(n_clusters=5000, max_iter=10)
minibatchkmeans.fit(X_all)

mbkm_centers = minibatchkmeans.cluster_centers_
X_all_mbkm_centroids = [mbkm_centers[label] for label in minibatchkmeans.labels_]

In [None]:
# A notebook cell only displays the value of its last expression, so five bare
# cross_val_score(...) calls showed a single result and silently discarded the
# other four. Collect every run into one table instead.
experiments = [
    ('LogisticRegression', LogisticRegression(), X_all),  # works best
    ('RidgeClassifier', RidgeClassifier(), X_all),
    ('SVC', SVC(), X_all),
    ('LogisticRegression on KMeans centroids', LogisticRegression(), X_all_km_centroids),
    ('LogisticRegression on MiniBatchKMeans centroids', LogisticRegression(), X_all_mbkm_centroids),
]

# One row per experiment, one column per cross-validation fold.
cv_scores = pd.DataFrame(
    [cross_val_score(estimator, features, y_all) for _, estimator, features in experiments],
    index=[name for name, _, _ in experiments],
).add_prefix('fold_')

# Last expression of the cell: per-fold scores plus their mean for each experiment.
cv_scores.assign(mean=cv_scores.mean(axis=1))