# 在word2vec上训练情感分析模型

In [5]:
import os 
import re
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

from nltk.corpus import stopwords

from gensim.models.word2vec import Word2Vec

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.cluster import KMeans

In [14]:
def load_dataset(name, nrows=None):
    """Read one of the review TSV files under ``./data`` into a DataFrame.

    Args:
        name: Dataset key; one of 'unlabeled_train', 'labeled_train', 'test'.
        nrows: Optional cap on the number of rows to read (None reads all).

    Returns:
        The parsed pandas DataFrame. The number of rows read is printed
        as a side effect.

    Raises:
        ValueError: If ``name`` is not a known dataset key; the offending
            name is the exception's only argument.
    """
    file_by_name = {
        'unlabeled_train': 'unlabeledTrainData.tsv',
        'labeled_train': 'labeledTrainData.tsv',
        'test': 'testData.tsv',
    }
    filename = file_by_name.get(name)
    if filename is None:
        raise ValueError(name)

    # Files are tab-separated and use a backslash as the escape character.
    reviews = pd.read_csv(
        os.path.join('.', 'data', filename),
        sep='\t',
        escapechar='\\',
        nrows=nrows,
    )
    print('Number of reviews: {}'.format(len(reviews)))
    return reviews

In [6]:
# English stopword list from NLTK, held as a set for fast membership tests.
eng_stopwords = set(stopwords.words('english'))


def clean_text(text, remove_stopwords=False):
    """Turn a raw review string into a list of lowercase word tokens.

    Markup is stripped with BeautifulSoup's 'html.parser', every character
    that is not an ASCII letter is replaced by a space, and the result is
    lowercased and split on whitespace.

    Args:
        text: Raw review text, possibly containing HTML.
        remove_stopwords: When true, drop tokens found in ``eng_stopwords``.

    Returns:
        List of lowercase word strings.
    """
    plain = BeautifulSoup(text, 'html.parser').get_text()
    letters_only = re.sub(r'[^a-zA-Z]', ' ', plain)
    tokens = letters_only.lower().split()
    if not remove_stopwords:
        return tokens
    return [tok for tok in tokens if tok not in eng_stopwords]

# 读入训练好的word2vec模型

In [12]:
# Load the previously trained Word2Vec model from ./model/<model_name>.
model_name = '300features_10minwords_10context.model'
model = Word2Vec.load(os.path.join('.','model',model_name))
# Bare expression so the notebook displays the loaded model's repr.
model

<gensim.models.word2vec.Word2Vec at 0x104eb0f50>

## 根据word2vec的结果对影评文本进行编码

In [15]:
# Load the labelled training reviews (all rows, since nrows is left at its default).
df = load_dataset('labeled_train')
# Show the first rows as the cell output.
df.head()

Number of reviews: 25000


Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"""The Classic War of the Worlds"" by Timothy Hin..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [16]:
# Naive encoding: a review is represented by the mean of the word vectors
# of all of its in-vocabulary words.
def to_review_vector(review):
    """Encode a review as the average of its words' Word2Vec vectors.

    The review is tokenised with ``clean_text`` (stopwords kept); tokens
    missing from the model vocabulary are ignored.

    Args:
        review: Raw review text.

    Returns:
        pandas.Series of length ``model.vector_size``. If no token of the
        review is in the vocabulary, a zero vector is returned instead of
        NaN, so every review produces a row of the same width.
    """
    # Look words up through the KeyedVectors object (`model.wv`) rather than
    # indexing the Word2Vec model directly, which is deprecated and no longer
    # supported by recent gensim releases. Fall back to the model itself for
    # very old gensim versions that have no `wv` attribute.
    keyed_vectors = getattr(model, 'wv', model)

    words = clean_text(review)
    vectors = [keyed_vectors[w] for w in words if w in keyed_vectors]
    if not vectors:
        # Averaging an empty array would give a single NaN and break the
        # shape of the feature matrix built with Series.apply.
        return pd.Series(np.zeros(model.vector_size, dtype=np.float32))
    return pd.Series(np.array(vectors).mean(axis=0))

In [17]:
# Encode every review; because to_review_vector returns a pd.Series,
# apply() expands the results into a DataFrame with one column per
# vector component and one row per review.
train_data_features = df.review.apply(to_review_vector)
# Show the first rows as the cell output.
train_data_features.head()

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,-0.013136,-0.008601,0.008102,0.00232,-0.013041,0.004052,-0.008471,0.006639,0.014264,-0.019703,...,-0.003595,0.006053,-0.002416,-0.008749,-0.009902,0.01123,0.006375,-0.015609,0.009819,0.007952
1,-0.015347,0.002005,-0.000907,-0.00754,-0.012829,0.003942,-0.016349,0.011501,0.017876,-0.02788,...,-0.0023,0.010388,-0.005939,0.000633,0.002901,0.008876,0.00496,-0.010098,0.00471,0.00021
2,-0.016728,0.002448,0.000345,-0.002283,-0.006208,0.013001,-0.009454,0.013228,0.026653,-0.015238,...,0.008515,0.001848,-0.018959,0.011622,0.001687,0.007134,-0.016329,0.004105,0.000528,0.002606
3,-0.014231,-0.003985,0.004035,0.012628,-0.002745,0.003662,-0.018115,0.003086,0.01475,-0.028962,...,0.003085,0.005768,-0.017374,0.009206,0.009195,0.012355,-0.008875,-0.020016,0.004525,0.006851
4,-0.023095,-0.005043,0.006312,0.00903,-0.001652,0.009097,-0.013841,0.008905,0.018002,-0.01131,...,0.005649,0.004414,-0.01191,0.003434,-0.012384,0.004966,-0.003566,-7.3e-05,0.005456,0.008788


现在，每条评论都有了一个向量来表示，即该评论中所有词向量的均值（如果是短文本的话，效果应该还可以）

## 构建分类器

In [18]:
# Fit a 100-tree random forest on the averaged review vectors; the fixed
# random_state keeps the result reproducible. fit() returns the fitted
# estimator, so construction and training are chained.
forest = RandomForestClassifier(
    n_estimators=100,
    random_state=20,
).fit(train_data_features, df.sentiment)

In [19]:
# Sanity check that the fitted model is usable.
# NOTE: predictions are made on train_data_features, the same data the
# forest was fitted on, so this matrix only shows the fit to the training
# set and says nothing about performance on unseen reviews.
confusion_matrix(df.sentiment,forest.predict(train_data_features))

array([[12500,     0],
       [    0, 12500]])