## 使用机器学习进行情感分析
### 获取数据

In [3]:
%matplotlib inline

# dataset: http://ai.stanford.edu/~amaas/data/sentiment/
import pyprind
import pandas as pd
import os

# One-off corpus build, kept commented out: its result is written to
# ./movie_data.csv by the next cell and read back from there afterwards.
# NOTE(review): DataFrame.append was removed in pandas 2.0, so this loop needs
# porting (collect the rows in a list, build the frame once) before it is
# re-enabled.
# pbar = pyprind.ProgBar(50000)
# labels = {'pos':1,  'neg':0}
# df = pd.DataFrame()
# for s in ('test', 'train'):
#     for l in ('pos', 'neg'):
#         path = './aclImdb/%s/%s' % (s, l)
#         for file in os.listdir(path):
#             with open(os.path.join(path, file), 'r',encoding="utf-8") as infile:
#                 txt = infile.read()
#             df = df.append([[txt, labels[l]]], ignore_index=True)
#             pbar.update()
# df.columns = ['review', 'sentiment']

In [4]:
import numpy as np
# The class labels of the loaded data are sorted, so the rows are shuffled
# and then, for convenience, saved to a CSV file.
# np.random.seed(0)
# df = df.reindex(np.random.permutation(df.index))
# df.to_csv('./movie_data.csv', index=False)

In [5]:
# Load the review corpus from the CSV file and preview its first three rows.
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,"Election is a Chinese mob movie, or triads in ...",1
1,I was just watching a Forensic Files marathon ...,0
2,Police Story is a stunning series of set piece...,1


### 词袋模型简介

In [30]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range selects the n-gram model; the default is unigrams (1-grams).
# count = CountVectorizer(ngram_range=(2, 2))
count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet'
    ])
# Build the bag-of-words vocabulary and convert the three sentences above
# into sparse feature vectors.
bag = count.fit_transform(docs)

In [31]:
# Show the learned mapping from each term to its column index.
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [32]:
# Each index position of a feature vector corresponds to the integer stored
# in the vocabulary above. For example, index 0 ('and') occurs only in the
# third sentence, while index 1 ('is') occurs in all three sentences.
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


#### tf-idf说明
df(d, t) 为包含词汇 t 的文档 d 的数量  
$n_{d}$ 为文档的总数  
$idf(t, d) = \log\frac{n_{d}}{1 + df(d, t)}$  

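sklearn中使用的tf-idf公式为（默认 `smooth_idf=True` 时，idf 的分子同样加 1，即 $idf(t, d) = \ln\frac{1 + n_{d}}{1 + df(d, t)}$）   
tf-idf(t, d) = tf(t, d) * (idf(t, d) + 1)，随后默认按 L2 范数对每个文档向量做归一化  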

In [33]:
from sklearn.feature_extraction.text import TfidfTransformer
# Convert the term counts from above into tf-idf weights.
tfidf = TfidfTransformer()
# Global NumPy print option: two decimals of precision for the matrix below.
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [52]:
df.loc[0, 'review'][-50:]
# As can be seen, the raw data contains many special characters, and also
# HTML tags.

'nd three more acting performances (including Yam).'

In [53]:
import re
def preprocessor(text):
    """Clean one raw review string for bag-of-words feature extraction.

    HTML tags are stripped, emoticons are collected, the remaining text is
    lower-cased with every run of non-word characters collapsed to a single
    space, and the collected emoticons (with any '-' removed) are appended
    at the end, separated by spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text.
    """
    # The patterns are raw string literals so that regex escapes such as
    # \W, \) and \( reach the regex engine untouched instead of being read
    # as (invalid) Python string escapes, which newer interpreters warn about.
    # Remove every HTML tag.
    text = re.sub(r'<[^>]*>', '', text)
    # Collect the emoticons before the punctuation is stripped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Drop all non-word characters, lower-case the text, and append the
    # emoticons to the end.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

In [54]:
# Last 50 characters of the first review after cleaning.
preprocessor(df.loc[0, 'review'])[-50:]

' and three more acting performances including yam '

In [55]:
# Quick check on a short string containing a tag and three emoticons.
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [56]:
# Replace every review with its cleaned version (overwrites the column).
df['review'] = df['review'].apply(preprocessor)

In [57]:
def tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [58]:
# The nltk package implements stemming among other features; see its
# documentation for details.
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    """Split ``text`` on whitespace and stem each token with ``porter``.

    Returns the list of stemmed tokens in their original order.
    """
    stems = []
    for token in text.split():
        stems.append(porter.stem(token))
    return stems

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [64]:
# Download the commonly used stop-word lists.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nolan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [63]:
# Use the stop-word list: keep only the stemmed tokens that are not English
# stop words.
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

### 训练用于文档分类的逻辑回归模型

In [65]:
# Split the data into a training part (first 25000 rows) and a test part
# (the remaining rows).
# Positional slicing is deliberate: a label slice such as df.loc[:25000]
# includes its end label, which would put row 25000 into both the training
# and the test set.
N_TRAIN = 25000
X_train = df['review'].iloc[:N_TRAIN].values
y_train = df['sentiment'].iloc[:N_TRAIN].values
X_test = df['review'].iloc[N_TRAIN:].values
y_test = df['sentiment'].iloc[N_TRAIN:].values
# The two parts must partition the data: no row dropped, none shared.
assert len(X_train) + len(X_test) == len(df)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Train the model with a grid search. Because of the very large number of
# words, only two parameter groups are searched here.
# Group 1 uses the TfidfVectorizer defaults (use_idf=True, smooth_idf=True, norm='l2').
# Group 2 sets use_idf=False and norm=None; smooth_idf is not set by the grid.
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf': [False],
               'vect__norm': [None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]}
              ]
# The solver is named explicitly because the grid searches both the 'l1' and
# the 'l2' penalty: liblinear supports both, whereas the lbfgs default of
# newer scikit-learn releases rejects 'l1' and would make those fits fail.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, solver='liblinear'))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
