## 使用机器学习进行情感分析
### 获取数据

In [1]:
%matplotlib inline
# dataset: http://ai.stanford.edu/~amaas/data/sentiment/
import pyprind
import pandas as pd
import os

pbar = pyprind.ProgBar(50000)
labels = {'pos':1,  'neg':0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = './aclImdb/%s/%s' % (s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r',encoding="utf-8") as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:01:46


In [2]:
import numpy as np
# 读取的数据类标是经过排序的，并且出于方便使用，我们将数据打乱顺序然后保存到CSV文件中。
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)

In [3]:
df = pd.read_csv('./movie_data.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,My family and I normally do not watch local mo...,1
1,"Believe it or not, this was at one time the wo...",0
2,"After some internet surfing, I found the ""Home...",0


### 词袋模型简介

In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range设置n元祖模型，默认是1元祖
# count = CountVectorizer(ngram_range=(2, 2))
count = CountVectorizer()
docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet'
    ])
# 创建词袋模型的词汇库，并将上面3个句子转换成稀疏的特征向量。
bag = count.fit_transform(docs)

In [5]:
print(count.vocabulary_)

{'the': 5, 'sun': 3, 'is': 1, 'shining': 2, 'weather': 6, 'sweet': 4, 'and': 0}


In [6]:
# 特征向量的每个索引位置与上面存储的整数值对应。例如，索引0为and只在第三个句子中出现过，索引1的is在3个句子中都出现过。
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


#### tf-idf说明
df(t, d) 为包含词汇t的文档d的数量  
$ n_{d}$ 为文档的总数  
$idf(t, d) = log\frac{n_{d}}{1 + df(d, t))}$  

sklearn中使用的tf-idf公式为   
tf-idf(t, d) = tf(t, d) * (idf(t, d) + 1)  

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
# 将上面的词频转换成tf-idf
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())

[[ 0.    0.43  0.56  0.56  0.    0.43  0.  ]
 [ 0.    0.43  0.    0.    0.56  0.43  0.56]
 [ 0.4   0.48  0.31  0.31  0.31  0.48  0.31]]


In [8]:
df.loc[0, 'review'][-50:]
# 可以看到原始数据中包含很多特殊字符，还有html标签

'to Star Cinema!! Way to go, Jericho and Claudine!!'

In [9]:
import re
def preprocessor(text):
    # 对数据进行处理，使用正则表达式去掉所有html标签
    text = re.sub('<[^>]*>', '', text)
    # 寻找所有的表情符
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # 删除所有非单词字符，并将所有文字转换成小写，将表情符加在文末
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text

In [10]:
preprocessor(df.loc[0, 'review'])[-50:]

'ons to star cinema way to go jericho and claudine '

In [11]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [12]:
# 替换所有评论
df['review'] = df['review'].apply(preprocessor)

In [13]:
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [14]:
# nltk包实现了词干提取等功能，可以查看文档了解详情
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [15]:
# 下载常用停用词
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/linux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# 使用停用词库
from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes running and runs a lot')[-10:] if w not in stop]

['runner', 'like', 'run', 'run', 'lot']

### 训练用于文档分类的逻辑回归模型

In [17]:
# 将数据分成train和test
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# 使用网格搜索来训练模型，由于数量众多的单词，这里只使用了2组参数。
# 第一组参数使用了TfidfVectorizer的默认值(use_idf=True, smooth_idf=True, norm='l2')
# 第二组参数使用了(use_idf=False, smooth_idf=False, norm=None)
tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
param_grid = [{'vect__ngram_range': [(1, 1)], 
              'vect__stop_words': [stop, None], 
              'vect__tokenizer': [tokenizer, tokenizer_porter], 
              'clf__penalty': ['l1', 'l2'], 
              'clf__C': [1.0, 10.0, 100.0]}, 
              {'vect__ngram_range': [(1, 1)], 
               'vect__stop_words': [stop, None], 
               'vect__tokenizer': [tokenizer, tokenizer_porter], 
               'vect__use_idf': [False], 
               'vect__norm': [None], 
               'clf__penalty': ['l1', 'l2'], 
               'clf__C': [1.0, 10.0, 100.0]}
             ]
lr_tfidf = Pipeline([('vect', tfidf), 
                    ('clf', LogisticRegression(random_state=0))])
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 23.0min
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed: 113.3min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 152.8min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__stop_words': [['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's...se_idf': [False], 'vect__norm': [None], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
    

In [19]:
# 可以看到最佳参数集，使用不含有停用词的常规标记，同时在逻辑回归中使用tf-idf，逻辑回归使用L2正则化，正则强度C=10.0
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x1165c2ae8>} 


In [20]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

CV Accuracy: 0.893


In [21]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.900


### 使用大数据--在线算法与外存学习

In [22]:
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [23]:
# 每次返回一个文档的内容
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv) # skip header
        for line in csv:
            text, label = line[:-3], int(line[-2])
            yield text, label

In [24]:
next(stream_docs(path='./movie_data.csv'))

('"My family and I normally do not watch local movies for the simple reason that they are poorly made, they lack the depth, and just not worth our time.<br /><br />The trailer of ""Nasaan ka man"" caught my attention, my daughter in law\'s and daughter\'s so we took time out to watch it this afternoon. The movie exceeded our expectations. The cinematography was very good, the story beautiful and the acting awesome. Jericho Rosales was really very good, so\'s Claudine Barretto. The fact that I despised Diether Ocampo proves he was effective at his role. I have never been this touched, moved and affected by a local movie before. Imagine a cynic like me dabbing my eyes at the end of the movie? Congratulations to Star Cinema!! Way to go, Jericho and Claudine!!"',
 1)

In [25]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [26]:
# 由于CountVectorizer和TfidfVectorizer都需要将所有单词加载在内存来进行计算，所以这里使用另外一种处理文本信息的算法
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
# 设定特征的数量为2**21，设定一个比较大的值可以有效的降低哈希碰撞的概率，不过增加了逻辑回归模型中系数的数量
vect = HashingVectorizer(decode_error='ignore',
                        n_features=2 ** 21,
                        preprocessor=None,
                        tokenizer=tokenizer)
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs(path='./movie_data.csv')

In [27]:
import pyprind

pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])

# 我们用前45000条数据做训练，可以看到训练速度非常快
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:31


In [28]:
# 用后面的5000条数据来进行测试
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.868


In [29]:
# 使用test数据继续升级模型
clf = clf.partial_fit(X_test, y_test)

In [30]:
import pickle
import os

dest = os.path.join('movieclassifier','pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)

### word2vec尝试

In [31]:
import multiprocessing
from gensim.models import word2vec
# http://mattmahoney.net/dc/text8.zip 使用的text8数据集来训练word2vec模型

sentences = word2vec.Text8Corpus("/Users/linux/Downloads/text8")
# 训练模型
model = word2vec.Word2Vec(sentences, size=200, workers=multiprocessing.cpu_count())
# 可以将模型保存下次使用
model.save("text8.model")

In [32]:
# build a sklearn-compatible transformer that is initialised with a word -> vector dictionary.
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = self.word2vec.vector_size

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec.wv[w] for w in tokenizer(words) if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [33]:
# 加载模型
w2v = word2vec.Word2Vec.load("text8.model")
mmv = MeanEmbeddingVectorizer(w2v)

In [34]:
# 开始模型训练
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
doc_stream = stream_docs("movie_data.csv")
pbar = pyprind.ProgBar(45)
for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = mmv.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()
    
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test_mmv = mmv.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test_mmv, y_test))

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:43


Accuracy: 0.741
