# 在情感分析上应用机器学习

## 获取IMDb的电影评论数据集

In [2]:
# 加载数据
import pyprind
import pandas as pd
import os

basepath = './aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']


  return f(*args, **kwds)
  return f(*args, **kwds)
0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:05:30


In [12]:
import numpy as np
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('./movie_data.csv', index=False)


In [3]:
df = pd.read_csv('./movie_data.csv')
df.head(3)


Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


## 介绍bag-of-words模型

- 从整个文件集中创建一个唯一标识的词汇表(这里的标识就是单词)
- 为每一个文件建立一个特征向量,包含每一个单词出现的次数

### 将单词转换为特征向量

In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
bag = count.fit_transform(docs)


In [11]:
print(sorted(count.vocabulary_.items()))

[('and', 0), ('is', 1), ('shining', 2), ('sun', 3), ('sweet', 4), ('the', 5), ('weather', 6)]


In [6]:
print(bag.toarray())

[[0 1 1 1 0 1 0]
 [0 1 0 0 1 1 1]
 [1 2 1 1 1 2 1]]


### 通过tf-idf评估单词相关性

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)).toarray())


[[0.   0.43 0.56 0.56 0.   0.43 0.  ]
 [0.   0.43 0.   0.   0.56 0.43 0.56]
 [0.4  0.48 0.31 0.31 0.31 0.48 0.31]]


### 清洗文本数据

In [12]:
df.loc[0,'review'][-50:]

'is seven.<br /><br />Title (Brazil): Not Available'

In [13]:
import re


def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emotions = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower()) + ' '.join(emotions).replace('-', '')
    return text


In [15]:
# 验证preprocessor是否工作
preprocessor(df.loc[0,'review'][-50:])

'is seven title brazil not available'

In [16]:
preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [17]:
# 把preprocessor应用到所有的review上
df['review'] = df['review'].apply(preprocessor)