# Kaggle挑战:烂番茄评论的情感分析
> 东南大学 王颢迪 2020-8-2

Kaggle挑战地址：https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews

In [1]:
import numpy as np
import pandas as pd

In [2]:
# 由于给定的数据文件为tsv格式，即分隔符为制表符\t
train = pd.read_csv('./train.tsv',sep='\t')
test = pd.read_csv('./test.tsv',sep='\t')

In [3]:
# 查看训练集的前5行,可以看到训练集共有四列，其中SentenceId为句子编号，Phrase为电影评论，其中数据集已自动为句子成分做了分割
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


- 数据集中所给出的情感标签
- 0 - negative 消极
- 1 - somewhat negative 比较消极
- 2 - neutral 中性
- 3 - somewhat positive 比较积极
- 4 - positive 积极

In [4]:
# 查看一下训练集的纬度
train.shape

(156060, 4)

In [5]:
# 查看测试集的前5行,Phrase列就是我们要测试的文本
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [6]:
#共66292条数据
test.shape

(66292, 3)

# 语料库构建
* 对文本处理，将数据集中的文本变成计算机可以读懂的向量形式，
* 这里采用TF_IDF模型实现向量形式的转换，首先需要把训练集和测试集组合，构建语料库

In [7]:
# 训练集中的文本内容
xtr = train['Phrase']
# 测试集中的文本内容
xte = test['Phrase']

# concat函数将训练集和测试集合并
sentences = pd.concat([xtr, xte])
# 共222352行
sentences.shape

(222352,)

In [8]:
# 提取训练集中的样本标签
y = train['Sentiment']
y.shape

(156060,)

In [9]:
# english.txt中为无效的英文词汇和语气词，这些内容对情感分析无太大意义
stopword = open('english.txt', encoding = 'utf-8').read().splitlines()
stopword

["ain'",
 'happy',
 'isn',
 'ain',
 'al',
 'couldn',
 'didn',
 'doesn',
 'hadn',
 'hasn',
 'haven',
 'sn',
 'll',
 'mon',
 'shouldn',
 've',
 'wasn',
 'weren',
 'won',
 'wouldn',
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'t",
 "'ve",
 'ZT',
 'ZZ',
 'a',
 "a's",
 'able',
 'about',
 'above',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'added',
 'adj',
 'adopted',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'again',
 'against',
 'ah',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',
 'are',
 'area',
 'areas',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'as',
 'aside',
 'ask',
 'asked',
 'asking',
 'asks'

# 采用词袋模型进行文本特征编码

In [10]:
# 采用sklearn库中的CountVectorizer构建词袋模型
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer(
    analyzer='word', # 以词为单位进行分析，有时则需要以字母‘character’为单位进行分析，如拉丁语系
    ngram_range=(1,4), # 指分析相邻的几个词，避免词序丢失的问题
    stop_words=stopword, # 应用停用词后，停用词列表中的所有单词将会在模型训练中被剔除
    max_features=150000 # 指最终的TF-IDF模型中包含语料库中出现次数最多的150000个词
)

In [11]:
#使用语料库，构建词袋模型
c.fit(sentences)

# 以默认3:1的比率将训练集拆分为新的训练集和验证集，然后进行词频统计
from sklearn.model_selection import train_test_split
a_train, a_test, b_train, b_test = train_test_split(xtr, y, random_state = 12)

# 将训练集和验证集中的每一个词转变为向量形式
a_train = c.transform(a_train)
a_test = c.transform(a_test)

a_train[1]

<1x150000 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [12]:
import warnings
warnings.filterwarnings('ignore')

In [13]:
# 采用逻辑回归分类器，得出验证集上的准确率
from sklearn.linear_model import LogisticRegression
lg1 = LogisticRegression()
lg1.fit(a_train, b_train)
print('验证集上的预测准确率为:',lg1.score(a_test, b_test))

验证集上的预测准确率为: 0.6432654107394592


# 采用TF-IDF模型进行文本特征编码
* 词频(Term Frequency,TF):指的是某一个给定的词语在该文档中出现的次数
* 逆向文件频率 (inverse document frequency, IDF) 是一个词语普遍重要性的度量，其值越小，表明该词的价值越低
* TF与IDF的乘积TF_IDF表明了一个词语的出现所带来的特异性信息
* 比如“北京”“故宫”两个词在某些情况下会同时出现，但是由于“北京”一词在许多文档中可能会经常出现，IDF值很低，而“故宫”一词经常会在特定的文档中出现，那么，“故宫”一词所能带来的特异性信息就会很多

In [14]:
# 采用sklearn库中的TfidfVectorizer构建TF-IDF模型
from sklearn.feature_extraction.text import TfidfVectorizer
z = TfidfVectorizer(
    analyzer = 'word',  # 以词为单位进行分析，有时则需要以字母‘character’为单位进行分析，如拉丁语系
    ngram_range = (1, 4),  # 指分析相邻的几个词，避免词序丢失的问题
    # stop_words = stopword, # 应用停用词后验证集的准确率降为63%，停用词列表中的所有单词将会在模型训练中被剔除
    max_features = 150000 # 指最终的TF-IDF模型中包含语料库中出现次数最多的150000个词
)

In [15]:
# 使用语料库构建TF-IDF模型
z.fit(sentences)

TfidfVectorizer(max_features=150000, ngram_range=(1, 4))

In [16]:
# 以默认3:1的比率将训练集拆分为新的训练集和验证集，然后进行词频统计
from sklearn.model_selection import train_test_split
a_train, a_test, b_train, b_test = train_test_split(xtr, y, random_state = 12)

In [17]:
# 将训练集和验证集中的每一个词转变为向量形式
a_train = z.transform(a_train)
a_test = z.transform(a_test)

In [18]:
a_train[1]

<1x150000 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [19]:
# 采用sklearn提供的网格搜索功能对超参数进行批量试验。
# C:正则化系数，其值越小，正则化效果越强
# dual：是否求解原问题的对偶问题
# 这里C从1到5，对每一个C的值，都进行参数dual为true和false的试验,共5*2=10种情况

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = {'C':range(1,5),
             'dual':[True,False]
              }
lgGS = LogisticRegression()
grid = GridSearchCV(lgGS, param_grid = param_grid, cv = 3, n_jobs = -1) 
grid.fit(a_train, b_train)

GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': range(1, 5), 'dual': [True, False]})

In [20]:
# 挑选出能使模型在验证集上预测准确率最高的超参数
grid.best_params_

{'C': 4, 'dual': False}

In [21]:
# 构建分类器
lg_final = grid.best_estimator_
print('验证集上的预测准确率为:',lg_final.score(a_test,b_test))

验证集上的预测准确率为: 0.6521337946943483


In [22]:
# 使用TF-IDF将测试集转换为向量形式
result_x = z.transform(test['Phrase'])
predictions = lg_final.predict(result_x)
predictions

array([2, 2, 2, ..., 2, 2, 1])

In [23]:
predictions.shape

(66292,)

In [24]:
test.loc[:,'Sentiment'] = predictions
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,2
1,156062,8545,An intermittently pleasing but mostly routine ...,2
2,156063,8545,An,2
3,156064,8545,intermittently pleasing but mostly routine effort,2
4,156065,8545,intermittently pleasing but mostly routine,2


In [25]:
final = test.loc[:, ['PhraseId', 'Sentiment']]
final.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,2
1,156062,2
2,156063,2
3,156064,2
4,156065,2


In [26]:
final.to_csv('final_data.csv', index = None)