Data: https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews/data?select=train.tsv.zip Tutorial:https://www.bilibili.com/video/BV1ix411d7Fw

In [1]:
import pandas as pd
train = pd.read_csv('./train.tsv',sep='\t')  #制表符\t
test = pd.read_csv('./test.tsv',sep='\t')

In [3]:
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


- 0: negative
- 1: somewhat negative
- 2: neutral
- 3: somewhat positive
- 4: positive

In [4]:
train.shape

(156060, 4)

### 构建语料库
- 文本特征工程
- 将词转变成词向量：词袋模型、TF-IDF模型，以及视频中介绍的word2vec模型
- 首先把Train和Test中所有文本本人组合在一起，构建语料库

In [10]:
train_sentences=train['Phrase']
test_sentences=test['Phrase']
sentences=pd.concat([train_sentences,test_sentences])

In [12]:
sentences.shape

(222352,)

In [13]:
label = train['Sentiment']
label.shape

(156060,)

In [17]:
# 导入停词库（废话和语气词等）
stop_words = open('stopwords.txt',encoding='utf-8').read().splitlines()
# https://gist.githubusercontent.com/larsyencken/1440509/raw/53273c6c202b35ef00194d06751d8ef630e53df2/stopwords.txt

In [18]:
stop_words

['a',
 'about',
 'above',
 'across',
 'after',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'among',
 'an',
 'and',
 'another',
 'any',
 'anybody',
 'anyone',
 'anything',
 'anywhere',
 'are',
 'area',
 'areas',
 'around',
 'as',
 'ask',
 'asked',
 'asking',
 'asks',
 'at',
 'away',
 'b',
 'back',
 'backed',
 'backing',
 'backs',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'been',
 'before',
 'began',
 'behind',
 'being',
 'beings',
 'best',
 'better',
 'between',
 'big',
 'both',
 'but',
 'by',
 'c',
 'came',
 'can',
 'cannot',
 'case',
 'cases',
 'certain',
 'certainly',
 'clear',
 'clearly',
 'come',
 'could',
 'd',
 'did',
 'differ',
 'different',
 'differently',
 'do',
 'does',
 'done',
 'down',
 'down',
 'downed',
 'downing',
 'downs',
 'during',
 'e',
 'each',
 'early',
 'either',
 'end',
 'ended',
 'ending',
 'ends',
 'enough',
 'even',
 'evenly',
 'ever',
 'every',
 'everybody',
 'everyone',
 'everything',

#### Bag of words
- 丧失语序
- 词频高的一些词比如a an the对句子含义几乎没有贡献

In [25]:
# Bag of Words 词袋模型
from sklearn.feature_extraction.text import CountVectorizer
co = CountVectorizer(ngram_range=(1,4),analyzer='word',stop_words=stop_words,
                    max_features=150000)

#analyzer='word' 指的是以词为单位进行分析，
#对于拉丁语系语言，有时需要以字母'character'为单位进行分析
# ngram指分析相邻的几个词，避免原始的词袋模型中次序丢失的问题
# max_features指最终的词袋矩阵里面包含语料库中出现次数最多的多少个词


In [26]:
# 使用语料库，构建词袋模型
co.fit(sentences)

CountVectorizer(max_features=150000, ngram_range=(1, 4),
                stop_words=['a', 'about', 'above', 'across', 'after', 'again',
                            'against', 'all', 'almost', 'alone', 'along',
                            'already', 'also', 'although', 'always', 'among',
                            'an', 'and', 'another', 'any', 'anybody', 'anyone',
                            'anything', 'anywhere', 'are', 'area', 'areas',
                            'around', 'as', 'ask', ...])

In [27]:
# 将训练集随机拆分为新的训练集和验证集，默认3：1，然后进行词频统计
# 在机器学习中，训练集相当于课后习题，验证集相当于模拟考试，测试集相当于高考
# 新的训练集和验证集都来自于最初的训练集，都是有标签的
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(train_sentences,label,random_state=1234)

In [28]:
x_train[5]

'of escapades demonstrating the adage that what is good for the goose'

In [29]:
# 用上面构建的词袋模型，把train和test中的每一个词都进行特征工程，变成向量
x_train = co.transform(x_train)
x_test = co.transform(x_test)
#稀疏矩阵

In [31]:
x_train[5]

<1x150000 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [32]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
from sklearn.linear_model import LogisticRegression
lg=LogisticRegression()
lg.fit(x_train,y_train)
print('Bag of Words- Text Feature Engineering')
print('Sklearn - Logistic Regression, testing accuracy: ',lg.score(x_test,y_test))

Bag of Words- Text Feature Engineering
Sklearn - Logistic Regression, testing accuracy:  0.6476227092144047


#### 多项式朴素贝叶斯分类器 Naive Bayesian Classifier

In [34]:
from sklearn.naive_bayes import MultinomialNB
classifier=MultinomialNB()
classifier.fit(x_train,y_train)
print('Bag of Words- Text Feature Engineering')
print('Sklearn - Naive Bayesian, testing accuracy: ',classifier.score(x_test,y_test))

Bag of Words- Text Feature Engineering
Sklearn - Naive Bayesian, testing accuracy:  0.6044341919774445


#### TF-IDF模型
- TF = X在当前文章出现的次数/X在整个语料库出现的次数
- IDF = ln（语料库总文档数/语料库中X出现的文档数）
- TF-IDF = TF*IDF
- 解决BOW第二给问题

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(analyzer='word',
                  ngram_range=(1,4),
                  max_features=150000)

In [36]:
tf.fit(sentences)

TfidfVectorizer(max_features=150000, ngram_range=(1, 4))

In [37]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(train_sentences,label,random_state=1234)

In [38]:
x_train = tf.transform(x_train)
x_test = tf.transform(x_test)

In [40]:
x_train[2]

<1x150000 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in Compressed Sparse Row format>

In [41]:
classifier=MultinomialNB()
classifier.fit(x_train,y_train)
print('TF-IDF- Text Feature Engineering')
print('Sklearn - Naive Bayesian, testing accuracy: ',classifier.score(x_test,y_test))

TF-IDF- Text Feature Engineering
Sklearn - Naive Bayesian, testing accuracy:  0.6045367166474432


In [42]:
lg=LogisticRegression()
lg.fit(x_train,y_train)
print('TF-IDF - Text Feature Engineering')
print('Sklearn - Logistic Regression, testing accuracy: ',lg.score(x_test,y_test))

TF-IDF - Text Feature Engineering
Sklearn - Logistic Regression, testing accuracy:  0.640881712161989


In [45]:
# C: 正则化系数，C越小，正则化效果越强
# dual: 求解原问题的对偶问题
lg2=LogisticRegression(C=3,dual=True,solver='liblinear')
lg2.fit(x_train,y_train)
print('TF-IDF - Text Feature Engineering')
print('Sklearn - Logistic Regression with 2 more parameters, testing accuracy: ',lg2.score(x_test,y_test))

# 正则化有L1,L2两种。主要是为了防止过拟合。

TF-IDF - Text Feature Engineering
Sklearn - Logistic Regression with 2 more parameters, testing accuracy:  0.6533384595668332


In [46]:
# 逻辑回归中加C和dual两个参数可以提高验证集上的预测准确率。
# GridSearch辅助调参
from sklearn.model_selection import GridSearchCV  # cv: cross validation
param_grid={'C':range(1,10),
           'dual':[True,False]}
lgGS=LogisticRegression()
grid = GridSearchCV(lgGS,param_grid=param_grid,cv=3,n_jobs=-1)
grid.fit(x_train,y_train)

GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': range(1, 10), 'dual': [True, False]})

In [48]:
grid.best_params_

{'C': 3, 'dual': False}

In [49]:
lg_final=grid.best_estimator_

In [51]:
print('With GridSearch, Logistic Regeression- Testing accuracy:',lg_final.score(x_test,y_test))

With GridSearch, Logistic Regeression- Testing accuracy: 0.6524157375368448


In [52]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [53]:
# 使用TF-IDF对测试集中的文本进行特征工程
test_X = tf.transform(test['Phrase'])

In [56]:
predictions = lg_final.predict(test_X)

In [57]:
predictions

array([2, 2, 2, ..., 2, 2, 1], dtype=int64)

In [58]:
predictions.shape

(66292,)

In [59]:
test.loc[:,'Sentiment']=predictions

In [60]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,2
1,156062,8545,An intermittently pleasing but mostly routine ...,2
2,156063,8545,An,2
3,156064,8545,intermittently pleasing but mostly routine effort,2
4,156065,8545,intermittently pleasing but mostly routine,2


In [61]:
final_data = test.loc[:,['PhraseId','Sentiment']]

In [63]:
final_data.tail()

Unnamed: 0,PhraseId,Sentiment
66287,222348,1
66288,222349,1
66289,222350,2
66290,222351,2
66291,222352,1


In [64]:
final_data.to_csv('final_data.csv',index=None)