In [1]:
#导入数据分析的常用工具包

import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## 1.读入数据

In [7]:
#读入训练数据个测试数据

train = pd.read_csv('D:\\nlp_dataset\\train.tsv',sep = '\t')
test = pd.read_csv('D:\\nlp_dataset\\test.tsv',sep = '\t')

In [8]:
#查看训练数据的前5行
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


可以看出每一个样本都是由短语ID，句子ID，文本内容，情感标签组成。

In [9]:
train.shape

(156060, 4)

In [10]:
#查看测试数据的前5行
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [11]:
test.shape

(66292, 3)

可以看出，测试集中每个样本都是由短语ID，句子ID，文本内容构成。缺失的情感标签就是需要我们预测的。

## 2.构建语料库

In [12]:
train_sentences = train['Phrase']
test_sentences = test['Phrase']

sentences = pd.concat([train_sentences,test_sentences])

sentences.shape

(222352,)

In [13]:
#提取训练集中的情感作为标签
label = train['Sentiment']

In [14]:
label.shape

(156060,)

In [16]:
#导入停用词

stopwords = open('D:\\nlp_dataset\\stop_words.txt',encoding = 'utf-8').read().splitlines()

In [17]:
stopwords

["\ufeffain'",
 'happy',
 'isn',
 'ain',
 'al',
 'couldn',
 'didn',
 'doesn',
 'hadn',
 'hasn',
 'haven',
 'sn',
 'll',
 'mon',
 'shouldn',
 've',
 'wasn',
 'weren',
 'won',
 'wouldn',
 "'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'t",
 "'ve",
 'ZT',
 'ZZ',
 'a',
 "a's",
 'able',
 'about',
 'above',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'added',
 'adj',
 'adopted',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'again',
 'against',
 'ah',
 "ain't",
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',
 'are',
 'area',
 'areas',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'as',
 'aside',
 'ask',
 'asked',
 'asking',
 

## 3.使用词袋模型进行文本特征工程

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

countvect = CountVectorizer(analyzer = 'word',
                           ngram_range = (1,4),
                           max_features = 150000,
                           stop_words = stopwords)

In [19]:
#使用语料库构建词袋模型
countvect.fit(sentences)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=150000, min_df=1,
        ngram_range=(1, 4), preprocessor=None,
        stop_words=["\ufeffain'", 'happy', 'isn', 'ain', 'al', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'sn', 'll', 'mon', 'shouldn', 've', 'wasn', 'weren', 'won', 'wouldn', "'d", "'ll", "'m", "'re", "'s", "'t", "'ve", 'ZT', 'ZZ', 'a', "a's", 'able', 'about', 'above', 'abst', 'accordance', 'accor...', ',', '·', '￥', '……', '（', '）', '——', '、', '：', '；', '“', '’', '《', '》', '，', '。', '、', '？', '★ '],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [20]:
#将原始的训练集分为新的训练集和验证集
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(train_sentences,label,test_size = 0.25,random_state = 1)
print('训练集的大小:',X_train.shape)
print('验证集的大小:',X_val.shape)

训练集的大小: (117045,)
验证集的大小: (39015,)


In [22]:
X_train.iloc[1]

"will be Greek to anyone not predisposed to the movie 's rude and crude humor ."

In [25]:
#使用前面构建好的词袋模型，将训练集个验证集中的每一个词转化为向量

X_train = countvect.transform(X_train)
X_val = countvect.transform(X_val)

In [26]:
X_train[1]

<1x150000 sparse matrix of type '<class 'numpy.int64'>'
	with 18 stored elements in Compressed Sparse Row format>

## 4.建立机器学习模型对文本数据进行训练

### 4.1建立逻辑回归模型

In [30]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression().fit(X_train,y_train)
LR_result = LR.score(X_val,y_val)
print('对文本进行了词袋模型处理，使用逻辑回归算法训练数据，在验证集上的准确率为{}'.format(LR_result))

对文本进行了词袋模型处理，使用逻辑回归算法训练数据，在验证集上的准确率为0.6411636550044855


### 4.2建立多项式朴素贝叶斯模型

In [31]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train,y_train)
nb_result = nb.score(X_val,y_val)
print('对文本进行了词袋模型处理，使用多项式贝叶斯算法训练数据，在验证集上的准确率为{}'.format(nb_result))

对文本进行了词袋模型处理，使用多项式贝叶斯算法训练数据，在验证集上的准确率为0.6094322696398821


## 5.使用TF-IDF对文本数据进行特征工程处理

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(analyzer = 'word',
                       ngram_range = (1,4),
                       max_features = 150000)

In [33]:
#建立tfidf模型
tfidf.fit(sentences)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=150000, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [35]:
#切分文本中的训练集数据
x_train,x_val,y_train,y_val = train_test_split(train_sentences,label,test_size=0.25,random_state=0)

In [36]:
#使用建立好的tfidf模型将训练集和验证集转化为向量
x_train = tfidf.transform(x_train)
x_val = tfidf.transform(x_val)

## 6.建立机器学习模型对文本数据进行训练并评估模型效果

### 6.1建立逻辑回归模型

In [38]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression().fit(x_train,y_train)
LR_tfidf_result = LR.score(x_val,y_val)
print('对文本进行了词袋模型处理，使用逻辑回归算法训练数据，在验证集上的准确率为{}'.format(LR_tfidf_result))

对文本进行了词袋模型处理，使用逻辑回归算法训练数据，在验证集上的准确率为0.6320133282070999


### 6.2建立多项式朴素贝叶斯模型

In [39]:
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(x_train,y_train)
nb_tfidf_result = nb.score(x_val,y_val)
print('对文本进行了词袋模型处理，使用多项式贝叶斯算法训练数据，在验证集上的准确率为{}'.format(nb_tfidf_result))

对文本进行了词袋模型处理，使用多项式贝叶斯算法训练数据，在验证集上的准确率为0.6038959374599513


对比上面的两种模型，逻辑回归算法的效果均要好于多项式贝叶斯算法，所以选择逻辑回归作为我们的最终模型。

## 7.使用GridSearchCV方法对模型进行调参处理

In [42]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
param_grid = {'C':range(1,10),
             'dual':[True,False]}

grid = GridSearchCV(LR,param_grid = param_grid, cv = 3,n_jobs = -1)
grid.fit(x_train,y_train)

GridSearchCV(cv=3, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'C': range(1, 10), 'dual': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
grid.best_params_

{'C': 5, 'dual': False}

In [46]:
best_model = grid.best_estimator_

In [47]:
#使用最好的模型，在验证集上进行评估
best_model.score(x_val,y_val)

0.6506215558118672

## 8.使用训练好的数据对训练集进行预测

In [48]:
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [49]:
test_vect = tfidf.transform(test_sentences)

In [50]:
test['Sentiment'] = best_model.predict(test_vect)

In [51]:
test.head(10)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,156061,8545,An intermittently pleasing but mostly routine ...,3
1,156062,8545,An intermittently pleasing but mostly routine ...,3
2,156063,8545,An,2
3,156064,8545,intermittently pleasing but mostly routine effort,3
4,156065,8545,intermittently pleasing but mostly routine,3
5,156066,8545,intermittently pleasing but,3
6,156067,8545,intermittently pleasing,3
7,156068,8545,intermittently,2
8,156069,8545,pleasing,3
9,156070,8545,but,2


In [52]:
test.columns

Index(['PhraseId', 'SentenceId', 'Phrase', 'Sentiment'], dtype='object')

In [54]:
test = test.loc[:,['PhraseId','Sentiment']]

In [55]:
test.head()

Unnamed: 0,PhraseId,Sentiment
0,156061,3
1,156062,3
2,156063,2
3,156064,3
4,156065,3


In [57]:
#将模型的预测结果保存为csv文件，然后提交至Kaggle

test.to_csv('D:\\nlp_dataset\\final_result.csv',index = False)