## 加载数据集

In [1]:
import pandas as pd

train_labled_path = 'nCoV_100k_train.labled.csv'
test_path='nCov_10k_test.csv'
df = pd.read_csv(train_labled_path, encoding='utf-8', usecols=[3,6]) #usecols选取第三列和第六列
df2 = pd.read_csv(test_path, encoding='utf-8', usecols=[0,3])
df = df[df['情感倾向'].isin(['0','-1','1'])]
print(df['情感倾向'].value_counts())

0     57619
1     25392
-1    16902
Name: 情感倾向, dtype: int64


## 中文分词

In [2]:
import jieba
df['微博中文内容']=df['微博中文内容'].map(str)
df['cuted']=df['微博中文内容'].map(lambda x:' '.join(jieba.cut(x)))
df2['微博中文内容']=df2['微博中文内容'].map(str)
df2['cuted']=df2['微博中文内容'].map(lambda x:' '.join(jieba.cut(x)))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\19366\AppData\Local\Temp\Mxt203\tmp\jieba.cache
Loading model cost 0.639 seconds.
Prefix dict has been built successfully.


In [3]:
print(df['cuted'][1])

开年 大 模型 … 累到 以为 自己 发烧 了 腰疼 膝盖 疼 腿疼 胳膊 疼 脖子 疼 # Luna 的 Krystallife # ?


## 训练集和测试集

In [4]:
# 输入和输出
X = df['cuted']
y = df['情感倾向']
x_ans = df2['cuted']

from sklearn.model_selection import train_test_split

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=0)

# 查看训练集
X_train.shape

(98913,)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer  #调用文本特征提取库

'''
变换器，把每一行文本变成词的向量形式。CountVectorizer()函数只考虑每个单词出现的频率；
然后构成一个特征矩阵，每一行表示一个训练文本的词频统计结果
'''
vect = CountVectorizer()

vect.fit(X_train)

# 词表数量
print(len(vect.vocabulary_))
# 打印词表
#print(vect.vocabulary_)

143120


In [6]:
#words_matrix = pd.DataFrame(vect.transform(X).toarray(),columns=vect.get_feature_names())

#words_matrix.head()

## 构建模型

In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 交叉验证评估模型
scores = cross_val_score(LogisticRegression(),
                         vect.transform(X_train), y_train, cv=5)
print('平均交叉验证准确率：{:.3f}'.format(np.mean(scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

平均交叉验证准确率：0.710


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 去除停用词

In [8]:
def stopwords_list(d):  #设置停用词使不具代表性的词语不显示
    with open(d,'rb') as f:
        lines = f.readlines()
        result = [i.decode().strip('\n') for i in lines]
    return result

stopwords = stopwords_list('hit_stopwords.txt')
stopwords.extend(stopwords_list('cn_stopwords.txt'))
stopwords.extend(stopwords_list('baidu_stopwords.txt'))
stopwords.extend(stopwords_list('scu_stopwords.txt'))

In [9]:
vect = CountVectorizer(max_df=0.8, min_df=3, stop_words=stopwords,
                       token_pattern=u'(?u)\\b[^\\d\\W]\\w+\\b')

vect.fit(X_train)
'''
max_df	可以设置为范围在[0.0 1.0]的float，也可以设置为没有范围限制的int，默认为1.0。
这个参数的作用是作为一个阈值，当构造语料库的关键词集的时候，如果某个词的document frequence大于max_df，
这个词不会被当作关键词。如果这个参数是float，则表示词出现的次数与语料库文档数的百分比，如果是int，则表示词出现的次数。
如果参数中已经给定了vocabulary，则这个参数无效。
min_df	类似于max_df，不同之处在于如果某个词的document frequence小于min_df，则这个词不会被当作关键词
'''



CountVectorizer(max_df=0.8, min_df=3,
                stop_words=['———', '》），', '）÷（１－', '”，', '）、', '＝（', ':', '→',
                            '℃ ', '&', '*', '一一', '~~~~', '’', '. ', '『', '.一',
                            './', '-- ', '』', '＝″', '【', '［＊］', '｝＞', '［⑤］］',
                            '［①Ｄ］', 'ｃ］', 'ｎｇ昉', '＊', '//', ...],
                token_pattern='(?u)\\b[^\\d\\W]\\w+\\b')

In [10]:
print(vect.get_feature_names())

['__', '___', '____', '_____', '__________', 'a1', 'a2n', 'a3g389', 'a4', 'a50', 'a6g126', 'a6hzqlzo', 'a6pfl4v7', 'a6pfslok', 'a6pinnpb', 'a6piph89', 'a6pk09lq', 'a6pnneiy', 'a6px4ivp', 'a6pxmou3', 'a6pz9kzz', 'aa', 'aaa', 'ab', 'ababebaci', 'abc', 'abc2017', 'abcd', 'absolutecb', 'abyss', 'ac', 'acca', 'ace', 'ace2', 'acfc', 'ad', 'adam0616', 'aed', 'aeolus', 'aerosol', 'aesopbach', 'aesopbach75', 'afc', 'afrabot', 'agent', 'ai', 'aiba', 'aibofold', 'aids', 'aifxd1ng', 'aikkkkkkkkkkkkk', 'airpods', 'ait9zhe9', 'aj', 'ak', 'ak20190515', 'aka', 'akalui', 'akb48teamsh', 'aki', 'akira', 'akiramiya', 'alan', 'alan8616', 'alex', 'alexie', 'aling', 'allen', 'alliance', 'allvis', 'almighty', 'aloha', 'alter', 'am37', 'amazing', 'amortentiaz', 'amy', 'ana', 'anan', 'anaparastasi', 'androktasiai', 'ane', 'angel', 'angelababy', 'angeladoge', 'anita', 'ann606', 'anna', 'annnnnnn', 'anqi', 'ao', 'aoa', 'aoide', 'aokawa', 'ap', 'app', 'appo', 'appstore', 'aqi', 'ar', 'arashi', 'ariel', 'ariellecte

In [11]:
#words_matrix = pd.DataFrame(vect.transform(X_train).toarray(),columns=vect.get_feature_names())

In [12]:
# 训练模型
lr=LogisticRegression()
lr.fit(vect.transform(X_train), y_train)

print('测试集准确率：{:.3f}'.format(lr.score(vect.transform(X_test), y_test)))

测试集准确率：0.683


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## 用tf-idf缩放数据

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(TfidfVectorizer(min_df=3), LogisticRegression())  #使用pipline机制
pipe.fit(X_train, y_train)
scores = cross_val_score(pipe, X_train, y_train, cv=5)
print('平均交叉验证准确率：{:.3f}'.format(np.mean(scores)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

平均交叉验证准确率：0.731


In [14]:
vectorizer = pipe.named_steps['tfidfvectorizer']
# 找到每个特征中最大值
max_value = vectorizer.transform(X_train).max(axis=0).toarray().ravel()
sorted_by_tfidf = max_value.argsort()
# 获取特征名称
feature_names = np.array(vectorizer.get_feature_names())

print("tfidf较低的特征：\n{}".format(feature_names[sorted_by_tfidf[:20]]))
print()
print("tfidf较高的特征：\n{}".format( feature_names[sorted_by_tfidf[-20:]]))

tfidf较低的特征：
['祝迪丽' 'rakuen' 'ellis' '沙宣型' '唯粉霞霞' '饭心' '勉励' '模具' '芒果汁' '桃泥' '分满' '百丽'
 '智点' '巡店' '却表' '部省' '15813314017' '堪舆' '堪舆家' '掌握分寸']

tfidf较高的特征：
['难听' '歌唱' '很漂亮' '一天' '不要' '难搞' '安排' '安心' '影评' '脊梁' '彻底' '轻症' '鸡汤' '难过'
 '安全' '守望相助' '华晨' '祈福' '没人管' '赵赵']


In [15]:
from sklearn import metrics

# 预测值
y_pred = pipe.predict(X_test)

print('测试集准确率：{:.3f}'.format(metrics.accuracy_score(y_test, y_pred)))
print('测试集准确率：{:.3f}'.format(pipe.score(X_test, y_test)))

metrics.confusion_matrix(y_test, y_pred)

测试集准确率：0.715
测试集准确率：0.715


array([[ 63,  92,   7],
       [ 37, 518,  42],
       [ 10,  97, 134]], dtype=int64)

In [16]:
print(y_pred)

['0' '0' '0' '0' '1' '0' '0' '-1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0'
 '0' '0' '0' '0' '0' '0' '0' '1' '0' '0' '-1' '1' '0' '0' '1' '0' '0' '-1'
 '0' '0' '-1' '1' '1' '0' '0' '0' '0' '1' '0' '0' '0' '0' '1' '0' '0' '0'
 '0' '0' '0' '1' '0' '-1' '1' '0' '0' '0' '1' '-1' '0' '1' '0' '0' '0' '0'
 '0' '0' '0' '1' '0' '0' '0' '0' '-1' '0' '0' '1' '0' '0' '1' '0' '0' '0'
 '0' '0' '0' '1' '0' '-1' '0' '0' '0' '0' '-1' '0' '1' '-1' '0' '0' '0'
 '0' '0' '-1' '0' '0' '1' '-1' '0' '0' '0' '0' '0' '1' '1' '1' '0' '-1'
 '0' '0' '-1' '0' '0' '1' '0' '0' '1' '0' '0' '0' '-1' '1' '0' '0' '1' '0'
 '0' '-1' '0' '-1' '0' '-1' '1' '-1' '-1' '0' '0' '0' '0' '0' '0' '1' '1'
 '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '1' '0' '1' '0' '0' '0' '1' '0'
 '1' '1' '0' '0' '1' '0' '0' '1' '-1' '0' '0' '0' '0' '1' '0' '0' '1' '0'
 '0' '0' '0' '0' '-1' '0' '0' '1' '0' '1' '0' '1' '0' '1' '1' '-1' '0' '0'
 '1' '-1' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '0' '1' '0' '0'
 '1' '0' '0' '1' '0' '-1' '1' '1' '0' '

In [17]:
print(pd.value_counts(y_pred))

0     707
1     183
-1    110
dtype: int64


## 输出

In [18]:
y_ans = pipe.predict(x_ans)

In [19]:
y_ans = pipe.predict(x_ans)
dict={"测试数据id":df2['微博id'].values.tolist(),'情感极性':y_ans.tolist()}
output_list = [df2['微博id'].values.tolist(),y_ans.tolist()]
output = pd.DataFrame(dict)
output.to_csv("./submit1.csv",sep=',',index=False)

In [20]:
print(output_list)

[[4456068992182160, 4456424178427250, 4456797466940200, 4456791021108920, 4457086404997440, 4457152129019640, 4457827361629760, 4458609708660190, 4458599680017930, 4458853061816280, 4459689523639890, 4459663745597810, 4458970833619120, 4460049088641370, 4460408020984770, 4460044734718810, 4460057116667200, 4460053245947360, 4460003047662520, 4460406158654420, 4460417973233560, 4460420603967690, 4460666649465060, 4460783620098110, 4461139519472380, 4461131160371150, 4461488888731490, 4462104544972710, 4462559836736440, 4462918689545870, 4463222482234740, 4463282205450510, 4468230968346510, 4468341035434870, 4468603934195360, 4463919710511440, 4464045383900520, 4464399178120380, 4464407126239860, 4464303207479990, 4470181806452450, 4470051229237290, 4470146385254120, 4464765973655970, 4465132082049670, 4465131238564520, 4465855921977370, 4455748258222440, 4466191457940130, 4466877365962200, 4467669631499950, 4467574181502060, 4468372098269660, 4460795306441440, 4460953988723200, 44691029