In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, ComplementNB, BernoulliNB
from sklearn.metrics import brier_score_loss as BS
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import CalibratedClassifierCV

| 字段   | 说明                           |
| :----- | :----------------------------- |
| label  | 0 喜悦，1 愤怒，2 厌恶，3 低落 |
| review | 微博内容                       |

In [2]:
data = pd.read_pickle("simplifyweibo_4_moods_preprocessing.pkl.bz2")

data.sample(20)

Unnamed: 0,label,review
261187,厌恶,回复 李健 苏打 绿 便宜 为啥 深圳 票 血贵 事宜 艺人 包 路费 美若天仙 闭月羞花 ...
360936,低落,希望 亲爱 愿望 德芙 心声 巧克力 想 北京 喜欢 北京 想 看菲姐 演唱会 颗 巧克力 ...
337569,低落,潮妈 记者 拍 弟弟 拍 想 未来 娱乐圈 工作
126914,喜悦,神苏 不淡定 逆 天 真的 大神 一晃眼 看成 这逆 几岁 背心 杀
309353,低落,买 一套 米 攒 一套 米 时间 哭 感谢 分享 肌肤 炎症 肌肤 老化 这是 新品 发布会...
27679,喜悦,支持 下本 微博 游戏 中秋 月 日 抢 特别 活动 参与 方式 月 日前 评论 本微博 通...
166893,喜悦,钢炼 日本 动画 超级 神作 剧场版 年 夏 上映 钢炼 爱德华 艾尔 冯斯 兄弟 之间 羁...
121382,喜悦,这一版 李 乃文 纯洁 天 真的 玻璃 纯洁 天 真的 水流 月 日 月 日 上海 剧院 马...
38991,喜悦,麻木 无痛 橡皮 梦想 伙伴 雄心壮志
76795,喜悦,典 油画 现 代 演绎 视频 优酷 视频 在线 观看


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 361744 entries, 0 to 361743
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   361744 non-null  object
 1   review  361744 non-null  object
dtypes: object(2)
memory usage: 5.5+ MB


In [4]:
data["label"].value_counts()

喜悦    199496
厌恶     55267
低落     55267
愤怒     51714
Name: label, dtype: int64

In [5]:
X, y = data.iloc[:, 1], data.iloc[:, 0]

X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.5, random_state=520)

In [6]:
X_train.shape

(180872,)

In [7]:
for i in [X_train, X_test, Y_train, Y_test]:
    i.index = range(i.shape[0])

In [8]:
le = LabelEncoder().fit(Y_train)

Y_train = le.transform(Y_train)
Y_test = le.transform(Y_test)

In [9]:
le.classes_

array(['低落', '厌恶', '喜悦', '愤怒'], dtype=object)

In [10]:
tfidf = TfidfVectorizer().fit(X_train)

X_train_ = tfidf.transform(X_train)
X_test_ = tfidf.transform(X_test)

In [11]:
mnb = MultinomialNB().fit(X_train_, Y_train)

In [12]:
mnb.score(X_train_, Y_train)

0.5861272059799195

In [13]:
mnb.score(X_test_, Y_test)

0.5675505329735946

In [14]:
cnb = ComplementNB().fit(X_train_, Y_train)

In [15]:
cnb.score(X_train_, Y_train)

0.7064719801848821

In [16]:
cnb.score(X_test_, Y_test)

0.5546519085320005

In [17]:
name = ["Multinomial", "Complement", "Bournulli"]

# 注意高斯朴素贝叶斯不接受稀疏矩阵
models = [MultinomialNB(), ComplementNB(), BernoulliNB()]
for name, clf in zip(name, models):
    clf.fit(X_train_, Y_train)
    y_pred = clf.predict(X_test_)
    proba = clf.predict_proba(X_test_)
    score = clf.score(X_test_, Y_test)
    print(name)
    
    # 4个不同的标签取值下的布里尔分数
    Bscore = []
    for i in range(len(np.unique(Y_train))):
        bs = BS(Y_test == i, proba[:, i], pos_label=1)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(le.classes_[i],bs))
        
    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")

Multinomial
	Brier under 低落:0.128
	Brier under 厌恶:0.127
	Brier under 喜悦:0.239
	Brier under 愤怒:0.117
	Average Brier:0.153
	Accuracy:0.568


Complement
	Brier under 低落:0.129
	Brier under 厌恶:0.129
	Brier under 喜悦:0.243
	Brier under 愤怒:0.117
	Average Brier:0.155
	Accuracy:0.555


Bournulli
	Brier under 低落:0.162
	Brier under 厌恶:0.162
	Brier under 喜悦:0.319
	Brier under 愤怒:0.134
	Average Brier:0.194
	Accuracy:0.562




In [18]:
name = ["Multinomial"
       ,"Multinomial + Isotonic"
       ,"Multinomial + Sigmoid"
       ,"Complement"
       ,"Complement + Isotonic"
       ,"Complement + Sigmoid"
       ,"Bernoulli"
       ,"Bernoulli + Isotonic"
       ,"Bernoulli + Sigmoid"]

models = [MultinomialNB()
         ,CalibratedClassifierCV(MultinomialNB(), cv=5, method='isotonic')
         ,CalibratedClassifierCV(MultinomialNB(), cv=5, method='sigmoid')
         ,ComplementNB()
         ,CalibratedClassifierCV(ComplementNB(), cv=5, method='isotonic')
         ,CalibratedClassifierCV(ComplementNB(), cv=5, method='sigmoid')
         ,BernoulliNB()
         ,CalibratedClassifierCV(BernoulliNB(), cv=5, method='isotonic')
         ,CalibratedClassifierCV(BernoulliNB(), cv=5, method='sigmoid')
         ]

for name, clf in zip(name, models):
    clf.fit(X_train_, Y_train)
    y_pred = clf.predict(X_test_)
    proba = clf.predict_proba(X_test_)
    score = clf.score(X_test_, Y_test)
    print(name)
    
    # 4个不同的标签取值下的布里尔分数
    Bscore = []
    for i in range(len(np.unique(Y_train))):
        bs = BS(Y_test == i, proba[:, i], pos_label=1)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(le.classes_[i],bs))
        
    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")

Multinomial
	Brier under 低落:0.128
	Brier under 厌恶:0.127
	Brier under 喜悦:0.239
	Brier under 愤怒:0.117
	Average Brier:0.153
	Accuracy:0.568


Multinomial + Isotonic
	Brier under 低落:0.124
	Brier under 厌恶:0.123
	Brier under 喜悦:0.207
	Brier under 愤怒:0.112
	Average Brier:0.142
	Accuracy:0.575


Multinomial + Sigmoid
	Brier under 低落:0.125
	Brier under 厌恶:0.124
	Brier under 喜悦:0.210
	Brier under 愤怒:0.113
	Average Brier:0.143
	Accuracy:0.575


Complement
	Brier under 低落:0.129
	Brier under 厌恶:0.129
	Brier under 喜悦:0.243
	Brier under 愤怒:0.117
	Average Brier:0.155
	Accuracy:0.555


Complement + Isotonic
	Brier under 低落:0.123
	Brier under 厌恶:0.122
	Brier under 喜悦:0.197
	Brier under 愤怒:0.110
	Average Brier:0.138
	Accuracy:0.579


Complement + Sigmoid
	Brier under 低落:0.125
	Brier under 厌恶:0.124
	Brier under 喜悦:0.198
	Brier under 愤怒:0.110
	Average Brier:0.139
	Accuracy:0.578


Bernoulli
	Brier under 低落:0.162
	Brier under 厌恶:0.162
	Brier under 喜悦:0.319
	Brier under 愤怒:0.134
	Average Brier:0.194
	Accurac

In [19]:
alpha_list = np.linspace(0.1, 1, 10)
print("Complement + Isotonic")

for alpha in alpha_list:
    clf = CalibratedClassifierCV(ComplementNB(alpha=alpha), cv=5, method='isotonic')
    clf.fit(X_train_, Y_train)
    y_pred = clf.predict(X_test_)
    proba = clf.predict_proba(X_test_)
    score = clf.score(X_test_, Y_test)
    
    # 4个不同的标签取值下的布里尔分数
    print(alpha)
    Bscore = []
    for i in range(len(np.unique(Y_train))):
        bs = BS(Y_test == i, proba[:, i], pos_label=1)
        Bscore.append(bs)
        print("\tBrier under {}:{:.3f}".format(le.classes_[i],bs))
        
    print("\tAverage Brier:{:.3f}".format(np.mean(Bscore)))
    print("\tAccuracy:{:.3f}".format(score))
    print("\n")

Complement + Isotonic
0.1
	Brier under 低落:0.123
	Brier under 厌恶:0.123
	Brier under 喜悦:0.191
	Brier under 愤怒:0.110
	Average Brier:0.137
	Accuracy:0.579


0.2
	Brier under 低落:0.123
	Brier under 厌恶:0.123
	Brier under 喜悦:0.191
	Brier under 愤怒:0.109
	Average Brier:0.136
	Accuracy:0.580


0.30000000000000004
	Brier under 低落:0.123
	Brier under 厌恶:0.122
	Brier under 喜悦:0.191
	Brier under 愤怒:0.109
	Average Brier:0.136
	Accuracy:0.581


0.4
	Brier under 低落:0.123
	Brier under 厌恶:0.122
	Brier under 喜悦:0.192
	Brier under 愤怒:0.109
	Average Brier:0.136
	Accuracy:0.581


0.5
	Brier under 低落:0.122
	Brier under 厌恶:0.122
	Brier under 喜悦:0.192
	Brier under 愤怒:0.109
	Average Brier:0.137
	Accuracy:0.581


0.6
	Brier under 低落:0.122
	Brier under 厌恶:0.122
	Brier under 喜悦:0.193
	Brier under 愤怒:0.109
	Average Brier:0.137
	Accuracy:0.581


0.7000000000000001
	Brier under 低落:0.122
	Brier under 厌恶:0.122
	Brier under 喜悦:0.194
	Brier under 愤怒:0.109
	Average Brier:0.137
	Accuracy:0.580


0.8
	Brier under 低落:0.123
	Bri

In [20]:
from sklearn.linear_model import LogisticRegression
logi = LogisticRegression(max_iter=100).fit(X_train_, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
logi.score(X_train_, Y_train)

0.6550765182007166

In [22]:
logi.score(X_test_, Y_test)

0.5811347251094697