#  一、数据集结构

In [93]:
# 使用github上的库（kaggle上的可能是旧版本）
import efaqa_corpus_zh
data = list(efaqa_corpus_zh.load())


In [94]:
num_post = len(data)
num_sentence = sum([len(post["chats"]) for post in data])
avg_num_word = sum(len(chat["value"]) for post in data for chat in post["chats"]) / num_sentence

print("帖子数量", num_post)
print("文本条数（不计title）", num_sentence)
print("帖均文本条数", num_sentence / num_post)
print("文本平均长度（不计title）/字", avg_num_word)

帖子数量 20000
文本条数（不计title） 207745
帖均文本条数 10.38725
文本平均长度（不计title）/字 17.68224987364317


In [95]:
# 数据集包含20000个样本
# 每个样本即为一个帖子，内含若干跟帖和相关信息
data[0]  # 展示一条数据

{'chats': [{'label': {'knowledge': False, 'negative': False, 'question': True},
   'sender': 'audience',
   'time': '11:02:45',
   'type': 'textMessage',
   'value': '这样的议论是针对谁呢？'},
  {'label': {'knowledge': False, 'negative': False, 'question': False},
   'sender': 'audience',
   'time': '11:08:38',
   'type': 'textMessage',
   'value': '我也是一个从小被这样训到大的女生哦，总会被指责缺心少肺、没心眼儿、没眼力见儿、看不出来眉眼高低等等。不过在我成长一段时间之后，发现这件事情其实很简单，也没有什么大的问题。如果你愿意的话，可以找我聊聊，倾诉一下你遇到的事情，希望能够帮到你。我是树洞小太阳，欢迎你来找我玩❤'},
  {'label': {'knowledge': False, 'negative': False, 'question': False},
   'sender': 'audience',
   'time': '11:15:17',
   'type': 'textMessage',
   'value': '好惨'},
  {'label': {'knowledge': False, 'negative': False, 'question': False},
   'sender': 'audience',
   'time': '11:15:35',
   'type': 'textMessage',
   'value': '原生家庭也这么对你吗'}],
 'date': '2020-03-02 11:01:08',
 'label': {'s1': '1.13', 's2': '2.7', 's3': '3.4'},
 'owner': '匿名',
 'title': '女 听过别人最多的议论就是干啥啥不行不长心眼没有脑子'}

In [37]:
# 每个帖子是一个字典对象，包含对话（跟帖）、发帖人、标题、hash码、心理状态标签（label）五个属性
# label标签是重要属性
print(type(data[0]))
data[0].keys()

<class 'dict'>


dict_keys(['chats', 'date', 'label', 'owner', 'title'])

In [38]:
# chats字段是一个列表，包含若干跟帖
print(type(data[0]["chats"]))

# 跟帖的结构是字典对象，包含跟帖时间、内容、发送者（是楼主还是其他用户）、内容类型（文本还是..）、标签（是否为问句，是否知识，是否为消极消息）
data[0]["chats"][0]

<class 'list'>


{'label': {'knowledge': False, 'negative': False, 'question': True},
 'sender': 'audience',
 'time': '11:02:45',
 'type': 'textMessage',
 'value': '这样的议论是针对谁呢？'}

In [39]:
# 文本类型是唯一的回帖类型
content_types = set([follow["type"] for lt in data for follow in lt["chats"] ])
content_types

{'textMessage'}

In [40]:
data_fields = set([tuple(chat.keys()) for post in data for chat in post["chats"]])
data_fields

{('label', 'sender', 'time', 'type', 'value')}

# 二、数据集描述性数据分析

In [41]:
# 心理状态标签
for post in data[:2]:
    print("------------------")
    for item in post["label"].items():
        print(item)

------------------
('s1', '1.13')
('s2', '2.7')
('s3', '3.4')
------------------
('s1', '1.16')
('s2', '2.7')
('s3', '3.4')


# 三、文本预处理

In [42]:
import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [43]:
stop_word_path = "stop_words.txt"
# 读取停用词
stopwords = [line.strip() for line in open(stop_word_path, 'r', encoding='utf-8').readlines()]

stopwords += [" ", "，", "。"]

len(stopwords)

2315

In [44]:
#  将每个帖子中的文本分离出来
lines = [[post["title"]] + [chat["value"] for chat in post["chats"]] for post in data]

In [45]:
lines[0]

['女 听过别人最多的议论就是干啥啥不行不长心眼没有脑子',
 '这样的议论是针对谁呢？',
 '我也是一个从小被这样训到大的女生哦，总会被指责缺心少肺、没心眼儿、没眼力见儿、看不出来眉眼高低等等。不过在我成长一段时间之后，发现这件事情其实很简单，也没有什么大的问题。如果你愿意的话，可以找我聊聊，倾诉一下你遇到的事情，希望能够帮到你。我是树洞小太阳，欢迎你来找我玩❤',
 '好惨',
 '原生家庭也这么对你吗']

In [46]:
import jieba
from functools import reduce
from tqdm import tqdm

In [47]:
# 分词
x = []
for cluster in tqdm(lines):
    x_line = []
    for line in cluster:
        tmp = [char for char in jieba.lcut(line) if char not in stopwords]
        x_line.append(tmp)
    x.append(reduce(lambda a, b: a+b, x_line))

100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [01:18<00:00, 256.33it/s]


In [48]:
x[0]

['女',
 '听过',
 '最多',
 '议论',
 '干',
 '不行',
 '长',
 '心眼',
 '脑子',
 '议论',
 '训到',
 '女生',
 '总会',
 '指责',
 '缺心少肺',
 '没',
 '心眼儿',
 '没',
 '眼力',
 '见儿',
 '看不出来',
 '眉眼高低',
 '成长',
 '一段时间',
 '发现',
 '这件',
 '事情',
 '简单',
 '找',
 '聊聊',
 '倾诉',
 '事情',
 '希望',
 '帮到',
 '树洞',
 '太阳',
 '找',
 '玩',
 '❤',
 '好惨',
 '原生',
 '家庭']

In [49]:
# 预测目标y（心理状态标签）
y_s1_raw = []
y_s2_raw = []
y_s3_raw = []

for post in tqdm(data):
    cluster = {item[0]: item[1] for item in post["label"].items()} 
    y_s1_raw.append(cluster["s1"])
    y_s2_raw.append(cluster["s2"])
    y_s3_raw.append(cluster["s3"])

100%|████████████████████████████████████████████████████████████████████████| 20000/20000 [00:00<00:00, 417507.78it/s]


In [50]:
y_map = {}
for label in y_s1_raw + y_s2_raw + y_s3_raw:
    if label not in y_map:
        y_map[label] = len(y_map)

In [51]:
y_map

{'1.13': 0,
 '1.16': 1,
 '1.6': 2,
 '1.9': 3,
 '1.14': 4,
 '1.7': 5,
 '1.12': 6,
 '1.3': 7,
 '1.15': 8,
 '1.8': 9,
 '1.2': 10,
 '1.1': 11,
 '1.10': 12,
 '1.11': 13,
 '1.4': 14,
 '1.5': 15,
 '1.18': 16,
 '1.17': 17,
 '1.19': 18,
 '2.7': 19,
 '2.1': 20,
 '2.2': 21,
 '2.8': 22,
 '2.3': 23,
 '2.4': 24,
 '2.5': 25,
 '2.6': 26,
 '3.4': 27,
 '3.2': 28,
 '3.3': 29,
 '3.6': 30,
 '3.5': 31}

In [52]:
y_s1 = [y_map[label] for label in y_s1_raw]
y_s2 = [y_map[label] for label in y_s2_raw]
y_s3 = [y_map[label] for label in y_s3_raw]

# 获得词向量(s1)

In [53]:
import gensim
from sklearn.model_selection import train_test_split

In [54]:
# 分割测试集和训练集
x_train, x_test, y_train, y_test \
        = train_test_split(x, y_s1, test_size=0.3)

In [55]:
# 获得词嵌入
word2vec_model = gensim.models.Word2Vec(x_train)

In [57]:
# 文档向量计算方法：词向量的加权平均
def get_doc_vec(x, word2vec_model):
    doc_vec_s1 = []
    zero_count = 0
    for doc in tqdm(x):
        tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
        if len(tmp) == 0:
            avg = np.zeros(len(doc_vec_s1[0]))
            zero_count += 1
        else:
            avg = [item/len(tmp) for item in reduce(lambda lt1, lt2: [lt1[index]+lt2[index] for index in range(len(lt1))], tmp)]
        doc_vec_s1.append(avg)
    print("零向量占比", zero_count/len(doc_vec_s1))
    return doc_vec_s1

In [58]:
doc_vec_s1 = get_doc_vec(x_train, word2vec_model)

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|███████████████████████████████████████████████████████████████████████████| 14000/14000 [00:29<00:00, 467.35it/s]

零向量占比 7.142857142857143e-05





In [59]:
len(doc_vec_s1)

14000

In [60]:
len(doc_vec_s1[0])

100

# 训练模型（s1）

In [61]:
from sklearn.svm import SVC  # 支持向量机分类器

In [62]:
svm_model = SVC()

In [63]:
svm_model.fit(doc_vec_s1, y_train)

In [64]:
predicted = svm_model.predict(get_doc_vec(x_test, word2vec_model))

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:09<00:00, 613.25it/s]


零向量占比 0.0


In [65]:
from sklearn import metrics  # 模型评价工具
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.53      0.07      0.12       146
           1       0.57      0.82      0.67      2133
           2       0.68      0.31      0.42       123
           3       0.61      0.84      0.71      1524
           4       0.00      0.00      0.00        55
           5       0.76      0.06      0.11       260
           6       0.00      0.00      0.00       131
           7       0.50      0.46      0.48       589
           8       0.00      0.00      0.00        65
           9       0.62      0.05      0.10       249
          10       0.56      0.17      0.27        80
          11       0.62      0.49      0.55       177
          12       0.22      0.02      0.03       131
          13       1.00      0.00      0.01       223
          14       0.00      0.00      0.00        32
          15       0.00      0.00      0.00        12
          16       0.00      0.00      0.00        27
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [66]:
# 换模型
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [67]:
rf.fit(doc_vec_s1, y_train)
predicted = rf.predict(get_doc_vec(x_test, word2vec_model))
print(metrics.classification_report(y_test, predicted))

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:09<00:00, 648.52it/s]


零向量占比 0.0
              precision    recall  f1-score   support

           0       0.42      0.13      0.20       146
           1       0.57      0.82      0.67      2133
           2       0.67      0.29      0.41       123
           3       0.60      0.81      0.69      1524
           4       0.00      0.00      0.00        55
           5       0.53      0.10      0.17       260
           6       0.00      0.00      0.00       131
           7       0.51      0.41      0.46       589
           8       0.00      0.00      0.00        65
           9       0.47      0.12      0.19       249
          10       0.76      0.16      0.27        80
          11       0.58      0.39      0.47       177
          12       0.43      0.02      0.04       131
          13       0.69      0.08      0.14       223
          14       0.00      0.00      0.00        32
          15       0.00      0.00      0.00        12
          16       0.00      0.00      0.00        27
          17     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 训练模型（s2）

In [68]:
# 分割测试集和训练集
x_train, x_test, y_train, y_test \
        = train_test_split(x, y_s2, test_size=0.3)

In [69]:
doc_vec_s2 = get_doc_vec(x_train, word2vec_model)
svm_model = SVC()
svm_model.fit(doc_vec_s2, y_train)
predicted = svm_model.predict(get_doc_vec(x_test, word2vec_model))

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|███████████████████████████████████████████████████████████████████████████| 14000/14000 [00:21<00:00, 659.05it/s]


零向量占比 0.0


  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:09<00:00, 616.59it/s]


零向量占比 0.00016666666666666666


In [70]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

          19       0.89      1.00      0.94      5344
          20       0.00      0.00      0.00       283
          21       0.00      0.00      0.00       199
          22       0.00      0.00      0.00        51
          23       0.00      0.00      0.00        32
          24       0.00      0.00      0.00        72
          25       0.00      0.00      0.00        13
          26       0.00      0.00      0.00         6

    accuracy                           0.89      6000
   macro avg       0.11      0.12      0.12      6000
weighted avg       0.79      0.89      0.84      6000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 训练模型（s3）¶

In [71]:
# 分割测试集和训练集
x_train, x_test, y_train, y_test \
        = train_test_split(x, y_s3, test_size=0.3)

In [72]:
doc_vec_s3 = get_doc_vec(x_train, word2vec_model)
svm_model = SVC()
svm_model.fit(doc_vec_s3, y_train)
predicted = svm_model.predict(get_doc_vec(x_test, word2vec_model))

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|███████████████████████████████████████████████████████████████████████████| 14000/14000 [00:22<00:00, 622.86it/s]


零向量占比 7.142857142857143e-05


  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:09<00:00, 620.70it/s]


零向量占比 0.0


In [73]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

          27       0.98      1.00      0.99      5879
          28       0.00      0.00      0.00        76
          29       0.00      0.00      0.00        28
          30       0.00      0.00      0.00        16
          31       0.00      0.00      0.00         1

    accuracy                           0.98      6000
   macro avg       0.20      0.20      0.20      6000
weighted avg       0.96      0.98      0.97      6000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 转化为二分类来改进

In [74]:
y_s3_bi_map = [0 if item is 27 else 1 for item in y_s3]

  y_s3_bi_map = [0 if item is 27 else 1 for item in y_s3]


In [75]:
# 分割测试集和训练集
x_train, x_test, y_train, y_test \
        = train_test_split(x, y_s3_bi_map, test_size=0.3)

In [76]:
doc_vec_s3 = get_doc_vec(x_train, word2vec_model)
svm_model = SVC()
svm_model.fit(doc_vec_s3, y_train)
predicted = svm_model.predict(get_doc_vec(x_test, word2vec_model))

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|███████████████████████████████████████████████████████████████████████████| 14000/14000 [00:23<00:00, 606.81it/s]


零向量占比 0.0


  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:09<00:00, 608.14it/s]


零向量占比 0.00016666666666666666


In [77]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5887
           1       0.00      0.00      0.00       113

    accuracy                           0.98      6000
   macro avg       0.49      0.50      0.50      6000
weighted avg       0.96      0.98      0.97      6000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [78]:
rf_model = RandomForestClassifier()
rf_model.fit(doc_vec_s3, y_train)
predicted = rf_model.predict(get_doc_vec(x_test, word2vec_model))
print(metrics.classification_report(y_test, predicted))

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:11<00:00, 519.12it/s]


零向量占比 0.00016666666666666666
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      5887
           1       0.00      0.00      0.00       113

    accuracy                           0.98      6000
   macro avg       0.49      0.50      0.50      6000
weighted avg       0.96      0.98      0.97      6000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 均衡规模

In [79]:
x_train_0 = [x_train[i] for i in range(len(x_train)) if y_train[i] is 0]
x_train_1 = [x_train[i] for i in range(len(x_train)) if y_train[i] is 1]

  x_train_0 = [x_train[i] for i in range(len(x_train)) if y_train[i] is 0]
  x_train_1 = [x_train[i] for i in range(len(x_train)) if y_train[i] is 1]


In [80]:
x_train_0 = [x_train[i] for i in range(len(x_train)) if y_train[i] is 0]
x_train_1 = [x_train[i] for i in range(len(x_train)) if y_train[i] is 1]
print(len(x_train_0))
print(len(x_train_1))

13725
275


In [81]:
import random
x_train_0_balanced = random.sample(x_train_0, len(x_train_1))

In [82]:
print(len(x_train_0_balanced))
print(len(x_train_1))

275
275


In [83]:
# 合成新的x_train和y_train
tuples = [(item, 0) for item in x_train_0_balanced] + [(item, 1) for item in x_train_1]
random.shuffle(tuples)  # 打乱顺序
x_train = [item[0] for item in tuples]
y_train = [item[1] for item in tuples]

print(len(x_train))
print(len(y_train))

550
550


In [84]:
rf_model = RandomForestClassifier()
rf_model.fit(get_doc_vec(x_train, word2vec_model), y_train)
predicted = rf_model.predict(get_doc_vec(x_test, word2vec_model))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:01<00:00, 466.95it/s]


零向量占比 0.0


  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:10<00:00, 597.24it/s]


零向量占比 0.00016666666666666666
              precision    recall  f1-score   support

           0       0.99      0.70      0.82      5887
           1       0.05      0.81      0.09       113

    accuracy                           0.70      6000
   macro avg       0.52      0.75      0.46      6000
weighted avg       0.98      0.70      0.80      6000



In [85]:
doc_vec_s3 = get_doc_vec(x_train, word2vec_model)
svm_model = SVC()
svm_model.fit(doc_vec_s3, y_train)
predicted = svm_model.predict(get_doc_vec(x_test, word2vec_model))

  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|███████████████████████████████████████████████████████████████████████████████| 550/550 [00:00<00:00, 567.36it/s]


零向量占比 0.0


  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
  tmp = [word2vec_model[word] for word in doc if word in word2vec_model]
100%|█████████████████████████████████████████████████████████████████████████████| 6000/6000 [00:09<00:00, 633.80it/s]


零向量占比 0.00016666666666666666


In [86]:
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.99      0.68      0.81      5887
           1       0.05      0.81      0.09       113

    accuracy                           0.69      6000
   macro avg       0.52      0.75      0.45      6000
weighted avg       0.98      0.69      0.80      6000

