In [1]:
import json
import math
import torch
import pickle
import transformers

import numpy as np
import pandas as pd

In [2]:
from pathlib import Path
from itertools import chain
from tqdm import tqdm

In [3]:
from sklearn import metrics
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from transformers import BertTokenizer
from transformers import BertModel

In [4]:
from torch import cuda
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Data

In [5]:
def get_basic(p):
    """Load one game's JSON file and return its top-level record.

    Args:
        p: a ``pathlib.Path`` (or any object with a compatible ``open``)
            pointing at a JSON file whose top-level object has a '标签'
            mapping keyed by tag name.

    Returns:
        The decoded top-level dict, with '标签' replaced by the list of its
        tag names (in file order).

    Raises:
        KeyError: if the file has no '标签' entry.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with p.open(encoding='utf-8') as f:
        d = json.load(f)
    # Materialise the tag names as a list: a dict_keys view cannot be pickled
    # and would keep the whole tag -> comments mapping alive inside the record.
    d['标签'] = list(d['标签'])
    return d

In [6]:
def get_comment(d, tag, comment):
    """Turn one raw comment dict into a flat row, mutating it in place.

    The comment is labelled with `tag` under '标签', its '回复' entry is
    discarded, and every entry of `d` is then copied onto it. On a key
    clash the value from `d` replaces the comment's own value.

    Returns the same `comment` object that was passed in.
    Raises KeyError when `comment` has no '回复' key.
    """
    comment['标签'] = tag
    comment.pop('回复')
    for key, value in d.items():
        comment[key] = value
    return comment

In [7]:
def get_comments(p):
    """Load one game's JSON file and return its comments as flat rows.

    Args:
        p: a ``pathlib.Path`` (or any object with a compatible ``open``)
            pointing at a JSON file whose top-level object has a '标签'
            mapping of tag name -> list of comment dicts.

    Returns:
        A list with one dict per (tag, comment) pair, each produced by
        ``get_comment`` from the file's remaining top-level fields, the tag
        name and the comment.

    Raises:
        KeyError: if the file has no '标签' entry.
    """
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with p.open(encoding='utf-8') as f:
        d = json.load(f)
    # Detach the per-tag comments so that `d` holds only the remaining top-level fields.
    tag_comments = d.pop('标签')
    return [
        get_comment(d, tag, comment)
        for tag, comments in tag_comments.items()
        for comment in comments
    ]

### load

In [8]:
# Root data directory, given relative to the notebook's working directory.
path_root = Path('./data')

In [9]:
# Name of the dataset directory inside the data root.
path_data = Path('comment.taptap-20210203-1')

In [10]:
# Directory whose *.json files are loaded by the cells that follow.
path = path_root/path_data

In [11]:
# One row per *.json file in the dataset directory (top-level fields plus tag names).
# NOTE(review): Path.glob does not guarantee a sorted order, so row order may vary between machines.
%time df_basic = pd.DataFrame([get_basic(p) for p in path.glob('*.json')])
print(df_basic.shape)
df_basic.head(1)

CPU times: user 10.3 s, sys: 2.76 s, total: 13.1 s
Wall time: 13.1 s
(150, 5)


Unnamed: 0,游戏名,游戏评分,游戏url,评论数量,标签
0,精灵契约,6.8,https://www.taptap.com/app/142111,870,"(过于氪金, 体验不错, 画面优良, 运营不足, 有趣好玩, 玩家互动多, 抽卡概率低, 厂..."


In [12]:
# One row per (tag, comment) pair over all files; each file is parsed again here.
# tqdm wraps the glob generator, so the progress counter has no known total.
%time df_comments = pd.DataFrame(chain.from_iterable([get_comments(p) for p in tqdm(path.glob('*.json'))]))
print(df_comments.shape)
df_comments.head(1)

150it [00:14, 10.60it/s]


CPU times: user 13.7 s, sys: 2.89 s, total: 16.6 s
Wall time: 16.5 s
(488452, 14)


Unnamed: 0,用户名,评论时间,游戏评分,游戏时长,内容,手机型号,欢乐,点赞,点踩,回复量,标签,游戏名,游戏url,评论数量
0,Foo云少,2021-01-19 10:07:37,6.8,0,不好玩太氪金了我有段时间没有玩号在那个区都不知道了而且这个游戏刚刚出来的时候玩的10区的,华为畅享9 Plus,0,0,0,0,过于氪金,精灵契约,https://www.taptap.com/app/142111,870


In [13]:
# Keep only the rows filed under a topic tag: rows whose tag is one of the
# rating, platform or play-time tags listed here are dropped.
df_comments_topic = df_comments.loc[
    ~df_comments['标签'].isin(['好评', '中评', '差评', 'android', 'ios', 'web', '有游戏时长'])
]
print(df_comments_topic.shape)
df_comments_topic.head(1)

(45450, 14)


Unnamed: 0,用户名,评论时间,游戏评分,游戏时长,内容,手机型号,欢乐,点赞,点踩,回复量,标签,游戏名,游戏url,评论数量
0,Foo云少,2021-01-19 10:07:37,6.8,0,不好玩太氪金了我有段时间没有玩号在那个区都不知道了而且这个游戏刚刚出来的时候玩的10区的,华为畅享9 Plus,0,0,0,0,过于氪金,精灵契约,https://www.taptap.com/app/142111,870


In [14]:
# Collapse to one row per distinct comment text, gathering the tags of all its rows into a list.
%time df_comments_topics = df_comments_topic[['内容', '标签']].groupby(['内容']).agg(list).reset_index()
print(df_comments_topics.shape)
df_comments_topics.head(3)

CPU times: user 1 s, sys: 8 ms, total: 1.01 s
Wall time: 1.01 s
(30378, 2)


Unnamed: 0,内容,标签
0,\n\n\n玩过两个赛季的人，游戏消遣还不错，不过卡池锁卡劝退，一赛季初期获得刘备张飞，二赛...,[运营不足]
1,\n\n10:30开始进游戏就各种bug，各种系统维护。下午玩了不到一个小时，服务器又崩。无...,[运营不足]
2,"\n\n游戏模式还是挺有意思，可以自己铺路建设,把不同文明的帝国从几个人发展到上万人建成自己...","[体验不错, 有趣好玩]"


In [15]:
# How many distinct comment texts carry 1, 2, 3, ... topic tags.
df_comments_topics['标签'].apply(len).value_counts()

1    20056
2     7069
3     2228
4      694
5      222
6       83
7       20
8        6
Name: 标签, dtype: int64

In [16]:
# Preview the first few comment texts that carry more than one topic tag.
df_comments_topics.loc[df_comments_topics.标签.apply(len) > 1].head(3)

Unnamed: 0,内容,标签
2,"\n\n游戏模式还是挺有意思，可以自己铺路建设,把不同文明的帝国从几个人发展到上万人建成自己...","[体验不错, 有趣好玩]"
4,\n\n玩了差不多一个月了，这两天游戏里的汉家松鼠让我评论，那么我就来评一评。游戏立意不错，...,"[体验不错, 值得花钱, 厂商良心]"
5,\n\n这款游戏整体来说挺好的，剧情画风都挺不错，但是还是有一些问题。\n①难度设置的有些不...,"[剧情丰富, 画面优良]"


### eda

#### content length

In [17]:
# Length of each distinct comment text, as given by len() on the '内容' value.
df_comments_length = df_comments_topics['内容'].apply(len)

In [18]:
# Summary statistics of the comment lengths (count, mean, quartiles, extremes).
df_comments_length.describe()

count    30378.000000
mean       183.025446
std        286.144866
min         21.000000
25%         44.000000
50%         87.000000
75%        203.000000
max      13714.000000
Name: 内容, dtype: float64

#### topic

In [19]:
# Every tag name that occurs for at least one game.
tags = {tag for game_tags in df_basic['标签'] for tag in game_tags}
print(len(tags))
tags

50


{'IP还原差',
 'UI体验好',
 'UI体验差',
 'android',
 'ios',
 'web',
 '上手难度大',
 '中评',
 '优化相关',
 '体验不错',
 '体验较差',
 '值得花钱',
 '剧情丰富',
 '剧情单调',
 '厂商不给力',
 '厂商良心',
 '太肝了',
 '好评',
 '尊重原著',
 '差评',
 '平衡性好',
 '平衡性差',
 '广告太多',
 '广告影响小',
 '抄袭嫌疑',
 '护肝',
 '抽卡概率低',
 '抽卡概率高',
 '操作简单',
 '操作麻烦',
 '新手友好',
 '有创新',
 '有游戏时长',
 '有趣好玩',
 '玩家互动多',
 '玩家互动少',
 '玩法较差',
 '画面优良',
 '画面粗糙',
 '福利好',
 '福利差',
 '自由度低',
 '自由度高',
 '过于氪金',
 '运营不足',
 '运营给力',
 '配置要求低',
 '配置要求高',
 '音效很棒',
 '音效较差'}

In [20]:
# Shape after keeping one row per distinct comment text, across all tags.
df_comments.drop_duplicates('内容').shape

(182070, 14)

In [21]:
# Number of topic tags per distinct comment text (same computation as cell In [15]).
df_comments_topics.标签.apply(len).value_counts()

1    20056
2     7069
3     2228
4      694
5      222
6       83
7       20
8        6
Name: 标签, dtype: int64

#### rating tags (+1/0/-1: 好评/中评/差评)

In [22]:
# Rows filed under one of the three rating tags.
df_comments_pnn = df_comments.loc[df_comments['标签'].isin(['好评', '中评', '差评'])]
print(df_comments_pnn.shape)
df_comments_pnn.head(1)

(195017, 14)


Unnamed: 0,用户名,评论时间,游戏评分,游戏时长,内容,手机型号,欢乐,点赞,点踩,回复量,标签,游戏名,游戏url,评论数量
1369,白鳞小蛇,2021-01-26 11:37:59,6.8,0,还不错，就是太依赖于抽卡，没什么英雄搭配\r\n\r\n这游戏凉了么？\r\n,,1,0,0,0,好评,精灵契约,https://www.taptap.com/app/142111,870


In [23]:
# Shape after keeping one row per distinct comment text among the rating-tagged rows.
df_comments_pnn.drop_duplicates('内容').shape

(173610, 14)

In [24]:
# Relative frequency of each rating tag; duplicate comment texts are not removed here.
df_comments_pnn.标签.value_counts(normalize=True)

好评    0.591400
差评    0.291764
中评    0.116836
Name: 标签, dtype: float64

### save

In [25]:
# Persist these objects with IPython's %store magic so that another notebook
# can restore them with `%store -r <name>`.
%store df_comments
%store df_comments_length
%store df_comments_topics

Stored 'df_comments' (DataFrame)
Stored 'df_comments_length' (Series)
Stored 'df_comments_topics' (DataFrame)


## Ref

Sharing variables between notebooks with `%store`: see [1] <br>
TODO: why del in desc

[1] https://stackoverflow.com/questions/35935670/share-variables-between-different-jupyter-notebooks