In [1]:
import re
import sys
import json
import jieba
import pickle
import logging
import unicodedata

import numpy as np
import pandas as pd
import lightgbm as lgb
import matplotlib.pyplot as plt

from time import time
from tqdm import tqdm
from pathlib import Path
from functools import partial
from itertools import chain
from itertools import zip_longest
from scipy.sparse import csr_matrix
from collections import namedtuple

from sklearn.datasets import fetch_20newsgroups

from optparse import OptionParser
from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2

from sklearn.linear_model import RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn.utils.extmath import density

In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Show (practically) every row and the full text of every cell when displaying frames.
# Use the fully-qualified option key: the short pattern 'max_rows' is ambiguous on
# newer pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 10**5)
pd.set_option('display.max_colwidth', 10**5)

In [4]:
# Configure the root logger: INFO and above, prefixed with timestamp and level name.
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

In [5]:
def is_interactive():
    """Return True when the ``__main__`` module has no ``__file__`` attribute.

    This is used as the signal that the code is running in an interactive
    session (notebook / console) rather than as a script file.
    """
    main_module = sys.modules['__main__']
    if hasattr(main_module, '__file__'):
        return False
    return True

In [6]:
# Command-line options for the benchmark. Every flag is optional; flags that are
# not supplied parse to None, except --report (default True) and --n_features
# (default 2 ** 16).
op = OptionParser()

op.add_option(
    "--report",
    dest="print_report",
    action="store_true",
    default=True,
    help="Print a detailed classification report.",
)

op.add_option(
    "--chi2_select",
    dest="select_chi2",
    action="store",
    type="int",
    help="Select some number of features using a chi-squared test",
)

op.add_option(
    "--confusion_matrix",
    dest="print_cm",
    action="store_true",
    help="Print the confusion matrix.",
)

op.add_option(
    "--top10",
    dest="print_top10",
    action="store_true",
    help="Print ten most discriminative terms per class for every classifier.",
)

op.add_option(
    "--use_hashing",
    action="store_true",
    help="Use a hashing vectorizer.",
)

# Kept as the last expression so the cell still displays the created Option.
op.add_option(
    "--n_features",
    action="store",
    type="int",
    default=2 ** 16,
    help="n_features when using the hashing vectorizer.",
)

<Option at 0x7f1e53a8a220: --n_features>

In [7]:
# Work-around for Jupyter notebook and the IPython console: when running
# interactively, ignore sys.argv and parse an empty argument list instead.
#
# op.parse_args returns a pair:
#   opts - an object holding the value of every option (None when the user did
#          not supply an option that has no default)
#   args - the list of positional arguments left over after parsing options
if is_interactive():
    argv = []
else:
    argv = sys.argv[1:]

opts, args = op.parse_args(argv)
opts

<Values at 0x7f1e53a8a910: {'print_report': True, 'select_chi2': None, 'print_cm': None, 'print_top10': None, 'use_hashing': None, 'n_features': 65536}>

In [8]:
# Reject stray positional arguments. op.error() prints the usage message and
# terminates the process itself (exit status 2), so a following sys.exit() would
# never be reached and is omitted.
if args:
    op.error("this script takes no arguments.")

# Echo the module docstring and the available options, followed by a blank line.
print(__doc__)
op.print_help()
print()

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --report              Print a detailed classification report.
  --chi2_select=SELECT_CHI2
                        Select some number of features using a chi-squared
                        test
  --confusion_matrix    Print the confusion matrix.
  --top10               Print ten most discriminative terms per class for
                        every classifier.
  --use_hashing         Use a hashing vectorizer.
  --n_features=N_FEATURES
                        n_features when using the hashing vectorizer.



# Data

## load

In [9]:
def size_mb(docs):
    """Return the combined UTF-8 encoded size of the strings in ``docs``, in MB (1e6 bytes)."""
    total_bytes = 0
    for text in docs:
        total_bytes += len(text.encode('utf-8'))
    return total_bytes / 1e6

In [10]:
def get_basic(p):
    """Load one JSON file and return its top-level record with tag names only.

    Parameters
    ----------
    p : pathlib.Path
        Path of the JSON file. It must contain an object with a '标签' mapping.

    Returns
    -------
    dict
        The parsed object, with '标签' replaced by the keys view of the original
        tag mapping.
    """
    # Read explicitly as UTF-8: the files hold Chinese text and the default
    # encoding of open() depends on the platform locale.
    with p.open(encoding='utf-8') as f:
        d = json.load(f)
    # NOTE(review): a keys view keeps the whole tag mapping (values included)
    # alive for as long as the record exists; materialise it with list(...) if
    # memory matters and callers only need the names.
    d['标签'] = d['标签'].keys()
    return d

In [11]:
def get_comment(d, tag, comment):
    """Turn one raw comment dict into a flat record, in place.

    Parameters
    ----------
    d : dict
        Fields copied onto every comment (merged in with ``dict.update``).
    tag : str
        Tag the comment is listed under; stored under '标签'.
    comment : dict
        Raw comment. It is mutated and returned (no copy is made).

    Returns
    -------
    dict
        The same ``comment`` object, without '回复', with '标签' set and with
        every key of ``d`` merged in.
    """
    comment['标签'] = tag
    # Drop the reply field; tolerate comments that do not carry one instead of
    # failing the whole load with a KeyError.
    comment.pop('回复', None)
    # Keys present in both dicts take the value from ``d``.
    # NOTE(review): if a comment has its own '游戏评分', it is replaced here by
    # the value from ``d`` — confirm that this is intended.
    comment.update(d)
    return comment

In [12]:
def get_comments(p):
    """Load one JSON file and return a flat list with one record per (tag, comment).

    Parameters
    ----------
    p : pathlib.Path
        Path of the JSON file. Its '标签' entry must map each tag to a list of
        comment dicts.

    Returns
    -------
    list of dict
        One record per comment per tag, each built by ``get_comment`` from the
        remaining top-level fields, the tag and the comment.
    """
    # Read explicitly as UTF-8: the files hold Chinese text and the default
    # encoding of open() depends on the platform locale.
    with p.open(encoding='utf-8') as f:
        d = json.load(f)
    # Detach the tag -> comments mapping so that only the remaining top-level
    # fields are merged into each comment.
    tag_comments = d.pop('标签')
    return [
        get_comment(d, tag, comment)
        for tag, comments in tag_comments.items()
        for comment in comments
    ]

In [13]:
# Root data directory, relative to the current working directory.
path_root = Path('./data')

In [14]:
# Dataset folder name (presumably a TapTap comment snapshot from 2021-02-03 — confirm).
path_data = Path('comment.taptap-20210203-1')

In [15]:
# Directory that holds the per-game JSON files.
path = path_root.joinpath(path_data)

In [16]:
# One row per JSON file, built from the file's top-level fields (see get_basic).
# %time reports how long the load takes.
# NOTE(review): row order follows the order in which glob yields the files;
# wrap it in sorted() if a reproducible order is needed.
%time df_basic = pd.DataFrame([get_basic(p) for p in path.glob('*.json')])
print(df_basic.shape)
df_basic.head(1)

CPU times: user 13.5 s, sys: 3.29 s, total: 16.8 s
Wall time: 16.8 s
(150, 5)


Unnamed: 0,游戏名,游戏评分,游戏url,评论数量,标签
0,精灵契约,6.8,https://www.taptap.com/app/142111,870,"(过于氪金, 体验不错, 画面优良, 运营不足, 有趣好玩, 玩家互动多, 抽卡概率低, 厂商不给力, 平衡性差, 新手友好, 抄袭嫌疑, web, ios, android, 有游戏时长, 好评, 中评, 差评)"


In [17]:
# One row per (tag, comment) pair across all JSON files (see get_comments).
# Every file is opened and parsed again here, independently of the df_basic load.
%time df_comments = pd.DataFrame(chain.from_iterable([get_comments(p) for p in tqdm(path.glob('*.json'))]))
print(df_comments.shape)
df_comments.head(1)

150it [00:17,  8.74it/s]


CPU times: user 17.4 s, sys: 3.12 s, total: 20.5 s
Wall time: 20.5 s
(488452, 14)


Unnamed: 0,用户名,评论时间,游戏评分,游戏时长,内容,手机型号,欢乐,点赞,点踩,回复量,标签,游戏名,游戏url,评论数量
0,Foo云少,2021-01-19 10:07:37,6.8,0,不好玩太氪金了我有段时间没有玩号在那个区都不知道了而且这个游戏刚刚出来的时候玩的10区的,华为畅享9 Plus,0,0,0,0,过于氪金,精灵契约,https://www.taptap.com/app/142111,870


In [18]:
# Keep only rows whose tag is a topic, i.e. not one of the three rating tags.
df_comments_topic = df_comments[~df_comments['标签'].isin(['好评', '中评', '差评'])]
print(df_comments_topic.shape)
df_comments_topic.head(1)

(293435, 14)


Unnamed: 0,用户名,评论时间,游戏评分,游戏时长,内容,手机型号,欢乐,点赞,点踩,回复量,标签,游戏名,游戏url,评论数量
0,Foo云少,2021-01-19 10:07:37,6.8,0,不好玩太氪金了我有段时间没有玩号在那个区都不知道了而且这个游戏刚刚出来的时候玩的10区的,华为畅享9 Plus,0,0,0,0,过于氪金,精灵契约,https://www.taptap.com/app/142111,870


In [19]:
# Collapse identical comment texts: group by '内容' and collect all tags attached
# to the same text into one list per row.
%time df_comments_topics = df_comments_topic[['内容', '标签']].groupby(['内容']).agg(list).reset_index()
print(df_comments_topics.shape)
df_comments_topics.head(1)

CPU times: user 7.93 s, sys: 92 ms, total: 8.02 s
Wall time: 8.01 s
(175131, 2)


Unnamed: 0,内容,标签
0,\t 我看大家都没人说魂器，就我上次测试的经验来看，魂器其实也是很重要的，好的魂器搭配上合适的武将，效果绝对是1+1＞2的。\n官方也会建议一些武将搭配特定属性的魂器，比如奶妈就是回复，菜刀就是物理输出或者连携输出魂器，法师当然就是法器。\n我觉得有几个魂器是一定要有的：\n墨家礼装，减少对方输出的，这个基本是队伍辅助必备的，还有月下美人图，强力控制。物理的话，很多了，金箍棒，倚天剑，但是都需要武将来炼化，这个就看你怎么抉择了。哈哈，舍不得孩子套不着狼嘛～如果你搭配的好，真的是有控制，有输出，有回复，能组合出非常完美的搭配，就暂且说这么多吧。,[android]


## analysis

### topic

In [20]:
# Union of the tag names of every row in df_basic.
tags = {tag for game_tags in df_basic['标签'] for tag in game_tags}
print(len(tags))
tags

50


{'IP还原差',
 'UI体验好',
 'UI体验差',
 'android',
 'ios',
 'web',
 '上手难度大',
 '中评',
 '优化相关',
 '体验不错',
 '体验较差',
 '值得花钱',
 '剧情丰富',
 '剧情单调',
 '厂商不给力',
 '厂商良心',
 '太肝了',
 '好评',
 '尊重原著',
 '差评',
 '平衡性好',
 '平衡性差',
 '广告太多',
 '广告影响小',
 '抄袭嫌疑',
 '护肝',
 '抽卡概率低',
 '抽卡概率高',
 '操作简单',
 '操作麻烦',
 '新手友好',
 '有创新',
 '有游戏时长',
 '有趣好玩',
 '玩家互动多',
 '玩家互动少',
 '玩法较差',
 '画面优良',
 '画面粗糙',
 '福利好',
 '福利差',
 '自由度低',
 '自由度高',
 '过于氪金',
 '运营不足',
 '运营给力',
 '配置要求低',
 '配置要求高',
 '音效很棒',
 '音效较差'}

In [21]:
# Shape after keeping the first row of each distinct comment text.
df_comments.drop_duplicates(subset='内容').shape

(182070, 14)

In [22]:
# Distribution of the number of tags collected per distinct comment text.
df_comments_topics['标签'].map(len).value_counts()

1       105467
2        52361
3        10861
4         3874
5         1363
6          483
7          250
8           93
9           56
13          31
10          25
11          24
15          18
12          18
16          14
14          14
17          10
18           9
20           8
21           7
19           7
25           6
22           5
29           5
28           4
24           4
31           4
33           4
26           4
44           3
36           3
43           3
32           3
46           3
47           3
49           3
23           3
30           3
53           3
52           2
117          2
100          2
60           2
42           2
40           2
27           2
38           2
35           2
277          1
267          1
263          1
139          1
153          1
142          1
151          1
655          1
272          1
152          1
2453         1
122          1
37           1
157          1
64           1
324          1
69           1
73           1
78        

### +1/0/-1

In [23]:
# Keep only rows tagged with one of the three rating tags (好评 / 中评 / 差评).
df_comments_pnn = df_comments[df_comments['标签'].isin(['好评', '中评', '差评'])]
print(df_comments_pnn.shape)
df_comments_pnn.head(1)

(195017, 14)


Unnamed: 0,用户名,评论时间,游戏评分,游戏时长,内容,手机型号,欢乐,点赞,点踩,回复量,标签,游戏名,游戏url,评论数量
1369,白鳞小蛇,2021-01-26 11:37:59,6.8,0,还不错，就是太依赖于抽卡，没什么英雄搭配\r\n\r\n这游戏凉了么？\r\n,,1,0,0,0,好评,精灵契约,https://www.taptap.com/app/142111,870


In [24]:
# Shape after keeping the first row of each distinct comment text.
df_comments_pnn.drop_duplicates(subset='内容').shape

(173610, 14)

In [27]:
# Relative frequency of each rating tag.
df_comments_pnn['标签'].value_counts(normalize=True)

2021-02-03 15:50:59,989 INFO NumExpr defaulting to 8 threads.


好评    0.591400
差评    0.291764
中评    0.116836
Name: 标签, dtype: float64

## train & test

In [25]:
# Load the labelled spreadsheet and keep only the label and text columns.
# NOTE(review): this is an absolute, machine-specific path; the recorded run of
# this cell failed with FileNotFoundError. Point it at a location under the
# project's data directory before re-running.
data_test = pd.read_excel(
    '/home/wangyh/project/document_cluster/data/dataset_ads-20210120-1-labeled.xlsx'
)[['label', 'content']]
data_test.head(3)

FileNotFoundError: [Errno 2] No such file or directory: '/home/wangyh/project/document_cluster/data/dataset_ads-20210120-1-labeled.xlsx'

In [None]:
# Binarise the training labels: raw classes 1, 9 and 24 are ads (-1), every
# other class becomes 1.
#
# The raw classes are preserved in `label_raw` and the binary label is always
# derived from that column, so re-running this cell is idempotent. Rewriting
# `label` in place was not: after the first run the non-ad class is 1, which is
# itself in the ads list, so a second run turned every row into -1.
#
# NOTE(review): `data_train` is not created by any cell visible above; it must be
# loaded before this cell for a fresh-kernel run to work.
if 'label_raw' not in data_train.columns:
    data_train = data_train.assign(label_raw=data_train['label'])
data_train = data_train.assign(
    label=np.where(data_train['label_raw'].isin([1, 9, 24]), -1, 1)   # ads -> -1
)

data_train.label.value_counts(normalize=True)

In [None]:
# Binarise the test labels: raw classes 1, 9 and 24 are ads (-1), every other
# class becomes 1.
#
# The raw classes are preserved in `label_raw` and the binary label is always
# derived from that column, so re-running this cell is idempotent. Rewriting
# `label` in place was not: after the first run the non-ad class is 1, which is
# itself in the ads list, so a second run turned every row into -1.
if 'label_raw' not in data_test.columns:
    data_test = data_test.assign(label_raw=data_test['label'])
data_test = data_test.assign(
    label=np.where(data_test['label_raw'].isin([1, 9, 24]), -1, 1)
)
data_test.label.value_counts(normalize=True)

In [None]:
# Features are the raw text column; targets are plain Python lists of labels.
X_train = data_train.content
X_test = data_test.content
y_train = data_train.label.tolist()
y_test = data_test.label.tolist()

In [None]:
# Report document count and UTF-8 size (MB) of the training and test texts.
data_train_size_mb, data_test_size_mb = size_mb(X_train), size_mb(X_test)

train_summary = (len(X_train), data_train_size_mb)
test_summary = (len(X_test), data_test_size_mb)
print("%d documents - %0.3fMB (training set)" % train_summary)
print("%d documents - %0.3fMB (test set)" % test_summary)
print()

## preprocess

### replace

### split util

#### split location

#### split terminology

#### split coordinates

#### split num + char

#### convert num

#### convert num + char

#### split char, num, chinese + special

#### split naive

#### split once

#### stop words

### split 1

### high freq

### low freq

### split 2

### split test

## feature

# Classify

## score

## efficiency

## tune