In [1]:
import json
import random
import time
import numpy as np
import pandas as pd
import jieba
import re
import datetime
from sklearn.model_selection import train_test_split

# 读取数据

In [2]:
comments = []
with open('../data/comment_with_entity.json', 'r') as f:
    for line in f:
        comments.append(json.loads(line.strip()))
movies = []
with open('../data/movie_with_entity.json', 'r') as f:
    for line in f:
        movies.append(json.loads(line.strip()))

In [3]:
len(movies), len(comments)

(2776, 1102111)

In [4]:
len(movies), len(comments)

(2776, 1102111)

PER，LOC，ORG

# 数据简单清洗去重

## 评论去重

In [4]:
comment_set = set()
comment_clean = []
for comment in comments:
    if comment['comment_text'] is None:
        continue
    unique_hash_comment = str(comment['comment_text']) + str(comment['user_url'])
    if unique_hash_comment in comment_set:
        continue
    comment_clean.append(comment)
    comment_set.add(unique_hash_comment)

In [5]:
movie_url_set = set()
for comment in comment_clean:
    movie_url_set.add(comment['movie_url'])

## 电影去重

In [6]:
have_url_set = set()
movie_name_set = set()
movie_clean = []
for movie in movies:

    if movie['movie_url'] in movie_url_set and movie['movie_url'] not in have_url_set and movie['title'] not in movie_name_set:
        movie_clean.append(movie)
        have_url_set.add(movie['movie_url'])
        movie_name_set.add(movie['title'])
        if(u'魔鬼悍将' in movie['title']):
            print(movie)

In [7]:
len(movie_clean), len(movie_url_set)

(2776, 2776)

In [8]:
len(comments), len(comment_clean)

(1102111, 1102111)

## 重新写入

In [9]:
with open('../data/comments_clean.json', 'w', encoding='utf-8') as f:
    for comment in comment_clean:
        f.write(json.dumps(comment, ensure_ascii=False))
        f.write('\n')
        
with open('../data/movies_clean.json', 'w', encoding='utf-8') as f:
    for movie in movie_clean:
        f.write(json.dumps(movie, ensure_ascii=False))
        f.write('\n')

# 样本构造

In [5]:
mp_movie_comments = {}
for comment in comments:
    comment_list = mp_movie_comments.get(comment['movie_url'], 0)
    if(comment_list is 0):
        mp_movie_comments[comment['movie_url']] = [comment,]
    else:
        mp_movie_comments[comment['movie_url']].append(comment)

In [6]:
samples = []
sample_movie_num = 2
sample_comment_num = 32
use_movie_comment_num = 16
cnt = 1
start = time.time()
for movie in movies:
    movie_url = movie['movie_url']
    if(mp_movie_comments.get(movie_url, 0) is 0):
        continue
    other_movies = random.sample(movies, sample_movie_num)
    other_comments = []
    for other_movie in other_movies:
        other_url = other_movie['movie_url']
        if other_url == movie_url:
            continue
        if(mp_movie_comments.get(other_url, 0) is 0):
            continue
        other_comments.extend(mp_movie_comments[other_url][-sample_comment_num:])
    for comment in mp_movie_comments[movie_url][:use_movie_comment_num]:
        for other_comment in other_comments:
            a_sample = {}
            a_sample['movie'] = movie
            a_sample['right_comment'] = comment
            a_sample['wrong_comment'] = other_comment
            samples.append(a_sample)
    cnt += 1
    if(cnt % 50 == 0):
        print(cnt, time.time() - start)
print(time.time() - start)

50 0.048156023025512695
100 1.157289981842041
150 1.1889243125915527
200 1.220710039138794
250 1.2514405250549316
300 1.282799482345581
350 1.3131978511810303
400 1.3434629440307617
450 1.3742926120758057
500 1.4060022830963135
550 1.436755657196045
600 1.4677681922912598
650 1.4988353252410889
700 2.6190783977508545
750 2.6506636142730713
800 2.6814792156219482
850 2.711899518966675
900 2.742671251296997
950 2.773435354232788
1000 2.803821563720703
1050 2.834181308746338
1100 2.8642959594726562
1150 2.8928608894348145
1200 2.922450542449951
1250 2.953110694885254
1300 2.98323130607605
1350 3.0128819942474365
1400 3.0426251888275146
1450 4.326707363128662
1500 4.356860160827637
1550 4.387431859970093
1600 4.418727397918701
1650 4.44938850402832
1700 4.48021388053894
1750 4.511022090911865
1800 4.5416858196258545
1850 4.5721635818481445
1900 4.603025674819946
1950 4.633539438247681
2000 4.664257526397705
2050 4.695501089096069
2100 4.725247859954834
2150 4.754871606826782
2200 4.7851212

# 特征工程

In [7]:
len(samples)

2842112

In [8]:
samples[2781]

{'movie': {'score': '7.6',
  'title': '\n钢铁侠2 Iron Man 2\n(2010)\n',
  'summary': '钢铁侠托尼·斯塔克（小罗伯特·唐尼 Robert Downey Jr. 饰）在国会听证上拒绝交出最新技术。与此同时，他发现胸口的微型电弧反应炉正迅速造成血液的钯金属中毒。沮丧的托尼将斯塔克公司的总裁职务交予了秘书波兹（格温妮丝·帕特罗 Gwyneth Paltrow 饰），由她全权负责正在进行的纽约斯塔克博览会。波兹从法律部门调来助理娜塔莉（斯佳丽·约翰逊 Scarlett Johansson 饰）照顾托尼。托尼在媒体前的高调亮相引起了其父当年同事的儿子，伊凡（米基·洛克 Mickey Rourke 饰）的不满。为了实施报复，他子承父业，研制出了一套可与钢铁战衣相媲美的装备。伊凡的技术引起了托尼的竞争对手，军火商贾斯丁·汉默（山姆·洛克威尔 Sam Rockwell 饰）的注意，他设法将伊凡劫持出狱，秘密研究取代钢铁侠。正当托尼苦于钯金属中毒造成的失...',
  'directors': ['乔恩·费儒'],
  'writer': ['贾斯汀·塞洛克斯'],
  'actor': ['小罗伯特·唐尼',
   '格温妮斯·帕特洛',
   '米基·洛克',
   '斯嘉丽·约翰逊',
   '山姆·洛克威尔',
   '唐·钱德尔',
   '塞缪尔·杰克逊',
   '乔恩·费儒',
   '保罗·贝坦尼',
   '克拉克·格雷格',
   '凯特·玛拉',
   '约翰·斯拉特里'],
  'country': ' 美国',
  'minutes': '124 分钟',
  'type': ['动作', '科幻', '冒险'],
  'releasedDate': ['2010-05-07(中国大陆/美国)'],
  'recommended_urls': ['https://movie.douban.com/subject/1432146/',
   'https://movie.douban.com/subject/1432146/',
   'https://movie.douban.com/subject/10741834/',
   'https://movie.douban.com

## 工具函数

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def build_tfidf_svd_matrix(texts, n_output):
    """
    """
    corpus = []
    for text in texts:
        words = word_segment(str(text))
        use_words = []
        for word in words:
            if word in stop_words:
                continue
            else:
                use_words.append(word)
        corpus.append(' '.join(use_words))
    tfidf_vec = TfidfVectorizer()
    tfidf_matrix = tfidf_vec.fit_transform(corpus)
    svd = TruncatedSVD(n_components=n_output, n_iter=7, random_state=42)
    tf_idf_svd = svd.fit_transform(tfidf_matrix)
    
    return tf_idf_svd, tfidf_vec, svd


def word_segment(sentence):
    words = jieba.cut(sentence)
    return ','.join(words).split(',')

stop_words = set()
def load_stopwords():
    """
    """
    with open('../middle_data/stopwords.txt', 'r', encoding='UTF-8') as f:
        for line in f.readlines():
            stop_words.add(line.strip())

load_stopwords()

def remove_stopwords(word_lists):
    """
    """
    res = []
    for word in word_lists:
        if word not in stop_words:
            res.append(word)
    return ' '.join(res)

def clean_text(string):
    return string.replace(' ', '').replace('\n', '').replace('\u3000', '')

## 电影

In [10]:
movie_types = set()
for movie in movies:
    for movie_type in movie['type']:
        movie_types.add(movie_type)

In [11]:
movie_features = []
movie_features.extend(list(movie_types))
hand_select_features = ['len_directors', 'len_writer', 'len_actor', 'len_title', 'len_summary']
movie_features.extend(hand_select_features)

In [12]:
from joblib import dump, load
dump(movie_features, '../middle_data/movie_features.sk.var')


['../middle_data/movie_features.sk.var']

In [13]:
df_movies = pd.DataFrame(np.zeros((len(movies), len(movie_features))), columns = movie_features)

In [14]:
cnt = 0
def set_features(x):
    global cnt
    movie = movies[cnt]
    cnt += 1
    x[movie['type']] = 1
    x['len_directors'] = len(movie['directors'])
    x['len_writer'] = len(movie['writer'])    
    x['len_actor'] = len(movie['actor'])    
    x['len_title'] = len(clean_text(movie['title']))
    x['len_summary'] = len(clean_text(movie['summary']))
_ = df_movies.apply(set_features, axis = 1)

In [15]:
df_movies[df_movies['科幻'] == 1.0]

Unnamed: 0,武侠,儿童,西部,爱情,同性,喜剧,犯罪,恐怖,奇幻,科幻,...,剧情,战争,黑色电影,历史,家庭,len_directors,len_writer,len_actor,len_title,len_summary
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,12.0,18.0,367.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,2.0,2.0,20.0,42.0,351.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,3.0,25.0,37.0,358.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,27.0,368.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,7.0,23.0,15.0,260.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2604,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,18.0,21.0,238.0
2616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,2.0,13.0,20.0,348.0
2632,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,15.0,29.0,373.0
2637,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,3.0,1.0,12.0,32.0,56.0


In [16]:
movie_texts = []
for movie in movies:
    text = clean_text(movie['title'] + movie['summary'])
    movie_texts.append(text)

In [17]:
movie_texts_tfidf_svd, tfidf_vec_movie, svd_movie = build_tfidf_svd_matrix(movie_texts, 64)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.857 seconds.
Prefix dict has been built succesfully.


In [18]:
from joblib import dump, load
dump(tfidf_vec_movie, '../middle_data/tfidf_vec_movie.sk.model')
dump(svd_movie, '../middle_data/svd_movie.sk.model')

['../middle_data/svd_movie.sk.model']

In [19]:
tf_idf_movie_columns_names = ['td_idf_movie_%d' % i for i in range(movie_texts_tfidf_svd.shape[1])]
df_tf_idf_movie_svd = pd.DataFrame(movie_texts_tfidf_svd, columns = tf_idf_movie_columns_names)

In [20]:
df_movies_with_tfidf = pd.concat([df_movies, df_tf_idf_movie_svd], axis=1)
df_movies_with_tfidf.head()

Unnamed: 0,武侠,儿童,西部,爱情,同性,喜剧,犯罪,恐怖,奇幻,科幻,...,td_idf_movie_54,td_idf_movie_55,td_idf_movie_56,td_idf_movie_57,td_idf_movie_58,td_idf_movie_59,td_idf_movie_60,td_idf_movie_61,td_idf_movie_62,td_idf_movie_63
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,-0.01753,0.041538,0.003458,-0.026284,0.01445,0.021619,0.041345,-0.051655,0.044011,-0.007605
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.011432,0.033202,-0.018139,-0.026845,0.008995,0.0185,0.032574,-0.021791,0.023361,-0.017838
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.039166,0.009165,0.049353,0.004273,-0.059249,-0.008669,0.037422,-0.023646,-0.013603,0.019573
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,-0.000834,-0.005642,0.051554,-0.097415,-0.032493,-0.016136,0.043008,-0.024096,-0.02127,0.001998
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.019182,0.08597,0.009967,-0.092546,-0.057068,-0.058371,0.014235,0.011969,0.047687,-0.050658


### 是否归一化

In [21]:
X_mean = df_movies_with_tfidf[hand_select_features].mean()
std = df_movies_with_tfidf[hand_select_features].std()

In [22]:
df_movies_all_normal = df_movies_with_tfidf.copy()
df_movies_all_normal[hand_select_features] = (df_movies_all_normal[hand_select_features] - X_mean) / std

In [23]:
mp_movie_url_vector = {}
for i in range(len(movies)):
    mp_movie_url_vector[movies[i]['movie_url']] = np.array(df_movies_with_tfidf.iloc[i])

## 评论

In [24]:
comments[0]

{'user_url': 'https://www.douban.com/people/questwoo/',
 'comment_text': '我能体会到波特曼看到雷神和洛基生离死别时那种电灯泡的感觉',
 'votes': '1866',
 'star': '4',
 'comment_time': '2013-11-10 05:17:36',
 'watch_type': '看过',
 'rank': 1,
 'comment_id': 1,
 'movie_url': 'https://movie.douban.com/subject/6560058/',
 'entities': ['波特曼 PER', '雷神 PER', '洛基 PER']}

In [25]:
comment_texts = []
for comment in comments:
    comment_texts.append(comment['comment_text'])

### 用户画像（TODO）

In [31]:
mp_user_comment_num = {}
for comment in comments:
    if comment['user_url'] in mp_user_comment_num:
        mp_user_comment_num[comment['user_url']] += 1
    else:
        mp_user_comment_num[comment['user_url']] = 1


In [32]:
len(mp_user_comment_num), len(comments)

(133868, 1102111)

### tfidf svd

In [26]:
start = time.time()
comment_texts_tfidf_svd, tfidf_vec_comment, svd_comment = build_tfidf_svd_matrix(comment_texts, 16)
print(time.time() - start)

333.477037191391


In [27]:
from joblib import dump, load
dump(tfidf_vec_comment, '../middle_data/tfidf_vec_comment.sk.model')
dump(svd_comment, '../middle_data/svd_comment.sk.model')


['../middle_data/svd_comment.sk.model']

In [30]:
mp_comment_id_tfidf = {}
for i in range(comment_texts_tfidf_svd.shape[0]):
    mp_comment_id_tfidf[comments[i]['comment_id']] = comment_texts_tfidf_svd[i]

### 评论长度

In [32]:
comment_len_V = np.zeros((len(comments), 1))
for i, comment in enumerate(comments):
    comment_len_V[i,0] = len(comment['comment_text'])

In [33]:
mp_comment_len = {}
for i in range(comment_len_V.shape[0]):
    mp_comment_len[comments[i]['comment_id']] = comment_len_V[i]

# 合并电影和评论

In [34]:
samples[0]

{'movie': {'score': '7.4',
  'title': '\n雷神2：黑暗世界 Thor: The Dark World\n(2013)\n',
  'summary': '纽约大战后，雷神索尔（克里斯·海姆斯沃斯 Chris Hemsworth 饰）将弟弟洛基（汤姆·希德勒斯顿 Tom Hiddleston 饰）带回仙宫囚禁起来，此外帮助九大国度平定纷争，威名扶摇直上。虽然父王奥丁（安东尼·霍普金斯 Anthony Hopkins 饰）劝其及早即位，但索尔念念不忘地球的美丽女孩简·福斯特（娜塔丽·波特曼 Natalie Portman 饰）。与此同时，简在和黛西及其助手伊安调查某个区域时意外被神秘物质入侵，却也因此重逢索尔，并随其返回仙宫。令人意想不到的是，藏在简体内的物质来自远古的黑暗精灵玛勒基斯（克里斯托弗·埃克莱斯顿 Christopher Eccleston 饰）。在“天体汇聚”的时刻再次到来之际，玛勒基斯企图摧毁九大国度，缔造一个全然黑暗的宇宙。\n                                        \n                                    \u3000\u3000藏匿简的仙宫受到重创，而索尔和洛基这对冤家兄弟也不得不联手迎战...',
  'directors': ['阿兰·泰勒'],
  'writer': ['克里斯托弗·约斯特',
   '克里斯托弗·马库斯',
   '斯蒂芬·麦克菲利',
   '唐·佩恩',
   '罗伯特·罗达特',
   '斯坦·李',
   '拉里·利伯',
   '杰克·科比',
   '沃尔特·西蒙森'],
  'actor': ['克里斯·海姆斯沃斯',
   '娜塔莉·波特曼',
   '汤姆·希德勒斯顿',
   '安东尼·霍普金斯',
   '克里斯托弗·埃克莱斯顿',
   '杰米·亚历山大',
   '扎克瑞·莱维',
   '雷·史蒂文森',
   '浅野忠信',
   '伊德里斯·艾尔巴',
   '蕾妮·罗素',
   '阿德沃尔·阿吉纽依-艾格拜吉',
   '凯特·戴琳斯',
   '斯特兰·斯卡斯加德',
   '艾丽丝·克里奇',
   '克里夫·罗素',
   '乔纳森·霍华德',
   '克里斯·奥多德'

## 不带实体重叠特征抽取函数

In [38]:
def get_common_substrs(str1, str2, min_len):
    """
    在str1中寻找出现在str2的子串
    """
    substr_list = []  # 保存两者相同的子串
    s = 0  # 记录str1起始位置
    e = 1  # 记录str1终止位置
    match_num = 0  # 匹配个数
    is_final = False  # 是否结束匹配过程：终止位置到达str1的最后一个位置
    while not is_final:
        cur_str = str1[s:e]
        if cur_str in str2:
            match_num += 1
            if e == len(str1):
                if len(cur_str) >= min_len:
                    substr_list.append(str1[s:s + match_num])
                is_final = True
            else:
                e += 1
        else:
            if match_num < min_len:
                s += 1
                if e != len(str1):
                    e += 1
            else:
                substr_list.append(str1[s:s + match_num])
                s = s + match_num
                e = s + 1
                match_num = 0
    return substr_list

def get_feature_movie_comment(movie, comment):
    """
    Args:
        movie:
        comment:
    Returns:
        vector:
        第一维：命中电影summary长度为2的数量累积加权
        第二维：命中电影summary长度大于2的数量累积加权
        第三维：命中导演数量
        第四维：命中编剧数量
        第五维：命中演员数量
    """
    comment_text = re.sub('，|“|”|、|；|、|。|…|\.|的|时候|个|·|）|（| ', '', comment['comment_text'])
    len_comment = len(comment_text)
    len_movie_summary = len(movie['summary'])
    vector = np.zeros((5,))
    if(len_comment == 0):
        return vector
    
    substr_list = get_common_substrs(comment_text, movie['summary'], 2)
    for i, sub_srt in enumerate(substr_list):
        len_sub_str = len(sub_srt)
        if(len_sub_str == 2):
            vector[0] += 2
        elif(len_sub_str > 2):
            vector[1] += len_sub_str
    for director in movie['directors']:
        substr = get_common_substrs(director, comment_text, 2) 
        if(len(substr) > 0):
            vector[2] += 1
    
    
    for writer in movie['writer']:
        substr = get_common_substrs(writer, comment_text, 2) 
        if(len(substr) > 0):
            vector[3] += 1

    for actor in movie['actor']:
        substr = get_common_substrs(actor, comment_text, 2) 
        if(len(substr) > 0):
            vector[4] += 1
    
    return vector

## 带实体重叠特征抽取函数

In [52]:
def get_feature_movie_entity(movie, comment):
    """
    """
    vector = np.zeros((4, ))
    movie_info_text = movie['title'] + movie['summary'] + ''.join(movie['directors']) + ''.join(movie['actor']) + ''.join(movie['writer'])
    for item in comment['entities']:
        entity, entity_type = item.split(' ')
        if(entity in movie_info_text):
            vector[{'PER':0, 'LOC':1, 'ORG':2}[entity_type]] += len(entity)
    vector[3] = len(comment['entities'])
    return vector

## 原始特征拼接

In [47]:
movie_feature_len = list(mp_movie_url_vector.values())[0].shape[0]
comment_feature_len = list(mp_comment_id_tfidf.values())[0].shape[0]

In [48]:
X = np.zeros((len(samples), movie_feature_len + 2 * comment_feature_len + 2))
y = np.zeros((len(samples), ))
for i, sample in enumerate(samples):
    y[i] = random.randint(0, 1)
    X[i][:movie_feature_len] = mp_movie_url_vector[sample['movie']['movie_url']]    
    if(y[i] == 1):
        X[i][movie_feature_len : movie_feature_len+comment_feature_len] = mp_comment_id_tfidf[sample['right_comment']['comment_id']]
        X[i][movie_feature_len+comment_feature_len : -2] = mp_comment_id_tfidf[sample['wrong_comment']['comment_id']]
        X[i][-2] = mp_comment_len[sample['right_comment']['comment_id']]
        X[i][-1] = mp_comment_len[sample['wrong_comment']['comment_id']]
    else:
        X[i][movie_feature_len : movie_feature_len+comment_feature_len] = mp_comment_id_tfidf[sample['wrong_comment']['comment_id']]
        X[i][movie_feature_len+comment_feature_len : -2] = mp_comment_id_tfidf[sample['right_comment']['comment_id']]   
        X[i][-2] = mp_comment_len[sample['wrong_comment']['comment_id']]
        X[i][-1] = mp_comment_len[sample['right_comment']['comment_id']]

## 不带实体，电影 评论重叠信息特征

In [49]:
start = time.time()
cnt = 0

hit_feature_number = 5
X_hit_features = np.zeros((len(samples), hit_feature_number * 2))
for i, sample in enumerate(samples):
    v_right = get_feature_movie_comment(sample['movie'], sample['right_comment'])
    v_wrong = get_feature_movie_comment(sample['movie'], sample['wrong_comment'])
    if(y[i] == 1):
        X_hit_features[i][:hit_feature_number] = v_right
        X_hit_features[i][hit_feature_number:] = v_wrong
    else:
        X_hit_features[i][:hit_feature_number] = v_wrong
        X_hit_features[i][hit_feature_number:] = v_right
    
    if(cnt % 100000 == 0):
        print(cnt, time.time() - start)
    cnt += 1

0 0.001462697982788086
100000 22.28075861930847
200000 41.32304406166077
300000 60.56052017211914
400000 79.5658507347107
500000 99.0408239364624
600000 116.77722835540771
700000 136.76764059066772
800000 154.00202441215515
900000 174.89814472198486
1000000 193.63973832130432
1100000 209.95308136940002
1200000 228.68139696121216
1300000 245.39527702331543
1400000 261.04314708709717
1500000 277.9171690940857
1600000 298.94186186790466
1700000 315.6571078300476
1800000 332.50941705703735
1900000 348.54197096824646
2000000 362.30759596824646
2100000 378.5216863155365
2200000 395.3744332790375
2300000 409.1595256328583
2400000 423.6898171901703
2500000 439.22526502609253
2600000 456.29476618766785
2700000 474.1492028236389
2800000 488.7256762981415


## 实体特征

In [54]:
start = time.time()
cnt = 0

entity_feature_number = 4
X_entity_feature = np.zeros((len(samples), entity_feature_number * 2))
for i, sample in enumerate(samples):
    v_right = get_feature_movie_entity(sample['movie'], sample['right_comment'])
    v_wrong = get_feature_movie_entity(sample['movie'], sample['wrong_comment'])
    if(y[i] == 1):
        X_entity_feature[i][:entity_feature_number] = v_right
        X_entity_feature[i][entity_feature_number:] = v_wrong
    else:
        X_entity_feature[i][:entity_feature_number] = v_wrong
        X_entity_feature[i][entity_feature_number:] = v_right
    
    if(cnt % 100000 == 0):
        print(cnt, time.time() - start)
    cnt += 1

0 0.0013370513916015625
100000 1.070570468902588
200000 2.006481409072876
300000 2.9521169662475586
400000 3.9089112281799316
500000 4.853625774383545
600000 5.878629207611084
700000 7.022550106048584
800000 8.112439155578613
900000 9.233209371566772
1000000 10.368393898010254
1100000 11.515652894973755
1200000 12.627682209014893
1300000 13.656734466552734
1400000 14.59129524230957
1500000 15.540847301483154
1600000 16.619782209396362
1700000 17.604134798049927
1800000 18.55076837539673
1900000 19.530085563659668
2000000 20.47172713279724
2100000 21.444985389709473
2200000 22.397820234298706
2300000 23.478790998458862
2400000 24.374224424362183
2500000 25.297502517700195
2600000 26.40814471244812
2700000 27.474499940872192
2800000 28.60411763191223


## 合并

In [55]:
# X_all = np.concatenate((X, X_hit_features), axis=1)
X_all = np.concatenate((X, X_hit_features, X_entity_feature), axis=1)


In [56]:
X.shape, y.shape

((2842112, 133), (2842112,))

In [57]:
X_all.shape

(2842112, 151)

# Modeling

## 评估函数

In [58]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from IPython.display import display, HTML
from sklearn.metrics import confusion_matrix

def evaluation(score, ground_truth, threshold):
    """
    print the evaluation results for binary classification
    Args:
        score: the predict results
        ground_truth: the truth label
        threshold: when score is lager than threshold will be considered as a positive sample
    """
    print('准确率：%f' % accuracy_score(ground_truth, score>0.5))
    print('AUC: %f' % roc_auc_score(ground_truth, score))
    print('F1: %f' % f1_score(ground_truth, score > threshold, average='macro'))
    print('混淆矩阵：')
    df_confusion_matrix = pd.DataFrame(confusion_matrix(ground_truth, score > threshold))
    display(HTML(df_confusion_matrix.to_html()))
    
    print('准召报告')
    df_precision_recall_fscore = pd.DataFrame(precision_recall_fscore_support(ground_truth, score > threshold), index = ['precision', 'recall', 'f1', 'number'])
    display(HTML(df_precision_recall_fscore.to_html()))

## 数据划分

In [45]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [46]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1988761, 131), (852327, 131), (1988761,), (852327,))

In [60]:
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y, test_size=0.3, random_state=42)

In [70]:
train_len = 1988761
X_train_all, X_test_all, y_train_all, y_test_all = X_all[:train_len], X_all[train_len:], y[:train_len], y[train_len:]

## LR

In [48]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0).fit(X_train, y_train)



In [49]:
test_score = clf.predict_proba(X_test)[:, 1]

In [50]:
evaluation(test_score, y_test, 0.5)

准确率：0.568420
AUC: 0.589034
F1: 0.568404
混淆矩阵：


Unnamed: 0,0,1
0,244837,181964
1,185883,239643


准召报告


Unnamed: 0,0,1
precision,0.568437,0.568404
recall,0.573656,0.563169
f1,0.571034,0.565774
number,426801.0,425526.0


In [51]:
np.argsort(clf.coef_)

array([[104, 102, 112, 130, 103, 109, 122, 107, 115, 129, 121, 101, 100,
        110,  82,  69,  81, 108,  85,  60,  70,  84,  62,  17,  67,  90,
         53,  80,  63,  71, 111,  91,  47,   2,  54,  76,  83,  42,  56,
         48,  20,  88,  72,   3,  86,  44,  89,  51,   0, 127,  24,   9,
         73,  75,   5,  26,  92,  21,  96,  10,  14,  97,   6,  45,  18,
         22,  32,  34,  12,  33,  30,  25,   1,  31,  37,  77,  29,   8,
          4,   7,  41,  19,  39,  15,  13,  87,  27,  64,  50,  36,  23,
         11,  28,  16,  46,  74,  68,  94,  98,  61,  95,  57,  49,  65,
         43,  38,  93,  35,  78,  79,  55,  58,  40,  59,  52,  66, 124,
        126, 116, 117, 113, 105,  99, 125, 123, 106, 119, 114, 128, 118,
        120]])

## GBDT

In [71]:
import lightgbm as lgb
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.01,
    'max_depth': -1,
    'metric':'auc',
    'is_unbalance': True,
    'min_data_in_leaf': 160,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1,
    'device': 'cpu'
}
start = time.time()
trn_data = lgb.Dataset(X_train_all, label=y_train_all)
lgb_clf = lgb.train(param, trn_data, 200000, valid_sets = [trn_data], verbose_eval=5000, early_stopping_rounds=4000)
print(time.time() - start)

Training until validation scores don't improve for 4000 rounds
[5000]	training's auc: 0.904294
[10000]	training's auc: 0.923158
[15000]	training's auc: 0.937499
[20000]	training's auc: 0.949044
[25000]	training's auc: 0.958261
[30000]	training's auc: 0.965698
[35000]	training's auc: 0.97175
[40000]	training's auc: 0.97656
[45000]	training's auc: 0.980574
[50000]	training's auc: 0.983851
[55000]	training's auc: 0.986574
[60000]	training's auc: 0.988869
[65000]	training's auc: 0.990749
[70000]	training's auc: 0.992308
[75000]	training's auc: 0.993601
[80000]	training's auc: 0.994702
[85000]	training's auc: 0.995591
[90000]	training's auc: 0.996348
[95000]	training's auc: 0.996975
[100000]	training's auc: 0.997498
[105000]	training's auc: 0.997928
[110000]	training's auc: 0.998291
[115000]	training's auc: 0.998585
[120000]	training's auc: 0.998833
[125000]	training's auc: 0.999042
[130000]	training's auc: 0.999212
[135000]	training's auc: 0.999351
[140000]	training's auc: 0.999467
[145000

In [62]:
test_score_lgb = lgb_clf.predict(X_test_all)

In [64]:
evaluation(test_score_lgb, y_test_all, 0.5)

准确率：0.985670
AUC: 0.998740
F1: 0.985670
混淆矩阵：


Unnamed: 0,0,1
0,419773,6042
1,6176,420643


准召报告


Unnamed: 0,0,1
precision,0.985501,0.98584
recall,0.985811,0.98553
f1,0.985656,0.985685
number,425815.0,426819.0


In [65]:
lgb_clf.save_model('../middle_data/lgb_clf_with_entity.txt')

<lightgbm.basic.Booster at 0x7f4b743ab5f8>

In [57]:
X_test_all[0]

array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        1.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  1.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  1.00000000e+00,  2.00000000e+00,
        2.30000000e+01,  4.90000000e+01,  3.40000000e+02,  9.47419621e-02,
        2.77242859e-02, -3.44640431e-03, -2.17476860e-02, -3.65413345e-03,
        2.97852039e-03, -1.26442155e-02, -5.38940794e-03,  8.06771794e-03,
       -3.12733724e-02, -8.59134428e-03,  1.58928494e-02,  1.02797762e-02,
       -2.87237182e-03, -2.37320565e-02,  1.85256534e-02,  3.62213841e-02,
        3.31014362e-02, -