In [1]:
# Numerical features
import pandas as pd
import numpy as np
import json
from tqdm import tqdm as tqdm
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
import jieba
# 获取mention的tfidf
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
import gensim.downloader as api
from gensim.models import LsiModel

In [2]:
def _lowercase_json(value):
    """Return a copy of a decoded JSON value with every string lower-cased.

    Dict keys and values and list items are processed recursively; anything
    that is not a str, dict or list (numbers, booleans, None) is returned
    unchanged. This replaces the former ``eval(str(obj).lower())`` trick,
    which executed file content as Python code and failed with NameError
    whenever a record contained ``true``/``false``/``null``.
    """
    if isinstance(value, str):
        return value.lower()
    if isinstance(value, dict):
        return {_lowercase_json(k): _lowercase_json(v) for k, v in value.items()}
    if isinstance(value, list):
        return [_lowercase_json(v) for v in value]
    return value


# Training records: kept both in file order and indexed by their text_id.
# The files are read as UTF-8 explicitly — assumes the raw data is UTF-8 encoded.
train_data = []
train_data_dict = {}
with open('data/raw_data/train.json', 'r', encoding='utf-8') as f:
    for line in f:
        raw = _lowercase_json(json.loads(line))
        train_data.append(raw)
        train_data_dict[raw['text_id']] = raw
# Read the knowledge base: kept both in file order and indexed by subject_id.
kb_data = []
kb = {}
with open('data/raw_data/kb_data', 'r', encoding='utf-8') as f:
    for line in f:
        item = _lowercase_json(json.loads(line))
        kb[item['subject_id']] = item
        kb_data.append(item)


# Map every surface form (each alias plus the canonical subject name) to the
# sorted, de-duplicated list of KB ids that carry it.
name_id = {}
for kb_id, entity in kb.items():
    for name in list(entity['alias']) + [entity['subject']]:
        name_id.setdefault(name, set()).add(kb_id)
name_id = {name: sorted(kb_ids) for name, kb_ids in name_id.items()}

# Register every name in jieba's dictionary with a fixed frequency of 10000,
# presumably so that KB names are kept as single tokens when segmenting.
# The loop variable is called `name` (not `id`) so the builtin id() is not
# shadowed for the rest of the kernel session.
for name in name_id:
    jieba.add_word(name, freq=10000)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.576 seconds.
Prefix dict has been built succesfully.


In [3]:
# Build the token corpus `sentence`:
#   * one [subject_id, first type] pair per KB entity,
#   * one token list per KB attribute,
#   * one token list per train / eval sentence.
sentence = []
for item in tqdm(kb_data):
    subject_id = item['subject_id']
    sentence.append([subject_id, item['type'][0]])
    for t in item['data']:
        tokens = jieba.lcut(t['object'])
        if t['predicate'] == '摘要':
            # Abstract: tokens equal to the entity's own name are replaced by
            # its subject_id.
            sentence.append(
                [subject_id if word == item['subject'] else word for word in tokens]
            )
        else:
            # Any other attribute: subject_id, predicate, then the object tokens.
            sentence.append([subject_id, t['predicate']] + tokens)

# `train_tf` holds only the segmented train / eval sentences.
# Each line is parsed and segmented once; the text is lower-cased directly
# instead of round-tripping the whole record through eval(str(...).lower()),
# which executed file content as code. Assumes every record has a lower-case
# 'text' key holding a string, and that the files are UTF-8 encoded.
train_tf = []
for path in ('data/raw_data/train.json', 'data/raw_data/eval722.json'):
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = jieba.lcut(json.loads(line)['text'].lower())
            sentence.append(tokens)
            # Independent copy, so the two corpora never share list objects.
            train_tf.append(list(tokens))

100%|██████████| 399252/399252 [03:54<00:00, 1704.11it/s]


In [4]:
# Occurrence count of every token across all token lists in `sentence`.
word_tf = {}
for tokens in sentence:
    for token in tokens:
        word_tf[token] = word_tf.get(token, 0) + 1

In [5]:
# Training candidates produced by step 1, restricted to the columns used here.
step1_columns = ['text_id','kb_id','label','m_id','train_mention']
data = pd.read_pickle('data/step1.pkl').loc[:, step1_columns]
# Row count of the training part; used later to split data_all by position.
train_size = len(data)

In [6]:
# Test candidates produced by step 1; this selection has no 'label' column.
step1_test_columns = ['text_id','kb_id','m_id','train_mention']
test_data = pd.read_pickle('data/step1_test.pkl').loc[:, step1_test_columns]

In [7]:
# Give every test row the constant label -1 so test_data has the same columns
# as `data` before the two frames are stacked.
test_data['label'] = -1

In [8]:
# Stack train rows first, then test rows (original indices are kept).
# pd.concat replaces DataFrame.append, which is deprecated and removed in
# pandas 2.0. sort=True keeps the alphabetically sorted column order that
# append produced for these differently-ordered frames.
data_all = pd.concat([data, test_data], sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [9]:
# Sanity check: (rows, columns) of the stacked train + test frame.
data_all.shape

(2330897, 5)

In [10]:
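# TF-IDF matrix after SVD dimensionality reduction, n-gram 1-5
# (note: the cells below compute plain gensim TF-IDF weights; no SVD or n-grams are applied)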

In [11]:
# Attach the sentence text to every candidate row by looking its text_id up in
# train_data_dict (the id is converted with str(x) before the lookup).
# NOTE(review): train_data_dict is filled only from data/raw_data/train.json,
# yet data_all also contains the rows of step1_test.pkl. If test text_ids
# overlap the training ids, test rows silently receive *training* text here —
# confirm, and if so look test rows up in a dict built from the test file.
data_all['text'] = data_all['text_id'].apply(lambda x:train_data_dict[str(x)]['text'])

In [12]:
# Preview the first rows to check the attached text.
data_all.head()

Unnamed: 0,kb_id,label,m_id,text_id,train_mention,text
0,130287,0,0,1,南京南站,南京南站:坐高铁在南京南站下。南京南站
1,311223,1,0,1,南京南站,南京南站:坐高铁在南京南站下。南京南站
2,341096,1,1,1,高铁,南京南站:坐高铁在南京南站下。南京南站
3,130287,0,2,1,南京南站,南京南站:坐高铁在南京南站下。南京南站
4,311223,1,2,1,南京南站,南京南站:坐高铁在南京南站下。南京南站


In [13]:
# One row per distinct (text_id, text) pair, with the text segmented by jieba.
group_f = (
    data_all[['text_id','text']]
    .drop_duplicates()
    .assign(text_segment=lambda frame: frame['text'].map(jieba.lcut))
)

In [14]:
segmented_texts = group_f['text_segment']
# Token <-> integer id mapping fitted on the segmented sentences.
dct = Dictionary(segmented_texts)
# Bag-of-words representation of every sentence, in group_f row order.
corpus = [dct.doc2bow(tokens) for tokens in segmented_texts]
# TF-IDF weighting fitted on that corpus.
model = TfidfModel(corpus)

In [15]:
# Apply the fitted TF-IDF model to the bag-of-words corpus. Later code reads
# each document as a sequence of (token_id, weight) pairs.
vector = model[corpus]
# corpus was built from group_f['text_segment'] in row order, so the i-th
# vector belongs to the i-th row of group_f.
group_f['vector'] = vector

In [16]:
# Attach each sentence's TF-IDF vector to its candidate rows (default inner
# join on text_id). Later cells split data_all by position, so they rely on
# the row order being unchanged here.
text_vectors = group_f[['text_id','vector']]
data_all = pd.merge(data_all, text_vectors, on='text_id')

In [17]:
def cal_tfidf(mention, vec, token2id=None):
    """Return the TF-IDF weight of ``mention`` inside one sentence vector.

    Parameters
    ----------
    mention : hashable
        Token to look up (the mention string).
    vec : iterable of (token_id, weight) pairs
        TF-IDF vector of the sentence.
    token2id : mapping, optional
        Token -> integer id mapping. Defaults to ``dct.token2id`` of the
        notebook-level dictionary.

    Returns
    -------
    The weight of the first pair whose id matches the mention's id, or -1
    when the mention is unknown to the dictionary, cannot be used as a
    dictionary key, or does not occur in ``vec``.
    """
    if token2id is None:
        token2id = dct.token2id
    # Resolve the id once instead of once per vector entry. Only the expected
    # failures are handled, rather than silencing every exception.
    try:
        mention_id = token2id.get(mention)
    except TypeError:
        # Unhashable mention: keep the best-effort "not found" answer.
        return -1
    if mention_id is None:
        return -1
    for pair in vec:
        if pair[0] == mention_id:
            return pair[1]
    return -1
def _row_mention_tfidf(row):
    """TF-IDF weight of the row's mention within the row's sentence vector."""
    return cal_tfidf(row['train_mention'], row['vector'])


def _vector_weight_sum(vec):
    """Sum of the weights (second element of each pair) of one vector."""
    return sum(pair[1] for pair in vec)


data_all['mention_tfidf'] = data_all.apply(_row_mention_tfidf, axis=1)
data_all['sum_tfidf'] = data_all['vector'].apply(_vector_weight_sum)

In [18]:
def cal_mention_tf(x, counts=None):
    """Return how often token ``x`` occurs according to ``counts``.

    ``counts`` defaults to the notebook-level ``word_tf`` dictionary.
    Returns -1 when ``x`` is absent or cannot be used as a dictionary key.
    Unlike the former bare ``except:``, unrelated errors (for example
    ``word_tf`` not being defined yet) are no longer hidden.
    """
    if counts is None:
        counts = word_tf
    try:
        return counts.get(x, -1)
    except TypeError:
        # Unhashable key: keep the best-effort "unknown" answer.
        return -1

In [19]:
# Frequency of each mention string in the word_tf counts (-1 when unseen).
data_all['mention_tf'] = data_all['train_mention'].map(cal_mention_tf)

In [20]:
# Occurrence count of every token across the token lists in `train_tf`.
train_word_tf = {}
for tokens in train_tf:
    for token in tokens:
        train_word_tf[token] = train_word_tf.get(token, 0) + 1

In [21]:
def cal_train_mention_tf(x, counts=None):
    """Return how often token ``x`` occurs according to ``counts``.

    ``counts`` defaults to the notebook-level ``train_word_tf`` dictionary.
    Returns -1 when ``x`` is absent or cannot be used as a dictionary key.
    Unlike the former bare ``except:``, unrelated errors (for example
    ``train_word_tf`` not being defined yet) are no longer hidden.
    """
    if counts is None:
        counts = train_word_tf
    try:
        return counts.get(x, -1)
    except TypeError:
        # Unhashable key: keep the best-effort "unknown" answer.
        return -1

In [22]:
# Frequency of each mention string in the train_word_tf counts (-1 when unseen).
data_all['train_mention_tf'] = data_all['train_mention'].map(cal_train_mention_tf)

In [23]:
# Number of rows with a non-null label per mention string, over train and test
# rows together. Written as count().rename() instead of agg({...}) with a
# renaming dict: that form was deprecated and raises SpecificationError in
# pandas >= 1.0.
data_group = (
    data_all.groupby('train_mention')['label']
    .count()
    .rename('train_mention_count')
    .reset_index()
)
# Left join keeps every row of data_all and its order.
data_all = data_all.merge(data_group, on='train_mention', how='left')

is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.


In [24]:
# Ratio of candidate-row count to token frequency for each mention.
# train_mention_tf is -1 for mentions missing from train_word_tf, so the ratio
# is negative for those rows.
data_all['ner_rate'] = data_all['train_mention_count'].div(data_all['train_mention_tf'])

In [25]:
# Names of the feature columns computed in this notebook; only these are exported.
feature = ['mention_tfidf','sum_tfidf','mention_tf','train_mention_tf','train_mention_count','ner_rate']

In [26]:
# Split back by position: the first train_size rows are the training part and
# the rest the test part. Note that these names are rebound to DataFrames here.
train_data = data_all.iloc[:train_size].reset_index(drop=True)
test_data = data_all.iloc[train_size:].reset_index(drop=True)

In [27]:
# Persist only the feature columns of each split.
for frame, out_path in ((train_data, 'data/step3.pkl'), (test_data, 'data/step3_test.pkl')):
    frame[feature].to_pickle(out_path)

In [28]:
# Sanity check: (rows, columns) of the train and test splits.
print(train_data.shape,test_data.shape)

(1566680, 13) (764217, 13)


In [29]:
# Extract topics from the KB and assign them to data_all #########################################

In [30]:
# lsi_model.save('data/lsi_model.pth')  # save model
#loaded_model = LsiModel.load(tmp_fname)  #

In [31]:
# # Apply to text
# dct = Dictionary(group_f['text_segment'])  # fit dictionary
# corpus = [dct.doc2bow(line) for line in group_f['text_segment']]

In [32]:
# Planned features: whether the top topics match, topic dot product, top topic, topic vector

In [33]:
# vectorized_corpus = model[corpus]

In [34]:
# topic_name = ['topics_%d'%i for i in range(50)]

In [35]:
# # vectorized = []
# for x in vectorized_corpus:
#     topic_arr = np.zeros(50)
#     for tuple_x in x:
#         #print(tuple_x)
#         topic_arr[tuple_x[0]] = tuple_x[1]
#     vectorized.append(topic_arr)

In [36]:
# vec_df = pd.DataFrame(vectorized,columns=topic_name)

In [37]:
# vec_df.head()

In [38]:
# vec_df['max_topic'] = vec_df.idxmax(axis=1).apply(lambda x:int(x.split('_')[-1]))