In [None]:
import json
import os
import random
import warnings

import numpy as np
import pandas as pd
from bert4torch.tokenizers import Tokenizer, load_vocab
from lightgbm import LGBMClassifier, log_evaluation, early_stopping
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import PolynomialFeatures

warnings.filterwarnings("ignore")

def seed_everything(seed=42):
    """Seed the global random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: Integer seed applied to every generator (default 42).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # PyTorch seeding is left disabled, as in the original cell:
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)
    # torch.backends.cudnn.deterministic = True


seed_everything()

In [None]:
# Every file is opened with an explicit UTF-8 encoding so that loading does not
# depend on the platform's default locale encoding.

# author ID -> record; later cells read its 'normal_data' (the author's own
# paper IDs) and 'outliers' (wrongly assigned paper IDs) lists.
with open("../raw_data/train_author.json", encoding="utf8") as f:
    train_author=json.load(f)

# paper ID -> paper record; later cells read the 'title', 'authors', 'venue',
# 'year', 'keywords' and 'abstract' fields.
with open("../raw_data/pid_to_info_all.json", encoding="utf8") as f:
    pid_to_info=json.load(f)

# author ID -> record; later cells read its 'papers' list (all paper IDs).
with open("../raw_data/ind_valid_author.json", encoding="utf8") as f:
    valid_author=json.load(f)

# Submission template for the validation authors (not used in the visible cells).
with open("../raw_data/ind_valid_author_submit.json", encoding="utf8") as f:
    submission=json.load(f)

# 基本特征采集

In [None]:
DEFAULT_YEAR = 2000  # substituted when a paper's 'year' field is the empty string


def _paper_features(feat):
    """Build the 11-element raw feature row for one paper record.

    Row layout:
        [len(title), len(abstract), len(keywords), len(authors),
         len(venue) (0 when venue is None),
         int(year) (DEFAULT_YEAR when year == ""),
         str(title), str(authors), str(abstract), keywords, str(venue)]

    Args:
        feat: Mapping with the keys 'title', 'abstract', 'keywords',
            'authors', 'venue' and 'year'.

    Returns:
        list: The feature row described above.

    Raises:
        KeyError: A required field is missing.
        TypeError: A field has no length (e.g. is None) or the year has an
            unsupported type.
        ValueError: The year cannot be converted to int.
    """
    venue = feat["venue"]
    year = feat['year']
    return [
        len(feat['title']), len(feat['abstract']), len(feat['keywords']),
        len(feat['authors']),
        0 if venue is None else len(venue),
        DEFAULT_YEAR if year == "" else int(year),
        str(feat['title']), str(feat['authors']), str(feat['abstract']),
        feat['keywords'], str(venue),
    ]


train_feats=[]
labels=[]

# The author list is only used through its length and string form here; the
# original note says its entry counts are inconsistent, so it is not expanded.
for author_id, person_info in train_author.items():
    # 'normal_data' papers are positive samples (1), 'outliers' are negative (0).
    for source_key, label in (('normal_data', 1), ('outliers', 0)):
        for text_id in person_info[source_key]:
            feat=pid_to_info[text_id]
            try:
                row = _paper_features(feat)
            except (KeyError, TypeError, ValueError):
                # Malformed record: report it and skip BOTH the row and its
                # label, so train_feats and labels always stay the same length.
                print(feat)
                continue
            train_feats.append(row)
            labels.append(label)

valid_feats=[]
for author_id, person_info in valid_author.items():
    for text_id in person_info['papers']:
        feat=pid_to_info[text_id]
        try:
            valid_feats.append(_paper_features(feat))
        except (KeyError, TypeError, ValueError):
            # NOTE(review): a skipped paper leaves valid_feats shorter than the
            # list of validation papers; confirm that downstream code tolerates it.
            print(feat)

In [None]:
# Column names shared by both frames: six numeric features ("0".."5") followed
# by the raw text fields.
raw_feature_columns = ["0", "1", "2", "3", "4", "5", "title","authors", "abstract", "keywords", "venue"]

train_feats = pd.DataFrame(train_feats)
train_feats.columns = list(raw_feature_columns)
train_feats["label"] = labels

valid_feats = pd.DataFrame(valid_feats)
valid_feats.columns = list(raw_feature_columns)

labels_shape = np.array(labels).shape
print(f"train_feats.shape:{train_feats.shape},labels.shape:{labels_shape}")
print(f"valid_feats.shape:{valid_feats.shape}")

In [None]:
def replace_empty_list_with_nan(value):
    """Map an empty list to NaN; return any other value unchanged."""
    if not isinstance(value, list) or value:
        return value
    return np.nan
    
def _join_keywords(value):
    """Comma-join a keyword list; leave any non-list value unchanged."""
    return ','.join(value) if isinstance(value, list) else value


# Empty lists and empty strings become NaN, every NaN is then filled with the
# string "0", and finally keyword lists are flattened into one string.
train_feats = (
    train_feats.applymap(replace_empty_list_with_nan)
    .replace("", np.nan)
    .fillna("0")
)
train_feats["keywords"] = train_feats["keywords"].apply(_join_keywords)

valid_feats = (
    valid_feats.applymap(replace_empty_list_with_nan)
    .replace("", np.nan)
    .fillna("0")
)
valid_feats["keywords"] = valid_feats["keywords"].apply(_join_keywords)

# 特征工程

## 构造文字向量特征

### TFIDF

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from scipy import sparse

In [None]:
# Vocabulary cap (max_features) of the TF-IDF vectorizer for each text column.
tfidf_max_features = {"title": 5, "abstract": 10, "keywords": 5, "venue": 3}
col_list = list(tfidf_max_features)

# Each vectorizer is fitted once on the training column and then applied to both
# the training and the validation column.
train_tfidf_blocks = []
valid_tfidf_blocks = []
for col in col_list:
    tfidfvec = TfidfVectorizer(
        # stop_words=ENGLISH_STOP_WORDS,
        ngram_range=(1,1),
        max_features=tfidf_max_features[col])
    tfidfvec.fit(train_feats[col])
    train_tfidf_blocks.append(tfidfvec.transform(train_feats[col]))
    valid_tfidf_blocks.append(tfidfvec.transform(valid_feats[col]))

# Reusing the source frame's index keeps the column-wise concat row-aligned.
train_tfidf = pd.DataFrame(sparse.hstack(train_tfidf_blocks).toarray(), index=train_feats.index)
train_tfidf.columns = ["tfidf_"+str(i)for i in train_tfidf.columns]
train_feats = pd.concat([train_feats, train_tfidf],axis=1)

valid_tfidf = pd.DataFrame(sparse.hstack(valid_tfidf_blocks).toarray(), index=valid_feats.index)
valid_tfidf.columns = ["tfidf_"+str(i)for i in valid_tfidf.columns]
valid_feats = pd.concat([valid_feats, valid_tfidf],axis=1)

print(f"train_feats.shape:{train_feats.shape}")
print(f"valid_feats.shape:{valid_feats.shape}")

### 嵌入特征
- 这个方法会为每个单词创建一个向量，所以针对一句话要么将全部单词拼接，要么求平均。但是计算速度很慢

In [None]:
import gensim

In [None]:
# Each abstract is split on single spaces into one token list ("sentence").
abstract_sentences = [abstract.split(" ") for abstract in train_feats["abstract"]]

# Train Word2Vec on the abstracts; every other hyper-parameter keeps its default.
model = gensim.models.Word2Vec(
    abstract_sentences,
    window=5,
    min_count=5,
    workers=4,
)

def mean_w2v(x, model, size=5):
    """Average the word vectors of every in-vocabulary word of a text.

    Args:
        x: Text whose words are separated by single spaces.
        model: Object exposing ``model.wv``, which must support ``word in wv``
            and ``wv[word]`` (e.g. a trained gensim Word2Vec model).
        size: Fallback vector dimension, used only when ``model.wv`` has no
            ``vector_size`` attribute. Otherwise the model's own dimension is
            used, so the accumulator always matches the stored vectors.

    Returns:
        numpy.ndarray: Mean vector of the known words, or an all-zero vector
        when none of the words is in the vocabulary.
    """
    keyed_vectors = model.wv
    dim = getattr(keyed_vectors, "vector_size", size)
    total = np.zeros(dim)
    hits = 0
    for word in x.split(" "):
        # Membership test on the keyed vectors instead of the `.vocab`
        # attribute, which newer gensim releases no longer provide.
        if word in keyed_vectors:
            total += keyed_vectors[word]
            hits += 1
    if hits == 0:
        # No known word: return zeros rather than dividing by zero.
        return total
    return total / hits
    
def get_mean_w2v(df_data, columns, model, size):
    """Compute the mean word vector of one text column, row by row.

    Args:
        df_data: DataFrame holding the text column.
        columns: Name of the column whose values are passed to ``mean_w2v``.
        model: Word-vector model forwarded to ``mean_w2v``.
        size: Vector size forwarded to ``mean_w2v``.

    Returns:
        pandas.DataFrame: One row per input row, one column per vector
        component.
    """
    vectors = [mean_w2v(text, model, size) for text in df_data[columns]]
    return pd.DataFrame(vectors)

df_embeeding  = get_mean_w2v(train_feats, "abstract", model, 5)

### transformer特征

In [None]:
pretrained_dir = '../user_data/chinese-bert-wwm-ext/'
config_path = pretrained_dir+'config.json'
dict_path = pretrained_dir+'vocab.txt'
tokenizer = Tokenizer(dict_path, do_lower_case=True)


def _abstract_token_ids(texts):
    """Encode each text with the module-level tokenizer and keep the token ids.

    Only the first element of the pair returned by ``tokenizer.encode`` is
    kept. ``maxlen=10`` presumably caps the encoded length (confirm against
    the bert4torch documentation).
    """
    rows = []
    for text in texts:
        ids, _unused = tokenizer.encode(text, maxlen=10)
        rows.append(ids)
    return rows


# Training set: one "tokenizer_<i>" column per token position.
tokenizer_train = _abstract_token_ids(train_feats["abstract"].values)
tokenizer_traindf = pd.DataFrame(tokenizer_train)
tokenizer_traindf.columns = [f"tokenizer_{i}" for i in tokenizer_traindf.columns]
train_feats = pd.concat([train_feats, tokenizer_traindf],axis=1)
print(f"train_feats.shape:{train_feats.shape}")

# Validation set: same encoding.
tokenizer_val = _abstract_token_ids(valid_feats["abstract"].values)
tokenizer_valdf = pd.DataFrame(tokenizer_val)
tokenizer_valdf.columns = [f"tokenizer_{i}" for i in tokenizer_valdf.columns]
valid_feats = pd.concat([valid_feats, tokenizer_valdf],axis=1)
print(f"valid_feats.shape:{valid_feats.shape}")

### CountVectorizer特征
- 这个特征和TFIDF差不多

In [None]:
# data["name_list"] = data["name_list"].apply(lambda x : x.replace('[', '').replace(']', '').replace(',', ' '))
# data["org_list"] = data["org_list"].apply(lambda x : x.replace('[', '').replace(']', '').replace(',', ' '))

# col_list = ["name_list", "org_list"]
# max_features = [3, 3]
# new_feats_ = pd.DataFrame()
# for e1, col in enumerate(col_list):
#     count_vectorizer = CountVectorizer(max_features=max_features[e1] ,token_pattern=r'\b\d+\b')
#     count_matrix = pd.DataFrame(count_vectorizer.fit_transform(data[col]).toarray())
#     new_feats_ = pd.concat([new_feats_, count_matrix], axis=1)
# new_feats_.columns = ["CountVectorizer_"+ str(i) for i in range(sum(max_features))]

# data = pd.concat([data, new_feats_], axis=1)
# print(data.shape)

## 计算本论文和同作者其他论文的文本相似度
### Jaccard相似性、编辑距离、余弦相似度
- 编辑距离计算很慢

In [None]:
import nltk
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
nltk.download('punkt')  # download the tokenizer model required by nltk.word_tokenize

In [None]:
# author ID -> [number of papers] (own papers plus outliers) for the training set.
train_dict = {
    author_id: [len(info['normal_data']) + len(info['outliers'])]
    for author_id, info in train_author.items()
}
all_train_num = sum(paper_counts[0] for paper_counts in train_dict.values())

# author ID -> [number of papers] for the validation set.
val_dict = {
    author_id: [len(info['papers'])]
    for author_id, info in valid_author.items()
}
all_val_num = sum(paper_counts[0] for paper_counts in val_dict.values())

In [None]:
# For every author, build two square matrices over that author's titles:
# TF-IDF cosine similarity and token-set Jaccard similarity.
# NOTE(review): `train_data_title` is not defined in the visible cells; it is
# assumed to hold one collection of title strings per author.
Jaccard_Similarity_all = []
Cosine_Similarity = []

for i in range(len(train_data_title)):  # one iteration per author (779 per the original note)
    print(i)
    titles = list(train_data_title[i])

    vectorizer = TfidfVectorizer(max_df=0.95, max_features=30)
    tfidf_matrix = vectorizer.fit_transform(titles)
    cosine_similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    Cosine_Similarity.append(cosine_similarity_matrix)

    # Tokenise each title once; the pairwise loop below only compares the sets,
    # so word_tokenize runs n times per author instead of n + n*n times.
    token_sets = [set(nltk.word_tokenize(title)) for title in titles]
    jaccard_similarity = [
        [1 - jaccard_distance(words1, words2) for words2 in token_sets]
        for words1 in token_sets
    ]
    Jaccard_Similarity_all.append(jaccard_similarity)

In [None]:
author_indices = range(len(train_data_title))

# Mean Jaccard similarity of each paper against every paper of the same author.
df_js = [
    pd.Series(similarity_row).mean()
    for author_idx in author_indices
    for similarity_row in Jaccard_Similarity_all[author_idx]
]

# Same aggregation for the cosine-similarity matrices.
df_cs = [
    pd.Series(similarity_row).mean()
    for author_idx in author_indices
    for similarity_row in Cosine_Similarity[author_idx]
]

# Two columns (0: Jaccard mean, 1: cosine mean), one row per paper.
yhl = pd.DataFrame([df_js, df_cs]).T
display(yhl.head())
yhl.to_csv("../user_data/title_data.csv", index=0)

In [None]:
def process_row(row, new_data):
    """Return the next queued value for a row that has a title, else NaN.

    When ``row['title']`` is not missing, the first element of ``new_data`` is
    removed from the list and returned, so the list is consumed and a full pass
    can be run only once per list. Rows with a missing title get NaN and leave
    ``new_data`` untouched.
    """
    if pd.isna(row['title']):
        return np.nan
    return new_data.pop(0)

# Attach the per-paper similarity means to the rows that have a title, in row
# order; rows without a title get NaN. Unlike a pop-based row-wise apply, this
# leaves df_js / df_cs intact, so the cell can be re-run safely.
# NOTE(review): `train_data` is not defined in the visible cells.
has_title = train_data['title'].notna()
n_titled = int(has_title.sum())
for target_col, scores in (('title_js', df_js), ('title_cs', df_cs)):
    if len(scores) < n_titled:
        raise ValueError(
            f"{target_col}: {len(scores)} scores for {n_titled} rows with a title")
    train_data[target_col] = np.nan
    train_data.loc[has_title, target_col] = list(scores[:n_titled])

## 暴力特征

In [None]:
print(data.shape)
feat_col = ['len(title)', 'len(abstract)', 'len(keywords)', 'len(authors)', 'len(venue)'] + [
    "title_js", "title_cs", "abs_js", "abs_cs", "key_js", "key_cs", "venu_js"]+["year"]

# Collect every derived column first and attach them with a single concat;
# inserting hundreds of columns one at a time fragments the DataFrame.
derived_columns = {}
for i in feat_col:
    derived_columns[i+"_cos"] = np.cos(data[i])
    derived_columns[i+"_sin"] = np.sin(data[i])
    # NOTE(review): the column is named "_tan" but holds tanh(x); the name is
    # kept unchanged in case downstream code refers to it.
    derived_columns[i+"_tan"] = np.tanh(data[i])
    for j in feat_col:
        if i != j:
            # Both (i, j) and (j, i) are generated, as in the original feature set.
            derived_columns[i+"*"+j] = data[i] * data[j]
            derived_columns[i+"+"+j] = data[i] + data[j]
            derived_columns[i+"-"+j] = data[i] - data[j]

derived_frame = pd.concat(derived_columns, axis=1)
# Drop stale copies of the derived columns first so that re-running the cell
# replaces them instead of duplicating them.
data = pd.concat(
    [data.drop(columns=derived_frame.columns, errors="ignore"), derived_frame],
    axis=1,
)
print(data.shape)


# Mean-impute the base feature columns, then expand them into polynomial terms
# of degree <= 3.
imputer = SimpleImputer(strategy='mean')
new_data_featcol = imputer.fit_transform(data[feat_col])
# Reuse data's index: with a default RangeIndex the column-wise concat below
# would misalign rows whenever `data` carries any other index.
new_data_featcol = pd.DataFrame(
    PolynomialFeatures(degree=3).fit_transform(new_data_featcol),
    index=data.index,
)
new_data_featcol.columns = ["Poly" + str(i) for i in range(new_data_featcol.shape[1])]
print(new_data_featcol.shape)

data = pd.concat([data, new_data_featcol], axis=1)
print(data.shape)

# Reload the training authors and the submission template, with an explicit
# UTF-8 encoding so the result does not depend on the platform default.
with open("../raw_data/train_author.json", encoding="utf8") as f:
    train_author=json.load(f)

with open("../raw_data/ind_valid_author_submit.json", encoding="utf8") as f:
    submission=json.load(f)


# One label per training paper, in author order: 1 for every 'normal_data'
# paper (positive sample) followed by 0 for every 'outliers' paper (negative).
labels=[]
for person_info in train_author.values():
    labels.extend([1] * len(person_info['normal_data']))
    labels.extend([0] * len(person_info['outliers']))

# Class balance: counts per label, then the negative/positive ratio.
label_counts = pd.Series(labels).value_counts()
print(label_counts,"\n",
      label_counts[0] / label_counts[1])

## groupby特征

In [None]:
def _add_group_stats(frame, keys, cols, stats, tag):
    """Left-merge per-group aggregate columns onto ``frame``.

    For every column in ``cols`` and every aggregation name in ``stats`` a
    column ``f'{col}_{tag}_{stat}'`` is added, holding that aggregate of
    ``col`` within the row's ``keys`` group. All aggregates of one grouping are
    computed from a single groupby and attached with a single merge.

    Args:
        frame: Source DataFrame.
        keys: List of column names to group by.
        cols: Columns to aggregate.
        stats: Aggregation names understood by ``GroupBy.agg``.
        tag: Infix used in the generated column names.

    Returns:
        pandas.DataFrame: ``frame`` with the new columns appended (column-major
        order: all stats of the first column, then the next column, ...).
    """
    grouped = frame.groupby(keys)
    stat_columns = {
        f'{col}_{tag}_{m}': grouped[col].agg(m)
        for col in cols
        for m in stats
    }
    stats_frame = pd.concat(stat_columns, axis=1).reset_index()
    return frame.merge(stats_frame, on=keys, how='left')


group_stats = ['count', 'sum', "max", "min", "mean", "std", "median","var"]

# Statistics of every feature within each (year, venue) group.
data = _add_group_stats(data, ['year', "venue"], feat_col+["org_number"], group_stats, 'venue_year')

# Statistics of every feature within each (auth_id, venue) group.
data = _add_group_stats(data, ['auth_id', "venue"], feat_col+["org_number"], group_stats, 'auth_id_venue')

# Publication-year statistics within each venue.
data = _add_group_stats(data, ["venue"], ["year"], ["max", "min", "mean", "std", "median","var"], 'venueyear')

print(data.shape)