In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os
import csv
import random
import time

In [None]:
'''
功能: 合并两个dict
input: 
    dict1
    dict2
output: 
    合并后dict
'''
def dict_union(d1, d2):
    """Merge two dictionaries, summing the values stored under shared keys.

    A key that exists in only one of the inputs keeps that single value
    (the other dictionary contributes 0 to the sum).
    """
    return {
        key: sum((d1.get(key, 0), d2.get(key, 0)))
        for key in d1.keys() | d2.keys()
    }


'''
功能：统计dataframe信息
input: 
    dataframe
output: 
    dict{'U_id': value_counts.to_dict(), 'I_id': value_counts.to_dict(), 'label': value_counts.to_dict(), 'interactions': #interactions}
'''
def count_data_info(df):
    """Collect value-count statistics of an interaction dataframe.

    Args:
        df: dataframe with the columns 'U_id', 'I_id' and 'label'.

    Returns:
        dict with the keys
            'U_id', 'I_id', 'label': {value: number of rows}, NaN excluded;
            'interactions': total number of rows with a non-NaN 'U_id'.
        An empty dataframe yields empty count dicts and 0 interactions.
    """
    statistic_dict = {
        column: df[column].value_counts(normalize=False, dropna=True).to_dict()
        for column in ('U_id', 'I_id', 'label')
    }
    # Sum the per-user counts directly; going through a 2-D array of
    # (id, count) pairs fails with IndexError when there are no rows.
    per_user_counts = np.asarray(list(statistic_dict['U_id'].values()), dtype=int)
    statistic_dict['interactions'] = per_user_counts.sum()
    return statistic_dict


'''
功能: 筛选warm users warm items / warm users cold items / cold users warm items / cold users cold items
input: 
    被筛选dataframe
    train dataframe的统计信息count_data_info(train_df)
output: 
    筛选后只包含warm users warm items / warm users cold items / cold users warm items / cold users cold items的dataframe
'''
def warm_user_warm_item(df,train_statistic_dict):
    """Return the rows whose user AND item both occur in the training statistics.

    `train_statistic_dict` is the output of count_data_info(train_df); only
    its 'U_id' and 'I_id' keys are consulted. The result is an independent
    dataframe with a fresh 0..n-1 index.
    """
    user_is_warm = df['U_id'].isin(train_statistic_dict['U_id'])
    item_is_warm = df['I_id'].isin(train_statistic_dict['I_id'])
    return df.loc[user_is_warm & item_is_warm].reset_index(drop=True)

def warm_user_cold_item(df,train_statistic_dict):
    """Return the rows whose user occurs in the training statistics but whose item does not.

    `train_statistic_dict` is the output of count_data_info(train_df); only
    its 'U_id' and 'I_id' keys are consulted. The result is an independent
    dataframe with a fresh 0..n-1 index.
    """
    user_is_warm = df['U_id'].isin(train_statistic_dict['U_id'])
    item_is_cold = ~df['I_id'].isin(train_statistic_dict['I_id'])
    return df.loc[user_is_warm & item_is_cold].reset_index(drop=True)

def cold_user_warm_item(df,train_statistic_dict):
    """Return the rows whose item occurs in the training statistics but whose user does not.

    `train_statistic_dict` is the output of count_data_info(train_df); only
    its 'U_id' and 'I_id' keys are consulted. The result is an independent
    dataframe with a fresh 0..n-1 index.
    """
    user_is_cold = ~df['U_id'].isin(train_statistic_dict['U_id'])
    item_is_warm = df['I_id'].isin(train_statistic_dict['I_id'])
    return df.loc[user_is_cold & item_is_warm].reset_index(drop=True)

def cold_user_cold_item(df,train_statistic_dict):
    """Return the rows whose user AND item are both absent from the training statistics.

    `train_statistic_dict` is the output of count_data_info(train_df); only
    its 'U_id' and 'I_id' keys are consulted. The result is an independent
    dataframe with a fresh 0..n-1 index.
    """
    user_is_warm = df['U_id'].isin(train_statistic_dict['U_id'])
    item_is_warm = df['I_id'].isin(train_statistic_dict['I_id'])
    # "neither warm" is the same as "cold user and cold item" (De Morgan).
    return df.loc[~(user_is_warm | item_is_warm)].reset_index(drop=True)


'''
功能: 读取/生成并保存 ui_to_id_dict, id_to_ui_dict
input: 
    not_sparse_dict {'U_id': set(Uid), 'I_id': set(Iid)}
    target_path 读取/保存 路径
output: 
    ui_to_id_dict {'U_id':{Uid:mapped_Uid}, 'I_id':{Iid:mapped_Iid}}
    id_to_ui_dict {'U_id':{mapped_Uid:Uid}, 'I_id':{mapped_Iid:Iid}}
'''
def save_ui_dict(not_sparse_dict,target_path):
    """Load, or build and save, the raw-id <-> contiguous-id mappings.

    If both '<target_path>/ui_to_id.dict' and '<target_path>/id_to_ui.dict'
    exist they are unpickled and returned; `not_sparse_dict` is ignored in
    that case. Otherwise every user / item id is numbered 0..n-1 in
    iteration order, and both mappings plus '<target_path>/dataset_size.txt'
    ("<n_users> <n_items>") are written.

    Args:
        not_sparse_dict: {'U_id': iterable of user ids, 'I_id': iterable of item ids}.
        target_path: existing directory used for loading / saving.

    Returns:
        (ui_to_id_dict, id_to_ui_dict) where
            ui_to_id_dict = {'U_id': {raw: mapped}, 'I_id': {raw: mapped}}
            id_to_ui_dict = {'U_id': {mapped: raw}, 'I_id': {mapped: raw}}
    """
    forward_path = target_path + '/ui_to_id.dict'
    reverse_path = target_path + '/id_to_ui.dict'

    if os.path.exists(forward_path) and os.path.exists(reverse_path):
        print('loading from file')
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # cache files produced by this pipeline.
        with open(forward_path, 'rb') as f:
            ui_to_id_dict = pickle.load(f)
        with open(reverse_path, 'rb') as f:
            id_to_ui_dict = pickle.load(f)
        return ui_to_id_dict, id_to_ui_dict

    ui_to_id_dict = {'U_id': {}, 'I_id': {}}
    for field in ('U_id', 'I_id'):
        mapping = ui_to_id_dict[field]
        for raw_id in not_sparse_dict[field]:
            # First occurrence wins; the next free integer is the current size.
            if raw_id not in mapping:
                mapping[raw_id] = len(mapping)

    id_to_ui_dict = {
        field: {mapped: raw for raw, mapped in mapping.items()}
        for field, mapping in ui_to_id_dict.items()
    }

    # Context managers guarantee the handles are closed even if a dump fails.
    with open(forward_path, 'wb') as f:
        pickle.dump(ui_to_id_dict, f)
    with open(reverse_path, 'wb') as f:
        pickle.dump(id_to_ui_dict, f)
    with open(target_path + '/dataset_size.txt', 'w') as f:
        f.write("{} {}".format(str(len(ui_to_id_dict['U_id'])),str(len(ui_to_id_dict['I_id']))))

    return ui_to_id_dict,id_to_ui_dict


'''
功能: 将dataframe转换为 U_id,I_id 通过 ui_to_id_dict map过后的 Uid list(Iid) 形式分别保存正负样本
input: 
    dataframe
    ui_to_id_dict {'U_id':{Uid:mapped_Uid}, 'I_id':{Iid:mapped_Iid}}
    pos_traget_path,neg_traget_path 保存路径
'''
def df_to_uipair_txt(df,ui_to_id_dict,pos_traget_path,neg_traget_path):
    """Write the positive and negative interactions as 'uid<TAB>iid<TAB>iid...' lines.

    User and item ids are first mapped through `ui_to_id_dict`; the items of
    every (user, label) group are then joined into one line. Rows whose label
    stringifies to '1' go to `pos_traget_path`, rows with '0' go to
    `neg_traget_path`. `df` itself is not modified.

    Args:
        df: dataframe with the columns 'U_id', 'I_id' and 'label'.
        ui_to_id_dict: {'U_id': {raw: mapped}, 'I_id': {raw: mapped}}.
        pos_traget_path: output file for label '1'.
        neg_traget_path: output file for label '0'.

    Raises:
        KeyError: if an id of `df` is missing from `ui_to_id_dict`.
    """
    # Only these three columns are needed; avoid string-casting the whole frame.
    pairs = df[['U_id', 'I_id', 'label']].copy()
    pairs['U_id'] = pairs['U_id'].apply(lambda raw: ui_to_id_dict['U_id'][raw])
    pairs['I_id'] = pairs['I_id'].apply(lambda raw: ui_to_id_dict['I_id'][raw])
    pairs = pairs.astype(str)

    # Aggregating the selected column gives a Series indexed by (U_id, label)
    # on every pandas version, unlike groupby(as_index=False).apply.
    joined = (
        pairs.groupby(['U_id', 'label'])['I_id']
        .agg(lambda item_ids: '\t'.join(item_ids))
        .reset_index()
    )

    # The joined item field contains the separator itself; with QUOTE_NONE the
    # writer prefixes each inner tab with the escape character (a space).
    for label_value, out_path in (('1', pos_traget_path), ('0', neg_traget_path)):
        joined.loc[joined['label'] == label_value, ['U_id', 'I_id']].to_csv(
            out_path, header=None, index=False, sep='\t',
            quoting=csv.QUOTE_NONE, escapechar=' ')


'''
功能: 将dataframe转换为uipair_dict
input: 
    dataframe
    uipair_dict {U_id: list(I_id)}
'''
def df_to_uipair_dict(df):
    """Group the interactions by user.

    Returns:
        {U_id: [I_id, ...]} with the item ids in their row order in `df`.
    """
    items_per_user = df.loc[:, ['U_id', 'I_id']].groupby('U_id')['I_id'].apply(list)
    return items_per_user.to_dict()


'''
功能: 递归筛选item和user使得保留下来的每个item和user均拥有不小于k的interaction
input: 
    interaction_dict {U_id: list(I_id)}
    k 阈值
output:
    interaction_dict 筛选后的interaction_dict
    len(interaction_dict) 筛选后user个数
    item_num  筛选item个数
'''           
def _count_item_occurrences(interaction_dict):
    """Return {item_id: number of occurrences over all users' interaction lists}."""
    counts = {}
    for item_ids in interaction_dict.values():
        for item_id in item_ids:
            counts[item_id] = counts.get(item_id, 0) + 1
    return counts


def _is_kcore(interaction_dict, item_counts, K):
    """True when every remaining user and every remaining item has at least K interactions."""
    users_ok = all(len(item_ids) >= K for item_ids in interaction_dict.values())
    items_ok = all(count >= K for count in item_counts.values())
    return users_ok and items_ok


def select_kcore(interaction_dict,K=3):
    """Iteratively prune items and users until each has at least K interactions.

    Every round first removes the items that occur fewer than K times, then
    removes the users left with fewer than K interactions, and stops as soon
    as all remaining users and items satisfy the threshold.

    `interaction_dict` and the lists inside it are modified in place.

    Args:
        interaction_dict: {U_id: list(I_id)}.
        K: minimum number of interactions per user and per item.

    Returns:
        (interaction_dict, number of remaining users, number of remaining items)
    """
    while True:
        item_counts = _count_item_occurrences(interaction_dict)

        # Drop items occurring fewer than K times (set lookup instead of
        # repeated list.remove); slice assignment keeps the list objects.
        sparse_items = {item_id for item_id, count in item_counts.items() if count < K}
        if sparse_items:
            for item_ids in interaction_dict.values():
                item_ids[:] = [item_id for item_id in item_ids if item_id not in sparse_items]
            item_counts = _count_item_occurrences(interaction_dict)

        if _is_kcore(interaction_dict, item_counts, K):
            return interaction_dict, len(interaction_dict), len(item_counts)

        # Drop users left with fewer than K interactions.
        sparse_users = [u_id for u_id, item_ids in interaction_dict.items() if len(item_ids) < K]
        for u_id in sparse_users:
            del interaction_dict[u_id]

        # Recount so the reported item number reflects the user deletions too.
        item_counts = _count_item_occurrences(interaction_dict)
        if _is_kcore(interaction_dict, item_counts, K):
            return interaction_dict, len(interaction_dict), len(item_counts)


'''
功能: 调用select_kcore筛选 user item, 并得到筛选后的dataframe
input: 
    datframe
    interaction_dict {U_id: list(I_id)}
    k 阈值
output:
    df_remove_sparse 筛选后的dataframe
    not_sparse_dict {'U_id': set(Uid), 'I_id': set(Iid)}筛选后留下的user item
'''  
def remove_sparse_interaction(df,uipair_dict,K=3):
    """Run select_kcore and keep only the rows of `df` whose user and item survive.

    Args:
        df: dataframe with the columns 'U_id' and 'I_id'.
        uipair_dict: {U_id: list(I_id)}; handed to select_kcore as-is.
        K: threshold forwarded to select_kcore.

    Returns:
        (df_remove_sparse, not_sparse_dict) where not_sparse_dict is
        {'U_id': set of kept user ids, 'I_id': set of kept item ids}.
    """
    kept_interactions, _user_count, _item_count = select_kcore(uipair_dict, K)

    not_sparse_dict = {'U_id': set(kept_interactions), 'I_id': set()}
    for item_ids in kept_interactions.values():
        not_sparse_dict['I_id'].update(item_ids)

    user_kept = df['U_id'].isin(not_sparse_dict['U_id'])
    item_kept = df['I_id'].isin(not_sparse_dict['I_id'])
    return df.loc[user_kept & item_kept].copy(), not_sparse_dict


'''
功能: 读取Uid list(Iid) txt文件 转化为ui_pairs_dict
input: 
    filename 路径
output:
    ui_pairs_dict {Uid: list(Iid)}
'''
def get_dict(filename):
    """Read a 'uid<TAB>iid<TAB>iid...' text file into {uid: [iid, ...]}.

    Every field is parsed with int(), which tolerates surrounding spaces.
    Blank lines are skipped. If a user id appears on several lines the last
    line wins.

    Args:
        filename: path of the text file.

    Returns:
        ui_pairs_dict {Uid: list(Iid)} with integer ids.
    """
    ui_pairs_dict = {}
    # The context manager closes the file once reading is done.
    with open(filename) as pair_file:
        for line in pair_file:
            stripped = line.strip()
            if not stripped:
                continue
            user_id, *item_ids = (int(field) for field in stripped.split("\t"))
            ui_pairs_dict[user_id] = item_ids
    return ui_pairs_dict



## unique clean raw data

In [None]:
# Load the raw interaction log (tab-separated, no header row).
base_path = "/storage/wjwang/share/Huawei/Huawei_data_2023"
dcn_data_raw = pd.read_csv(base_path + '/dcn_data.txt', sep='\t', header=None)

# Drop the first three columns (0, 1, 2).
dcn_data_pro_df = dcn_data_raw.drop(columns=[0,1,2],inplace = False)
# Column 48 packs date, label, U_id, I_id and one trailing field separated by '#';
# split it into named columns, then discard the packed column and the trailing field.
dcn_data_pro_df[['date', 'label','U_id','I_id','Other']]= dcn_data_pro_df[48].str.split('#', expand=True)
dcn_data_pro_df.drop(columns=[48,'Other'],inplace = True)

# For every (U_id, I_id, label) triple keep only the earliest date.
# The dates are still strings here (they come from str.split), so min() compares them
# lexicographically; this matches chronological order only for fixed-width formats.
# NOTE(review): the DataFrame(..., columns=['date']).reset_index() step relies on
# groupby(as_index=False).apply returning a Series indexed by the group keys - confirm
# against the installed pandas version.
extracted_raw_unique_df = dcn_data_pro_df.groupby(['U_id', 'I_id', 'label'], as_index=False).apply(lambda x: min(x['date']))
extracted_raw_unique_df = pd.DataFrame(extracted_raw_unique_df,columns=['date'])
extracted_raw_unique_df = extracted_raw_unique_df.reset_index()

# For every (U_id, I_id) pair keep only the largest label, so that a positive sample
# wins over a negative one, then merge back to recover the date of that label.
temp_splited_unique = extracted_raw_unique_df.groupby(['U_id', 'I_id'], as_index=False)['label'].max()
temp_splited_unique = pd.DataFrame(temp_splited_unique,columns=['U_id', 'I_id', 'label'])
temp_splited_unique = temp_splited_unique.reset_index()[['U_id', 'I_id', 'label']]
extracted_raw_clean_df = pd.merge(temp_splited_unique,extracted_raw_unique_df,on = ['U_id', 'I_id', 'label'])

## item_feature concat

In [None]:
# Load the item-feature table and left-join it onto the interactions by I_id.

item_features_raw = pd.read_csv(base_path+'/item_feature.txt', sep='\t')

# The item table calls the item id 'doc_id'; rename it so that it matches the join key.
item_features_raw_temp = item_features_raw.rename(columns={'doc_id':'I_id'},inplace=False)

# NOTE(review): `dcn_raw_clean_df_temp` is not defined in any visible cell; the preceding
# cell produces `extracted_raw_clean_df`, which has only U_id/I_id/label/date - confirm
# which frame (with the raw feature columns) is meant to be merged here.
dcn_item_concated_df = pd.merge(dcn_raw_clean_df_temp,item_features_raw_temp,on = ['I_id'], how = 'left')

## Feature Processing

In [None]:
# Drop the features that Huawei does not use.
dcn_item_concated_df.drop(columns=[32,'static_quality_score'],inplace = True) 

# Build the masking table: feature values seen fewer than `mincount` times are always
# masked, and every remaining value is additionally masked with probability `maskrate`
# (0.01, i.e. 1%: random.randint(1,10000) <= maskrate*10000).
mincount = 10
maskrate = 0.01
# {feature: {original value: replacement value}}
mask_feature_dict = {}

# The table is cached in the working directory; the random draws below are not seeded,
# so the cache is what keeps the masking stable between runs.
if os.path.exists('./mincount{}_maskrate{}_mask_feature.dict'.format(str(mincount),str(maskrate))):
    print('loading from file')
    with open('./mincount{}_maskrate{}_mask_feature.dict'.format(str(mincount),str(maskrate)),'rb') as f:
        mask_feature_dict = pickle.load(f)
else:
    print('generating mask_feature_dict')
    # NOTE(review): `raw_feature_analyse_dict` and `item_feature_analyse_dict` are not
    # defined in the visible cells. Each entry's ['info'] is used through .values/.index,
    # so it is presumably a pandas Series of value counts - confirm.
    for i in raw_feature_analyse_dict.keys():
        # Columns 7, 30, 31, 34 are binned in a later cell; 32 is dropped above and 48
        # was the packed id column - none of them is masked.
        if i not in [7,30,31,34,32,48]:
            if i not in [3,4]:
                # String placeholders for every column except 3 and 4.
                mask_feature_dict[i] = {}
                for k in list(raw_feature_analyse_dict[i]['info'][raw_feature_analyse_dict[i]['info'].values < mincount].index): 
                    mask_feature_dict[i][k] = 'mincount_feature'
                for k in list(raw_feature_analyse_dict[i]['info'][raw_feature_analyse_dict[i]['info'].values >= mincount].index): 
                    if random.randint(1,10000)<= maskrate*10000:
                        mask_feature_dict[i][k] = 'maskrate_feature'
            else:
                # Columns 3 and 4 get numeric placeholders instead (-999 / -9999).
                mask_feature_dict[i] = {}
                for k in list(raw_feature_analyse_dict[i]['info'][raw_feature_analyse_dict[i]['info'].values < mincount].index): 
                    mask_feature_dict[i][k] = -999
                for k in list(raw_feature_analyse_dict[i]['info'][raw_feature_analyse_dict[i]['info'].values >= mincount].index): 
                    if random.randint(1,10000)<= maskrate*10000:
                        mask_feature_dict[i][k] = -9999

    # Same procedure for the item features.
    for i in item_feature_analyse_dict.keys():
        if i not in ['static_quality_score']:
            if i not in ['ori_cpid', 'exposure_times', 'ctr_score']:
                # String placeholders.
                mask_feature_dict[i] = {}
                for k in list(item_feature_analyse_dict[i]['info'][item_feature_analyse_dict[i]['info'].values < mincount].index):
                    mask_feature_dict[i][k] = 'mincount_feature'
                for k in list(item_feature_analyse_dict[i]['info'][item_feature_analyse_dict[i]['info'].values >= mincount].index):
                    if random.randint(1,10000)<= maskrate*10000:
                        mask_feature_dict[i][k] = 'maskrate_feature'
            else:
                # 'ori_cpid', 'exposure_times' and 'ctr_score' get numeric placeholders.
                mask_feature_dict[i] = {}
                for k in list(item_feature_analyse_dict[i]['info'][item_feature_analyse_dict[i]['info'].values < mincount].index):
                    mask_feature_dict[i][k] = -999
                for k in list(item_feature_analyse_dict[i]['info'][item_feature_analyse_dict[i]['info'].values >= mincount].index):
                    if random.randint(1,10000)<= maskrate*10000:
                        mask_feature_dict[i][k] = -9999


    # Persist the freshly generated table.
    statistic_dict_file = open('./mincount{}_maskrate{}_mask_feature.dict'.format(str(mincount),str(maskrate)), 'wb') 
    pickle.dump(mask_feature_dict,statistic_dict_file)
    statistic_dict_file.close()

In [None]:
# Some continuous features are discretised with the bin edges supplied by Huawei.
# {column: bin edges}
bins = {34:[-1,0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,10.0,20.0,30.0,9999],\
        31:[-1,0,1.01,1.25,1.34,1.5,2.0,2.32,3.0,3.5,4.0,4.49,4.99,5.99,7.99,10.49,33.53,49.59,9999],\
        30:[-1,0,54.76,63.08,64.13,64.95,70.94,74.05,74.43,75.15,75.71,9999],\
        7:[-1,0,1.66,3.37,10.26,12.91,14.59,25.71,27.85,42.29,73.34,9999],\
       }

# These literal strings are replaced by '-1' in columns 30, 31 and 34 before the float
# cast below. NOTE(review): they look like column-header text that leaked into the data
# rows - confirm.
default_replace_dict = {30:{'user_feed_avg_doc_max_completion_daily_x':'-1'},\
                  31:{'user_feed_avg_vv_daily_x':'-1'},\
                  34:{'user_last_used_since_today_days_x':'-1'}}

dcn_item_concated_df.replace(default_replace_dict,inplace=True)

dcn_item_concated_df[[7,30,31,34]] = dcn_item_concated_df[[7,30,31,34]].astype(float)

# right=False makes every bin left-closed, [edge_i, edge_i+1); the bins are labelled
# 0 .. len(edges)-2.
for k,v in bins.items():
    dcn_item_concated_df[k] = pd.cut(dcn_item_concated_df[k],v,include_lowest=True,right = False,labels = [i for i in range(len(v)-1)])

# Apply the masking table column by column: a value that is a key of the column's table
# is replaced by its placeholder, every other value is kept. The elapsed time per column
# is printed in milliseconds.
for k,v in mask_feature_dict.items():
    print('Replacing ',k,':')
    start_time = time.time()
    df_mat = dcn_item_concated_df[k].values
    df_mat[:] = [v[x] if x in v else x for x in df_mat]
    dcn_item_concated_df.loc[:,k]=df_mat
    end_time = time.time()
    print('time cost:',float(end_time-start_time)*1000,'ms')

## 去除sparse user, 拆分train valid test

In [None]:
# Remove sparse users, i.e. users whose number of interactions is less than or equal
# to `sparse`.
dcn_data_clean_7days_dropped = dcn_item_concated_df.copy()

# In the visible code `top` is only used to build the output directory name.
top=0
sparse = 10

# Count the rows per user (the I_id column of the grouped count is used as the counter)
# and keep the users with strictly more than `sparse` interactions.
uid_inter_count_7days_dropped = dcn_data_clean_7days_dropped.groupby('U_id',as_index=False).count()
uid_inter_count_7days_dropped = uid_inter_count_7days_dropped[['U_id','I_id']]
uid_inter_count_7days_dropped.rename(columns={'U_id':'U_id','I_id': 'count'}, inplace=True)
uid_inter_count_7days_dropped = uid_inter_count_7days_dropped[uid_inter_count_7days_dropped['count']>sparse]
dcn_data_clean_7days_dropped_sparse = dcn_data_clean_7days_dropped.loc[dcn_data_clean_7days_dropped['U_id'].isin(uid_inter_count_7days_dropped['U_id'].values)]

# Parse the YYYYMMDD date strings into timestamps.
# NOTE(review): this assigns into the result of a .loc selection and may emit a
# SettingWithCopyWarning - confirm whether an explicit .copy() is wanted above.
dcn_data_clean_7days_dropped_sparse['date'] = pd.to_datetime(dcn_data_clean_7days_dropped_sparse['date'],format='%Y%m%d')

# Output directory of the exported dataset; created if missing.
target_path = '/storage/shjing/xinyang/Huawei_data_2023/datasets/feature-0125-0131_itemfeature_mincount{}_maskrate{}_total_usersparse{}_top{}_temp'.format(str(mincount),str(maskrate),str(sparse),str(top))
if not os.path.exists(target_path):
    os.makedirs(target_path)

df_alldays_unique_remove_sparse = dcn_data_clean_7days_dropped_sparse.copy()

# k-core filtering: returns the dataframe (still with the original ids) and the sets of
# kept users/items. The threshold passed here is 0, which every user and item satisfies.
remove_top_uipair_alldays_dict = df_to_uipair_dict(df_alldays_unique_remove_sparse)
df_alldays_unique_remove_sparse, not_sparse_dict = remove_sparse_interaction(df_alldays_unique_remove_sparse,remove_top_uipair_alldays_dict,0)

# Build (or load) the id map and its reverse map.
ui_to_id_dict_alldays_sparse,id_to_ui_dict_alldays_sparse = save_ui_dict(not_sparse_dict,target_path)

# Save and print the statistics of the final dataset.
df_alldays_unique_statistic_dict = count_data_info(df_alldays_unique_remove_sparse)
statistic_dict_file = open(target_path + '/df_all_unique_statistic.dict', 'wb') 
pickle.dump(df_alldays_unique_statistic_dict,statistic_dict_file)
statistic_dict_file.close()
print("Final Dataset:")
print("interactions:",df_alldays_unique_statistic_dict["interactions"],"\nusers:",len(df_alldays_unique_statistic_dict['U_id']),"\nitems:",len(df_alldays_unique_statistic_dict['I_id']),"\nlabels:",df_alldays_unique_statistic_dict['label'])

In [None]:
# Inclusive date ranges of the three splits: train covers 2023-01-25 .. 2023-01-29,
# validation is the single day 2023-01-30 and test is the single day 2023-01-31.
train_start_time =  pd.Timestamp('2023-01-25')
train_end_time = pd.Timestamp('2023-01-29')
val_start_time = pd.Timestamp("2023-01-30")
val_end_time = pd.Timestamp("2023-01-30")
test_start_time = pd.Timestamp("2023-01-31")
test_end_time = pd.Timestamp("2023-01-31")

In [None]:
# Build the train / valid / test splits, plus the warm/cold user-item subsets of valid
# and test, and export each of them (statistics pickle, printed summary, pos/neg txt).
def _slice_by_date(start_time, end_time):
    """Return a copy of the de-sparsified rows dated within [start_time, end_time]."""
    dates = df_alldays_unique_remove_sparse['date']
    return df_alldays_unique_remove_sparse[(dates>=start_time) & (dates<=end_time)].copy()


def _export_split(split_df, title, file_key):
    """Export one split and return its statistics.

    Writes '<target_path>/df_<file_key>_statistic.dict', prints a summary headed
    by '<title>:' and writes '<target_path>/<file_key>_pos.txt' / '_neg.txt'.
    """
    split_statistic_dict = count_data_info(split_df)
    # The context manager closes the handle even if pickling fails.
    with open(target_path + '/df_' + file_key + '_statistic.dict', 'wb') as statistic_file:
        pickle.dump(split_statistic_dict, statistic_file)
    print()
    print(title + ":")
    print("interactions:",split_statistic_dict["interactions"],"\nusers:",len(split_statistic_dict['U_id']),"\nitems:",len(split_statistic_dict['I_id']),"\nlabels:",split_statistic_dict['label'])
    df_to_uipair_txt(split_df,ui_to_id_dict_alldays_sparse,target_path+'/'+file_key+'_pos.txt',target_path+'/'+file_key+'_neg.txt')
    return split_statistic_dict


# train split; its statistics define which users / items count as "warm" below
df_train = _slice_by_date(train_start_time, train_end_time)
df_alldays_train_statistic_dict = _export_split(df_train, 'train', 'train')


# valid split
df_val = _slice_by_date(val_start_time, val_end_time)
df_alldays_val_statistic_dict = _export_split(df_val, 'val', 'val')

# valid: cold user, cold item
df_cuci_val = cold_user_cold_item(df_val,df_alldays_train_statistic_dict)
df_alldays_val_statistic_dict = _export_split(df_cuci_val, 'val_cuci', 'cuci_val')

# valid: warm user, cold item
df_wuci_val = warm_user_cold_item(df_val,df_alldays_train_statistic_dict)
df_alldays_val_statistic_dict = _export_split(df_wuci_val, 'val_wuci', 'wuci_val')

# valid: cold user, warm item
df_cuwi_val = cold_user_warm_item(df_val,df_alldays_train_statistic_dict)
df_alldays_val_statistic_dict = _export_split(df_cuwi_val, 'val_cuwi', 'cuwi_val')

# valid: warm user, warm item
df_wuwi_val = warm_user_warm_item(df_val,df_alldays_train_statistic_dict)
df_alldays_val_statistic_dict = _export_split(df_wuwi_val, 'val_wuwi', 'wuwi_val')


# test split
df_test = _slice_by_date(test_start_time, test_end_time)
df_alldays_test_statistic_dict = _export_split(df_test, 'test', 'test')

# test: cold user, cold item
df_cuci_test = cold_user_cold_item(df_test,df_alldays_train_statistic_dict)
df_alldays_test_statistic_dict = _export_split(df_cuci_test, 'test_cuci', 'cuci_test')

# test: warm user, cold item
df_wuci_test = warm_user_cold_item(df_test,df_alldays_train_statistic_dict)
df_alldays_test_statistic_dict = _export_split(df_wuci_test, 'test_wuci', 'wuci_test')

# test: cold user, warm item
df_cuwi_test = cold_user_warm_item(df_test,df_alldays_train_statistic_dict)
df_alldays_test_statistic_dict = _export_split(df_cuwi_test, 'test_cuwi', 'cuwi_test')

# test: warm user, warm item
df_wuwi_test = warm_user_warm_item(df_test,df_alldays_train_statistic_dict)
df_alldays_test_statistic_dict = _export_split(df_wuwi_test, 'test_wuwi', 'wuwi_test')



## unnormalized popularity 计算

In [None]:
# Stage bookkeeping: which stages belong to the train / valid / test phases.
print('generating stage_info_dict')
stage_info_dict = {}
train_stage = ['stage_'+str(i) for i in range(5)]
valid_stage = ['stage_'+str(i) for i in range(3,5)]
test_stage = ['stage_'+str(i) for i in range(4,6)]

stage_info_dict['train'] = train_stage
stage_info_dict['valid'] = valid_stage
stage_info_dict['test'] = test_stage

statistic_dict_file = open(target_path + '/stage_info_dict.dict', 'wb') 
pickle.dump(stage_info_dict,statistic_dict_file)
statistic_dict_file.close()

# One stage per day: stage_0 .. stage_5 correspond to 2023-01-25 .. 2023-01-30.
stage_days = [pd.Timestamp('2023-01-'+str(i)) for i in range(25,31)]
stage_to_day_map_dict = {}
day_to_stage_map_dict = {}
for i in range(len(stage_days)):
    stage_to_day_map_dict['stage_'+str(i)] = stage_days[i]
    day_to_stage_map_dict[stage_days[i]] = 'stage_'+str(i)


# Build stage_item_popularity_dict {stage: {I_id: popularity}}, where the popularity of
# an item is its share of that day's interactions (item count / total count of the day).
if os.path.exists(target_path + '/stage_item_popularity_dict.dict'):
    print('loading from file')
    with open(target_path + '/stage_item_popularity_dict.dict','rb') as f:
        stage_item_popularity_dict = pickle.load(f)
else:
    print('generating stage_item_popularity_dict')
    stage_item_popularity_dict = {}
    for i in range(len(stage_days)):
        selected_day = stage_days[i]
        df_oneday = df_alldays_unique_remove_sparse[['U_id','I_id']][df_alldays_unique_remove_sparse['date']==selected_day].copy()
        # Rows per item on this day; the U_id column of the count becomes 'count_<i>'.
        iid_inter_count_oneday = df_oneday.groupby('I_id',as_index=False).count()
        iid_inter_count_oneday = iid_inter_count_oneday[['U_id','I_id']]
        iid_inter_count_oneday.rename(columns={'I_id':'I_id','U_id': 'count_'+ str(i)}, inplace=True)
        interactions_oneday = iid_inter_count_oneday['count_'+ str(i)].sum()
        iid_inter_count_oneday['count_'+ str(i)] = iid_inter_count_oneday['count_'+ str(i)]/interactions_oneday
        iid_inter_count_oneday.set_index(['I_id'], inplace = True)
        stage_item_popularity_dict['stage_'+str(i)] = iid_inter_count_oneday.to_dict('dict')['count_'+ str(i)]

        # NOTE(review): this dump sits inside the loop, so the file is rewritten after
        # every stage; the final content is the complete dict.
        statistic_dict_file = open(target_path + '/stage_item_popularity_dict.dict', 'wb') 
        pickle.dump(stage_item_popularity_dict,statistic_dict_file)
        statistic_dict_file.close()

# Re-key the popularity by mapped item id: stage_iid_popularity_dict {stage: {Iid: popularity}}.
print('generating stage_iid_popularity_dict')
stage_iid_popularity_dict = {}
for k,v in stage_item_popularity_dict.items():
    stage_iid_popularity_dict[k] = {}
    for k1,v1 in v.items():
        stage_iid_popularity_dict[k][ui_to_id_dict_alldays_sparse['I_id'][k1]] = v1

statistic_dict_file = open(target_path + '/stage_iid_popularity_dict.dict', 'wb') 
pickle.dump(stage_iid_popularity_dict,statistic_dict_file)
statistic_dict_file.close()


# Per-stage default popularity stage_default_popularity_dict {stage: default_popularity}:
# 1 / (number of distinct items seen in that stage).
# NOTE(review): a stage without any interaction would divide by zero here - confirm
# that every stage day has data.
if os.path.exists(target_path + '/stage_default_popularity_dict.dict'):
    print('loading from file')
    with open(target_path + '/stage_default_popularity_dict.dict','rb') as f:
        stage_default_popularity_dict = pickle.load(f)
else:
    print('generating stage_default_popularity_dict')
    stage_default_popularity_dict = {}
    for i in range(len(stage_days)):
        stage_default_popularity_dict['stage_'+str(i)] = 1/len(stage_item_popularity_dict['stage_'+str(i)])

    statistic_dict_file = open(target_path + '/stage_default_popularity_dict.dict', 'wb') 
    pickle.dump(stage_default_popularity_dict,statistic_dict_file)
    statistic_dict_file.close()


# Popularity per mapped (uid, iid) pair - after the cleaning above each pair has a single
# date: mappedui_to_popularity_dict {(uid, iid): popularity}.
# NOTE(review): `df_alldays_unique_remove_sparse_encoded` is only assigned in the
# feature re-mapping cell that appears later in this file, so that cell has to be run first.
temp_df_uidate_popular = df_alldays_unique_remove_sparse_encoded[['U_id','I_id','date']]

if os.path.exists(target_path + '/mappedui_to_popularity_dict.dict'):
    print('loading from file')
    with open(target_path + '/mappedui_to_popularity_dict.dict','rb') as f:
        mappedui_to_popularity_dict = pickle.load(f)
else:
    print('generating mappedui_to_popularity_dict')
    mappedui_to_popularity_dict = {}
    for idx, row in temp_df_uidate_popular.iterrows():
        # Rows dated outside stage_days (e.g. 2023-01-31) are skipped.
        if row['date'] in day_to_stage_map_dict:
             mappedui_to_popularity_dict[(ui_to_id_dict_alldays_sparse['U_id'][row['U_id']],ui_to_id_dict_alldays_sparse['I_id'][row['I_id']])] = stage_item_popularity_dict[day_to_stage_map_dict[row['date']]][row['I_id']]

    statistic_dict_file = open(target_path + '/mappedui_to_popularity_dict.dict', 'wb') 
    pickle.dump(mappedui_to_popularity_dict,statistic_dict_file)
    statistic_dict_file.close()

## normalized popularity 计算

In [None]:
# Normalised variant of the popularity computation.
# NOTE(review): this cell reads and writes the same file names under target_path as the
# unnormalised cell ('stage_item_popularity_dict.dict', ...). Whichever variant ran first
# leaves caches that the other one loads instead of recomputing - confirm that only one
# variant is run per target_path.
if not os.path.exists(target_path):
    os.makedirs(target_path)

# Stage bookkeeping: which stages belong to the train / valid / test phases.
print('generating stage_info_dict')
stage_info_dict = {}
train_stage = ['stage_'+str(i) for i in range(5)]
valid_stage = ['stage_'+str(i) for i in range(3,5)]
test_stage = ['stage_'+str(i) for i in range(4,6)]

stage_info_dict['train'] = train_stage
stage_info_dict['valid'] = valid_stage
stage_info_dict['test'] = test_stage

statistic_dict_file = open(target_path + '/stage_info_dict.dict', 'wb') 
pickle.dump(stage_info_dict,statistic_dict_file)
statistic_dict_file.close()

# One stage per day: stage_0 .. stage_5 correspond to 2023-01-25 .. 2023-01-30.
stage_days = [pd.Timestamp('2023-01-'+str(i)) for i in range(25,31)]
stage_to_day_map_dict = {}
day_to_stage_map_dict = {}
for i in range(len(stage_days)):
    stage_to_day_map_dict['stage_'+str(i)] = stage_days[i]
    day_to_stage_map_dict[stage_days[i]] = 'stage_'+str(i)

# All interactions from the start of train to the end of validation.
df_train_valid = df_alldays_unique_remove_sparse[(df_alldays_unique_remove_sparse['date']>=train_start_time) & (df_alldays_unique_remove_sparse['date']<=val_end_time)].copy()

# NOTE(review): train_iid is the raw I_id column (one entry per interaction, duplicates
# included), so n_items is the number of rows rather than the number of distinct items,
# and the smoothing loop below revisits repeated ids - confirm whether unique ids were
# intended.
train_iid = df_train_valid['I_id'].values
n_items = len(train_iid)


if os.path.exists(target_path + '/stage_item_popularity_dict.dict'):
    print('loading from file')
    with open(target_path + '/stage_item_popularity_dict.dict','rb') as f:
        stage_item_popularity_dict = pickle.load(f)
else:
    print('generating stage_item_popularity_dict')
    stage_item_popularity_dict = {}
    interactions_stage = {}
    # Step 1: raw interaction count per item and total interaction count, per stage day.
    for i in range(len(stage_days)):
        selected_day = stage_days[i]
        df_oneday = df_alldays_unique_remove_sparse[['U_id','I_id']][df_alldays_unique_remove_sparse['date']==selected_day].copy()
        iid_inter_count_oneday = df_oneday.groupby('I_id',as_index=False).count()
        iid_inter_count_oneday = iid_inter_count_oneday[['U_id','I_id']]
        iid_inter_count_oneday.rename(columns={'I_id':'I_id','U_id': 'count_'+ str(i)}, inplace=True)
        interactions_oneday = iid_inter_count_oneday['count_'+ str(i)].sum()
        interactions_stage['stage_'+str(i)] = interactions_oneday
        iid_inter_count_oneday['count_'+ str(i)] = iid_inter_count_oneday['count_'+ str(i)]
        iid_inter_count_oneday.set_index(['I_id'], inplace = True)
        stage_item_popularity_dict['stage_'+str(i)] = iid_inter_count_oneday.to_dict('dict')['count_'+ str(i)]
    
    # Step 2: add-one smoothing over the train+valid item ids:
    # (1 + count) / (stage interactions + n_items), with count = 0 for items not seen
    # in the stage.
    temp_dict = {}
    for k,v in stage_item_popularity_dict.items():
        temp_dict[k] = {}
        for iid in train_iid:
            if iid in v:
                temp_dict[k][iid] = (1+stage_item_popularity_dict[k][iid])/(interactions_stage[k]+n_items)
            else:
                temp_dict[k][iid] = 1/(interactions_stage[k]+n_items)
    stage_item_popularity_dict=temp_dict.copy()
    
    # Step 3: min-max normalisation within each stage, (value - min) / (max - min).
    # NOTE(review): if all values of a stage are equal (or the stage dict is empty) the
    # denominator is zero / min() fails - confirm this cannot happen on the real data.
    # deno_normal_dict is filled but not used in the visible code.
    deno_normal_dict = {}
    temp_dict = {}
    for k,v in stage_item_popularity_dict.items():
        temp_dict[k] = {}
        deno_normal = max(list(v.values())) - min(list(v.values()))
        min_value = min(list(v.values()))
        deno_normal_dict[k] = deno_normal
        for k1,v1 in v.items():
            temp_dict[k][k1] = (v1-min_value)/deno_normal
    stage_item_popularity_dict=temp_dict.copy()
        
    statistic_dict_file = open(target_path + '/stage_item_popularity_dict.dict', 'wb') 
    pickle.dump(stage_item_popularity_dict,statistic_dict_file)
    statistic_dict_file.close()


# Re-key the popularity by mapped item id: {stage: {Iid: popularity}}.
print('generating stage_iid_popularity_dict')
stage_iid_popularity_dict = {}
for k,v in stage_item_popularity_dict.items():
    stage_iid_popularity_dict[k] = {}
    for k1,v1 in v.items():
        stage_iid_popularity_dict[k][ui_to_id_dict_alldays_sparse['I_id'][k1]] = v1

statistic_dict_file = open(target_path + '/stage_iid_popularity_dict.dict', 'wb') 
pickle.dump(stage_iid_popularity_dict,statistic_dict_file)
statistic_dict_file.close()

# Per-stage default popularity; 0 for every stage in the normalised variant.
if os.path.exists(target_path + '/stage_default_popularity_dict.dict'):
    print('loading from file')
    with open(target_path + '/stage_default_popularity_dict.dict','rb') as f:
        stage_default_popularity_dict = pickle.load(f)
else:
    print('generating stage_default_popularity_dict')
    stage_default_popularity_dict = {}
    for i in range(len(stage_days)):
        stage_default_popularity_dict['stage_'+str(i)] = 0
        
        
    statistic_dict_file = open(target_path + '/stage_default_popularity_dict.dict', 'wb') 
    pickle.dump(stage_default_popularity_dict,statistic_dict_file)
    statistic_dict_file.close()


# Popularity per mapped (uid, iid) pair: {(uid, iid): popularity}.
# NOTE(review): `df_alldays_unique_remove_sparse_encoded` is only assigned in the
# feature re-mapping cell that appears later in this file, so that cell has to be run first.
temp_df_uidate_popular = df_alldays_unique_remove_sparse_encoded[['U_id','I_id','date']]

if os.path.exists(target_path + '/mappedui_to_popularity_dict.dict'):
    print('loading from file')
    with open(target_path + '/mappedui_to_popularity_dict.dict','rb') as f:
        mappedui_to_popularity_dict = pickle.load(f)
else:
    print('generating mappedui_to_popularity_dict')
    mappedui_to_popularity_dict = {}
    for idx, row in temp_df_uidate_popular.iterrows():
        # Rows dated outside stage_days (e.g. 2023-01-31) are skipped.
        if row['date'] in day_to_stage_map_dict:
            ui_index = (ui_to_id_dict_alldays_sparse['U_id'][row['U_id']],ui_to_id_dict_alldays_sparse['I_id'][row['I_id']])
            mappedui_to_popularity_dict[ui_index] = stage_item_popularity_dict[day_to_stage_map_dict[row['date']]][row['I_id']]

    statistic_dict_file = open(target_path + '/mappedui_to_popularity_dict.dict', 'wb') 
    pickle.dump(mappedui_to_popularity_dict,statistic_dict_file)
    statistic_dict_file.close()

## 重新map feature

In [None]:
# Re-map the values of every feature column to integer codes.
def label_encode_columns(df, columns):
    """Fit one LabelEncoder per column and replace the column by its codes.

    `df` is modified in place and also returned.
    NOTE(review): `preprocessing` is not imported in the visible part of this
    file; presumably `from sklearn import preprocessing` - confirm.

    Returns:
        (df, encoders) with encoders = {column: fitted LabelEncoder}.
    """
    encoders = {}
    for col in columns:
        encoders[col] = preprocessing.LabelEncoder().fit(df[col])
        df[col] = encoders[col].transform(df[col])
    return df, encoders

def label_encode_columns_w_fit_encoders(df, columns, encoders):
    """Encode columns of ``df`` with already-fitted encoders.

    Args:
        df: table supporting ``df[col]`` reads and writes. It is modified in
            place.
        columns: iterable of column labels to encode.
        encoders: mapping from column label to an object exposing
            ``transform(values)`` (e.g. the dict returned by
            ``label_encode_columns``).

    Returns:
        The same ``df`` object, with every listed column replaced by the
        output of its encoder's ``transform``.

    Raises:
        KeyError: if ``encoders`` has no entry for one or more of ``columns``.
            The check runs before any column is touched, so ``df`` is left
            unmodified in that case.
    """
    # Materialise once so a one-shot iterable survives the validation pass.
    columns = list(columns)

    # Fail early with the offending column names instead of hitting an
    # AttributeError on None halfway through the in-place rewrite.
    missing = [col for col in columns if col not in encoders]
    if missing:
        raise KeyError('no fitted encoder for column(s): {!r}'.format(missing))

    for col in columns:
        df[col] = encoders[col].transform(df[col])
    return df

# Columns to label-encode: the integer-labelled columns 3..31 and 33..47
# (32 is left out), followed by the string-labelled columns.
encode_columns = list(range(3, 32)) + list(range(33, 48)) + [
    'entities', 'ori_cpid', 'category_tags', 'tags', 'topics', 'category_entities',
    'category', 'multi_label', 'doc_publish_time', 'title_tags', 'exposure_times', 'ctr_score',
]

# label_encode_columns returns the frame it was given together with the
# per-column encoders it fitted.
df_alldays_unique_remove_sparse_encoded, encoders = label_encode_columns(
    df=df_alldays_unique_remove_sparse,
    columns=encode_columns,
)

## 导出feature

In [None]:
# Prefix tables used when exporting features.  Each key is a column label of
# the encoded frame and each value is the string prepended to that column's
# value to form a feature token.  Insertion order is the order in which the
# tokens are emitted, so do not reorder entries.

# User-side columns.  Every prefix except the id itself carries the 'UF' tag;
# key 23 previously lacked it ('user_feed_read_doc_entities_30days'), which
# made it the only user feature not recognisable by its prefix.
u_feat_prefix = {
    'U_id': 'U',
    7: 'UFrefreshn',
    8: 'UFgender',
    9: 'UFforecast_age',
    10: 'UFdevice_price',
    11: 'UFprovince_dev',
    12: 'UFpush_online_days_30d_dev',
    13: 'UFpush_online_days_7d_dev',
    14: 'UFcity_new_dev',
    15: 'UFcity_new_grade_dev',
    16: 'UFuser_feed_click_doc_cats_30days',
    17: 'UFuser_feed_click_doc_topics_30days',
    18: 'UFuser_feed_click_doc_keys_30days',
    19: 'UFuser_feed_click_doc_entities_30days',
    20: 'UFuser_feed_read_doc_cats_30days',
    21: 'UFuser_feed_read_doc_topics_30days',
    22: 'UFuser_feed_read_doc_keys_30days',
    23: 'UFuser_feed_read_doc_entities_30days',
    24: 'UFuser_feed_click_svideo_cats_30days',
    25: 'UFuser_feed_click_svideo_keys_30days',
    26: 'UFuser_feed_play_svideo_cats_30days',
    27: 'UFuser_feed_play_svideo_keys_30days',
    28: 'UFuser_feed_user_life_cycle',
    29: 'UFuser_feed_avg_doc_reading_daily_bin',
    30: 'UFuser_feed_avg_doc_max_completion_daily',
    31: 'UFuser_feed_avg_vv_daily',
    33: 'UFuser_feed_avg_refresh_daily',
    34: 'UFuser_last_used_since_today_days',
    35: 'UFuser_feed_explore_doc_cats',
    36: 'UFuser_feed_explore_doc_topics',
    37: 'UFuser_feed_explore_doc_keys',
    38: 'UFuser_feed_click_app_cats',
}

# Item-side columns: integer-labelled columns use the 'IF' tag, the
# string-labelled columns use 'IFNew'.
i_feat_prefix = {
    'I_id': 'I',
    3: 'IFdtype',
    4: 'IFacinnerpos',
    5: 'IFe_ch',
    6: 'IFsource',
    39: 'IFentities',
    40: 'IFcategory_tags',
    41: 'IFtags',
    42: 'IFtopics',
    43: 'IFcategory_entities',
    44: 'IFmanual_category',
    45: 'IFstatic_quality_score_bin',
    46: 'IFctr_score_bin',
    47: 'IFexposure_times_bin',
    'entities': 'IFNewentities',
    'ori_cpid': 'IFNewori_cpid',
    'category_tags': 'IFNewcategory_tags',
    'tags': 'IFNewtags',
    'topics': 'IFNewtopics',
    'category_entities': 'IFNewcategory_entities',
    'category': 'IFNewcategory',
    'multi_label': 'IFNewmulti_label',
    'doc_publish_time': 'IFNewdoc_publish_time',
    'title_tags': 'IFNewtitle_tags',
    'exposure_times': 'IFNewexposure_times',
    'ctr_score': 'IFNewctr_score',
}

# No context (per user-item pair) columns are exported at present.
ctx_feat_prefix = {}


# Build and export the feature files for every user, item and (user, item) pair.
print()
print('Start generating feature file...')
user_feature = {}
item_feature = {}
ctx_feature = {}

# Hoisted once: raw id -> mapped id lookup tables.
user_id_map = ui_to_id_dict_alldays_sparse['U_id']
item_id_map = ui_to_id_dict_alldays_sparse['I_id']

# Each entry has the shape [[token, ...], [1, ...]]: a list of feature tokens
# and a parallel list of ones.  Only the first row seen for a given key
# contributes its feature values.
#
# iterrows() is kept on purpose: the tokens are built with str(row[column]),
# and that text depends on the dtype of the per-row Series, so changing the
# iteration method could change the generated tokens.
for _, row in df_alldays_unique_remove_sparse_encoded.iterrows():
    uid = user_id_map[row['U_id']]
    iid = item_id_map[row['I_id']]

    if uid not in user_feature:
        tokens = [
            'U' + str(uid) if column == 'U_id' else u_feat_prefix[column] + str(row[column])
            for column in u_feat_prefix
        ]
        user_feature[uid] = [tokens, [1] * len(tokens)]

    if iid not in item_feature:
        tokens = [
            'I' + str(iid) if column == 'I_id' else i_feat_prefix[column] + str(row[column])
            for column in i_feat_prefix
        ]
        item_feature[iid] = [tokens, [1] * len(tokens)]

    # Bug fix: the membership test used to look in item_feature, whose keys
    # are item ids rather than (user, item) tuples, so it was always true and
    # the context entry was rebuilt on every row.
    ui_key = (uid, iid)
    if ui_key not in ctx_feature:
        tokens = [ctx_feat_prefix[column] + str(row[column]) for column in ctx_feat_prefix]
        ctx_feature[ui_key] = [tokens, [1] * len(tokens)]

# np.save stores a dict as a pickled object array, so reading these files back
# requires np.load(path, allow_pickle=True).item().
np.save(target_path+'/user_feature.npy',user_feature)
np.save(target_path+'/item_feature.npy',item_feature)
np.save(target_path+'/ctx_feature.npy',ctx_feature)
print('Generating feature file done.')

# Convert the interaction .txt files into dicts stored as .npy files.
print()
print('Start generating interaction dict.npy...')

# Training split: train_{pos,neg}.txt -> training_{pos,neg}.npy
train_pos = get_dict(target_path+'/train_pos.txt')
train_neg = get_dict(target_path+'/train_neg.txt')

np.save(target_path+'/training_pos.npy',train_pos)
np.save(target_path+'/training_neg.npy',train_neg)

# Validation / test splits, one group per file-name prefix (presumably
# cold/warm user x cold/warm item -- confirm against where the .txt files are
# written).  For every group the four dicts are loaded first and then saved,
# and the module-level names keep the values of the last group processed.
for split_prefix in ('cuci', 'wuci', 'cuwi', 'wuwi'):
    split_base = target_path + '/' + split_prefix

    valid_pos = get_dict(split_base + '_val_pos.txt')
    valid_neg = get_dict(split_base + '_val_neg.txt')
    test_pos = get_dict(split_base + '_test_pos.txt')
    test_neg = get_dict(split_base + '_test_neg.txt')

    np.save(split_base + '_validation_pos.npy', valid_pos)
    np.save(split_base + '_validation_neg.npy', valid_neg)
    np.save(split_base + '_testing_pos.npy', test_pos)
    np.save(split_base + '_testing_neg.npy', test_neg)

print('Generating interaction dict.npy done.')