## 模型训练  


In [1]:
import pandas as pd
import os
import gc
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

### 导入数据

In [2]:
# Three disease feature tables, all read from the same training-set directory.
disease_feature1 = pd.read_csv("/home/mw/input/data8766/训练集/训练集/disease_feature1.csv")
disease_feature2 = pd.read_csv("/home/mw/input/data8766/训练集/训练集/disease_feature2.csv")
disease_feature3 = pd.read_csv("/home/mw/input/data8766/训练集/训练集/disease_feature3.csv")

# Answer table and food feature table from the same directory.
train_answer = pd.read_csv("/home/mw/input/data8766/训练集/训练集/train_answer.csv")
train_food = pd.read_csv("/home/mw/input/data8766/训练集/训练集/train_food.csv")
# A second answer table, read from a different input directory (data6690).
semi_train_answer = pd.read_csv("/home/mw/input/data6690/semi_train_answer.csv")
# Bare expression: displays the frame as the cell output.
semi_train_answer

Unnamed: 0,food_id,disease_id,related_score
0,food_0,disease_208,0
1,food_0,disease_1166,0
2,food_0,disease_1418,0
3,food_0,disease_76,0
4,food_0,disease_579,0
...,...,...,...
141631,food_97,disease_685,0
141632,food_97,disease_325,0
141633,food_97,disease_911,0
141634,food_97,disease_961,0


In [3]:
# `data` is another name for the same DataFrame object as semi_train_answer
# (no copy is made), so columns added to `data` also appear on it.
data = semi_train_answer
data.head()

Unnamed: 0,food_id,disease_id,related_score
0,food_0,disease_208,0
1,food_0,disease_1166,0
2,food_0,disease_1418,0
3,food_0,disease_76,0
4,food_0,disease_579,0


### 食物、疾病特征编码

In [4]:
# Numeric id = the integer after the first underscore, e.g. 'food_12' -> 12.
for id_col, num_col in (('food_id', 'food'), ('disease_id', 'disease')):
    data[num_col] = data[id_col].str.split('_').str[1].astype('int64')

### 定义峰度、偏度函数，方便后续调用进行计算

In [5]:
def skew(x):
    """Return the skewness of ``x`` by delegating to its ``skew()`` method."""
    skewness = x.skew()
    return skewness


def kurtosis(x):
    """Return the kurtosis of ``x`` by delegating to its ``kurtosis()`` method."""
    excess = x.kurtosis()
    return excess

### 按照疾病ID分组，计算每个疾病与所有食物的相关性的四个统计量（均值、标准差、偏度、峰度）

In [6]:
# Columns to group by when computing the statistics.
cat_list = ['disease']


def stat(df, df_merge, group_by, agg):
    """Aggregate ``df`` per group and left-join the results onto ``df_merge``.

    Args:
        df: DataFrame the statistics are computed from.
        df_merge: DataFrame that receives the new statistic columns.
        group_by: List of column names to group on; also the join keys.
        agg: Mapping ``{column: [method, ...]}`` as accepted by
            ``DataFrameGroupBy.agg``. A method is either a string such as
            ``'mean'`` or a callable.

    Returns:
        ``df_merge`` left-joined with one new column per (column, method)
        pair, named ``{group_by joined by '_'}_{column}_{method}``. Callables
        contribute their ``__name__`` to the column name.
    """
    group = df.groupby(group_by).agg(agg)

    # Flatten the (column, method) header into plain names, in the same order
    # in which the aggregation produced the columns.
    columns = []
    for on, methods in agg.items():
        for method in methods:
            label = getattr(method, '__name__', method) if callable(method) else method
            columns.append('{}_{}_{}'.format('_'.join(group_by), on, label))

    # Rename and merge once, after ALL names are collected. Doing this inside
    # the loop breaks as soon as ``agg`` has more than one key.
    group.columns = columns
    group.reset_index(inplace=True)
    df_merge = df_merge.merge(group, on=group_by, how='left')

    # Drop the intermediate frame and release its memory.
    del group
    gc.collect()
    return df_merge

# Keep only the rows that have a label, on a fresh 0..n-1 index.
labelled_rows = data[data['related_score'].notnull()].reset_index(drop=True)

# Attach the per-group mean / std / skew / kurtosis of related_score
# (grouped by the columns in cat_list) to every labelled row.
data = stat(
    labelled_rows,
    labelled_rows,
    cat_list,
    {'related_score': ['mean', 'std', skew, kurtosis]},
).reset_index(drop=True)

del labelled_rows

In [7]:
data

Unnamed: 0,food_id,disease_id,related_score,food,disease,disease_related_score_mean,disease_related_score_std,disease_related_score_skew,disease_related_score_kurtosis
0,food_0,disease_208,0,0,208,0.000000,0.000000,0.000000,0.000000
1,food_0,disease_1166,0,0,1166,0.000000,0.000000,0.000000,0.000000
2,food_0,disease_1418,0,0,1418,0.008621,0.160817,18.654758,348.000000
3,food_0,disease_76,0,0,76,0.103448,0.558600,5.820405,34.080231
4,food_0,disease_579,0,0,579,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...
141631,food_97,disease_685,0,97,685,0.034483,0.237580,8.637006,87.485497
141632,food_97,disease_325,0,97,325,0.347701,0.970647,2.565232,4.931675
141633,food_97,disease_911,0,97,911,0.000000,0.000000,0.000000,0.000000
141634,food_97,disease_961,0,97,961,0.540230,0.921152,1.527621,1.146513


In [8]:
# Save one row of the four related_score statistics per disease_id.
disease_stat_cols = [
    'disease_related_score_mean',
    'disease_related_score_std',
    'disease_related_score_skew',
    'disease_related_score_kurtosis',
]
drms = data[['disease_id'] + disease_stat_cols]
drms_unique = drms.drop_duplicates(subset=['disease_id'])
drms_unique.to_csv('best_model/drms_unique.csv', index=False)

### 对三组疾病特征进行PCA降维  
维度做过一些调整，最终选用群友Baseline给出的维度

In [9]:
from sklearn.decomposition import PCA,KernelPCA,TruncatedSVD,SparsePCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
SEED = 7


def pca_fea(data, feats, n_components=10, name='tsvd', load=False):
    """Reduce the ``feats`` columns of ``data`` to ``n_components`` PCA components.

    The selected columns are min-max scaled and then projected with PCA; both
    steps are fitted on ``data`` itself.

    Args:
        data: DataFrame with a ``disease_id`` column and the ``feats`` columns.
        feats: Names of the columns to reduce.
        n_components: Number of components to keep.
        name: Prefix of the output columns, which are named
            ``{name}_0`` ... ``{name}_{n_components - 1}``.
        load: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with ``disease_id`` as the first column followed by the
        component columns, one row per row of ``data`` in the same order,
        on a fresh 0..n-1 index.
    """
    # Alternatives left disabled by the author: TruncatedSVD, SparsePCA.
    reducer = Pipeline([
        ('std', MinMaxScaler()),
        ('pca', PCA(n_components=n_components, random_state=SEED)),
    ])
    reducer.fit(data[feats])
    deal_data = pd.DataFrame(
        reducer.transform(data[feats]),
        columns=[f'{name}_{i}' for i in range(n_components)],
    )
    # Insert the ids by position. Passing the Series itself would align on the
    # index and give NaN / misplaced ids when ``data`` does not carry a
    # default 0..n-1 index.
    deal_data.insert(0, 'disease_id', data['disease_id'].values)
    return deal_data

In [10]:
# Number of PCA components kept for each of the three disease feature tables.
n_disease_tsvd1, n_disease_tsvd2, n_disease_tsvd3 = 125, 220, 130

# Every column except the id is fed into the reduction.
feat1 = pca_fea(
    disease_feature1,
    [col for col in disease_feature1.columns if col != 'disease_id'],
    n_components=n_disease_tsvd1,
    name='disease1_pca',
)
feat2 = pca_fea(
    disease_feature2,
    [col for col in disease_feature2.columns if col != 'disease_id'],
    n_components=n_disease_tsvd2,
    name='disease2_pca',
)
feat3 = pca_fea(
    disease_feature3,
    [col for col in disease_feature3.columns if col != 'disease_id'],
    n_components=n_disease_tsvd3,
    name='disease3_pca',
)

In [11]:
# `food` is another name for the train_food DataFrame (no copy is made).
food = train_food
# Disabled code below: it computed the missing rate of every food column and
# collected the columns whose rate is <= drop_rate into `miss_fea`.
# drop_rate = 0.1
# per_mis = list(food.isnull().sum() / len(food))
# df_missing = pd.DataFrame({'列名': food.columns,'缺失率': per_mis})
# df_missing.sort_values('缺失率', inplace=True,ascending=False)
# miss_fea = list(df_missing[df_missing['缺失率']<=drop_rate]['列名'])
# print(len(miss_fea))
# miss_fea

### 加载聚类模型进行聚类

In [12]:
# Hard-coded food columns used as clustering input, plus 'food_id'
# (the id is dropped again before the data reaches the clustering model).
miss_fea = ['N_197','N_198','N_33','N_211','N_82','N_111','N_165', 'N_101','N_177','N_42','N_146','N_113','N_17','N_106','N_14','N_74','N_209','N_188','food_id']

In [13]:
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# Food clustering input: the selected columns without the id, missing values as -1.
df_food = food[miss_fea].drop(columns='food_id').fillna(-1)

# Disease clustering inputs: the three PCA tables without their id column.
df_dis_1, df_dis_2, df_dis_3 = (
    frame.drop(columns='disease_id') for frame in (feat1, feat2, feat3)
)

In [14]:
import joblib
# Standardise the food features with the mean / std loaded from disk.
mean = joblib.load('best_model/food_mean.pkl')
std = joblib.load('best_model/food_std.pkl')
df_food_st = (df_food - mean) / std

# Assign clusters with the LOADED mixture model. predict() uses the loaded
# parameters as they are; fit_predict() would re-estimate the mixture on this
# data and discard what was loaded, so cluster ids could differ from the ones
# the saved model produces elsewhere.
gmm_food = joblib.load('best_model/gmm_food_model.pkl')
food_clu = gmm_food.predict(df_food_st)
food['food_clu'] = food_clu

In [15]:
# Assign disease clusters with the mixture models loaded from disk.
# predict() is used instead of fit_predict() so that the loaded parameters are
# applied as they are rather than re-estimated (and discarded) on this data.
gmm_d1 = joblib.load('best_model/gmm_d1_model.pkl')
d1_clu = gmm_d1.predict(df_dis_1)
feat1['d1_clu'] = d1_clu

gmm_d2 = joblib.load('best_model/gmm_d2_model.pkl')
d2_clu = gmm_d2.predict(df_dis_2)
feat2['d2_clu'] = d2_clu

gmm_d3 = joblib.load('best_model/gmm_d3_model.pkl')
d3_clu = gmm_d3.predict(df_dis_3)
feat3['d3_clu'] = d3_clu

In [16]:
food['food_clu'].nunique()

10

### 数据进行拼接

In [17]:
# Left-join the food features on food_id, then each disease table on disease_id.
data = data.merge(food, on='food_id', how='left')
for disease_table in (feat1, feat2, feat3):
    data = data.merge(disease_table, on='disease_id', how='left')
data.head()

Unnamed: 0,food_id,disease_id,related_score,food,disease,disease_related_score_mean,disease_related_score_std,disease_related_score_skew,disease_related_score_kurtosis,N_0,...,disease3_pca_121,disease3_pca_122,disease3_pca_123,disease3_pca_124,disease3_pca_125,disease3_pca_126,disease3_pca_127,disease3_pca_128,disease3_pca_129,d3_clu
0,food_0,disease_208,0,0,208,0.0,0.0,0.0,0.0,,...,-0.11926,-0.04935,0.023139,-0.113043,-0.049646,-0.030825,0.059619,0.054251,-0.027106,31.0
1,food_0,disease_1166,0,0,1166,0.0,0.0,0.0,0.0,,...,0.001666,0.087646,-0.079354,-0.012906,0.065473,-0.069966,0.001386,0.022145,0.021254,46.0
2,food_0,disease_1418,0,0,1418,0.008621,0.160817,18.654758,348.0,,...,-0.079943,-0.059698,0.025662,-0.02636,0.05046,0.074278,0.0223,0.048073,-0.031466,10.0
3,food_0,disease_76,0,0,76,0.103448,0.5586,5.820405,34.080231,,...,-0.019461,-0.016764,-0.007335,0.029523,0.055458,-0.014474,-0.098375,0.005207,0.010734,10.0
4,food_0,disease_579,0,0,579,0.0,0.0,0.0,0.0,,...,0.04763,0.014324,0.0292,0.112854,0.014293,-0.073378,-0.107103,-0.054103,0.099642,14.0


In [18]:
# data.groupby('d2_clu')['related_score'].apply(lambda x: (x > 0).sum())

### 按照食物类别进行分组，计算每个食物类别与所有疾病的相关性的四个统计量（均值、方差、偏度、峰度）

In [19]:
# related_score statistics per food cluster. Only mean and var get a
# 'food_r_' prefix; the skew and kurtosis columns keep their bare names.
food_r = (
    data.groupby('food_clu')
    .agg({'related_score': ['mean', 'var', skew, kurtosis]})
    .droplevel(0, axis=1)
    .rename(columns={'mean': 'food_r_mean', 'var': 'food_r_var'})
    .reset_index()
)
food_r.head()

Unnamed: 0,food_clu,food_r_mean,food_r_var,skew,kurtosis
0,0,0.091203,0.202963,5.727235,35.200376
1,1,0.237439,0.561195,3.361275,10.718445
2,2,0.420259,0.815991,2.193714,3.943628
3,3,0.229156,0.5443,3.394388,10.9447
4,4,0.275184,0.600687,2.928936,7.889432


In [20]:
# Write the per-food-cluster statistics to disk, without the index column.
food_r.to_csv('best_model/food_r.csv', index=False)

In [21]:
# Attach the per-food-cluster statistics to every row via its food_clu value.
data = data.merge(food_r, on = 'food_clu', how = 'left')
data.head()

Unnamed: 0,food_id,disease_id,related_score,food,disease,disease_related_score_mean,disease_related_score_std,disease_related_score_skew,disease_related_score_kurtosis,N_0,...,disease3_pca_125,disease3_pca_126,disease3_pca_127,disease3_pca_128,disease3_pca_129,d3_clu,food_r_mean,food_r_var,skew,kurtosis
0,food_0,disease_208,0,0,208,0.0,0.0,0.0,0.0,,...,-0.049646,-0.030825,0.059619,0.054251,-0.027106,31.0,0.237439,0.561195,3.361275,10.718445
1,food_0,disease_1166,0,0,1166,0.0,0.0,0.0,0.0,,...,0.065473,-0.069966,0.001386,0.022145,0.021254,46.0,0.237439,0.561195,3.361275,10.718445
2,food_0,disease_1418,0,0,1418,0.008621,0.160817,18.654758,348.0,,...,0.05046,0.074278,0.0223,0.048073,-0.031466,10.0,0.237439,0.561195,3.361275,10.718445
3,food_0,disease_76,0,0,76,0.103448,0.5586,5.820405,34.080231,,...,0.055458,-0.014474,-0.098375,0.005207,0.010734,10.0,0.237439,0.561195,3.361275,10.718445
4,food_0,disease_579,0,0,579,0.0,0.0,0.0,0.0,,...,0.014293,-0.073378,-0.107103,-0.054103,0.099642,14.0,0.237439,0.561195,3.361275,10.718445


### 将id、标签、以及值唯一的特征名称放入列表

In [22]:
# Columns excluded from the model inputs: the two string ids and the label.
drop_cols = ['disease_id', 'food_id', 'related_score']

In [23]:
# Also exclude every column that has fewer than two distinct non-null values.
distinct_counts = data.nunique()
drop_cols.extend(distinct_counts.index[distinct_counts < 2])

In [24]:
drop_cols

['disease_id',
 'food_id',
 'related_score',
 'N_2',
 'N_8',
 'N_12',
 'N_15',
 'N_21',
 'N_23',
 'N_24',
 'N_25',
 'N_26',
 'N_27',
 'N_29',
 'N_31',
 'N_32',
 'N_34',
 'N_36',
 'N_38',
 'N_39',
 'N_41',
 'N_65',
 'N_66',
 'N_70',
 'N_83',
 'N_103',
 'N_107',
 'N_108',
 'N_110',
 'N_117',
 'N_130',
 'N_137',
 'N_143',
 'N_144',
 'N_150',
 'N_159',
 'N_174',
 'N_186',
 'N_189']

### 缺失值补零  
尝试过其他方式，比如不做处理，补充为特殊值等等，其中效果最好为补零

In [25]:
# test_df = data[data["related"].isnull() == True].copy().reset_index(drop=True)
# Training rows are the ones with a label; every remaining NaN becomes 0.
has_label = data["related_score"].notnull()
train_df = data[has_label].copy().reset_index(drop=True).fillna(0)
train_df.head()


Unnamed: 0,food_id,disease_id,related_score,food,disease,disease_related_score_mean,disease_related_score_std,disease_related_score_skew,disease_related_score_kurtosis,N_0,...,disease3_pca_125,disease3_pca_126,disease3_pca_127,disease3_pca_128,disease3_pca_129,d3_clu,food_r_mean,food_r_var,skew,kurtosis
0,food_0,disease_208,0,0,208,0.0,0.0,0.0,0.0,0.0,...,-0.049646,-0.030825,0.059619,0.054251,-0.027106,31.0,0.237439,0.561195,3.361275,10.718445
1,food_0,disease_1166,0,0,1166,0.0,0.0,0.0,0.0,0.0,...,0.065473,-0.069966,0.001386,0.022145,0.021254,46.0,0.237439,0.561195,3.361275,10.718445
2,food_0,disease_1418,0,0,1418,0.008621,0.160817,18.654758,348.0,0.0,...,0.05046,0.074278,0.0223,0.048073,-0.031466,10.0,0.237439,0.561195,3.361275,10.718445
3,food_0,disease_76,0,0,76,0.103448,0.5586,5.820405,34.080231,0.0,...,0.055458,-0.014474,-0.098375,0.005207,0.010734,10.0,0.237439,0.561195,3.361275,10.718445
4,food_0,disease_579,0,0,579,0.0,0.0,0.0,0.0,0.0,...,0.014293,-0.073378,-0.107103,-0.054103,0.099642,14.0,0.237439,0.561195,3.361275,10.718445


### 将标签列，相关性评级>0的转变为1，做二分类

In [26]:
# Binary target: 1 when the score is greater than 0, otherwise 0.
train_df['related_score'] = (train_df['related_score'] > 0).astype('int64')


### 选择lgbm模型中特征重要性最强的五个特征进行交叉  
由于过拟合原因，去掉了食物编码特征 food  
并且去掉了加减的交叉，仅使用乘除

In [27]:
topn = ['N_33', 'N_198', 'N_209','N_74','N_165']
# For every unordered pair of the listed columns add a product and a ratio.
# Sum and difference crosses are intentionally not generated.
for pos, left in enumerate(topn):
    for right in topn[pos + 1:]:
        train_df[f'{left}*{right}'] = train_df[left] * train_df[right]
        # The small constant keeps the denominator away from an exact zero.
        train_df[f'{left}/{right}'] = train_df[left] / (train_df[right]+1e-5)

In [28]:
# Model inputs are all columns that are not listed in drop_cols.
excluded = set(drop_cols)
feature_name = [col for col in train_df.columns if col not in excluded]

X_train = train_df[feature_name].reset_index(drop=True)
# X_test = test_df[feature_name].reset_index(drop=True)
# Binary label, on the same 0..n-1 index as X_train.
y = train_df['related_score'].reset_index(drop=True)


In [29]:
print(len(feature_name))
print(feature_name)

685
['food', 'disease', 'disease_related_score_mean', 'disease_related_score_std', 'disease_related_score_skew', 'disease_related_score_kurtosis', 'N_0', 'N_1', 'N_3', 'N_4', 'N_5', 'N_6', 'N_7', 'N_9', 'N_10', 'N_11', 'N_13', 'N_14', 'N_16', 'N_17', 'N_18', 'N_19', 'N_20', 'N_22', 'N_28', 'N_30', 'N_33', 'N_35', 'N_37', 'N_40', 'N_42', 'N_43', 'N_44', 'N_45', 'N_46', 'N_47', 'N_48', 'N_49', 'N_50', 'N_51', 'N_52', 'N_53', 'N_54', 'N_55', 'N_56', 'N_57', 'N_58', 'N_59', 'N_60', 'N_61', 'N_62', 'N_63', 'N_64', 'N_67', 'N_68', 'N_69', 'N_71', 'N_72', 'N_73', 'N_74', 'N_75', 'N_76', 'N_77', 'N_78', 'N_79', 'N_80', 'N_81', 'N_82', 'N_84', 'N_85', 'N_86', 'N_87', 'N_88', 'N_89', 'N_90', 'N_91', 'N_92', 'N_93', 'N_94', 'N_95', 'N_96', 'N_97', 'N_98', 'N_99', 'N_100', 'N_101', 'N_102', 'N_104', 'N_105', 'N_106', 'N_109', 'N_111', 'N_112', 'N_113', 'N_114', 'N_115', 'N_116', 'N_118', 'N_119', 'N_120', 'N_121', 'N_122', 'N_123', 'N_124', 'N_125', 'N_126', 'N_127', 'N_128', 'N_129', 'N_131', 'N_

### 模型训练

In [30]:
# Stratified K-fold training of LightGBM binary classifiers. Collects
# out-of-fold predictions in `oof`, saves one model file per fold and
# accumulates feature importances.
train_pred = {}
# test_pred = {}
# One entry per repetition; seeds the fold split only (the LightGBM seeds are
# fixed inside `parameters`).
seeds = [222222222]
num_model_seed = 1
# Out-of-fold predictions, averaged over the repetitions.
oof = np.zeros(X_train.shape[0])
# prediction = np.zeros(X_test.shape[0])
# Feature importance summed over all trained models.
feat_imp_df = pd.DataFrame({'feats': feature_name, 'imp': 0})
parameters = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 64,
    'lambda_l1': 0.5,
    'lambda_l2': 0.5,
    'feature_fraction': 0.7,
    'bagging_fraction': 0.7,
    'bagging_freq': 5,
    'seed': 222222,
    'bagging_seed': 222222,
    'feature_fraction_seed': 222222,
    'min_data_in_leaf': 20,
    'verbose': -1, 
    'n_jobs':8
}
# Every trained booster is kept here, in training order.
models_list = []
fold = 5
for model_seed in range(num_model_seed):
    print(seeds[model_seed],"--------------------------------------------------------------------------------------------")
    # Out-of-fold predictions of this repetition only.
    oof_cat = np.zeros(X_train.shape[0])
    # prediction_cat = np.zeros(X_test.shape[0])
    skf = StratifiedKFold(n_splits=fold, random_state=seeds[model_seed], shuffle=True)
    for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
        train_x, test_x, train_y, test_y = X_train[feature_name].iloc[train_index], X_train[feature_name].iloc[test_index], y.iloc[train_index], y.iloc[test_index]
        dtrain = lgb.Dataset(train_x, label=train_y)
        dval = lgb.Dataset(test_x, label=test_y)
        # The held-out fold is the only validation set, so it drives early stopping.
        lgb_model = lgb.train(
            parameters,
            dtrain,
            num_boost_round=15000,
            valid_sets=[dval],
            early_stopping_rounds=100,
            verbose_eval=100 )
        models_list.append(lgb_model)
        # The file name depends on the fold index only, so a second repetition
        # would overwrite the files of the first.
        # NOTE(review): save_model presumably writes LightGBM's own model
        # format rather than a pickle, despite the .pkl suffix - confirm.
        filename = f'best_model/model_fold_{index}.pkl'
        lgb_model.save_model(filename)
        # Score the held-out rows with the best iteration found.
        oof_cat[test_index] += lgb_model.predict(test_x,num_iteration=lgb_model.best_iteration)
        # prediction_cat += lgb_model.predict(X_test,num_iteration=lgb_model.best_iteration) / fold
        feat_imp_df['imp'] += lgb_model.feature_importance()

        # Remove the per-fold names; the booster itself stays referenced by models_list.
        del train_x
        del test_x
        del train_y
        del test_y
        del lgb_model
    # Average the out-of-fold predictions over the repetitions.
    oof += oof_cat / num_model_seed
    # prediction += prediction_cat / num_model_seed
gc.collect()

222222222 --------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.954224
[200]	valid_0's auc: 0.961501
[300]	valid_0's auc: 0.965942
[400]	valid_0's auc: 0.969029
[500]	valid_0's auc: 0.971326
[600]	valid_0's auc: 0.972875
[700]	valid_0's auc: 0.973914
[800]	valid_0's auc: 0.974856
[900]	valid_0's auc: 0.975606
[1000]	valid_0's auc: 0.976259
[1100]	valid_0's auc: 0.976853
[1200]	valid_0's auc: 0.977413
[1300]	valid_0's auc: 0.977899
[1400]	valid_0's auc: 0.978325
[1500]	valid_0's auc: 0.978703
[1600]	valid_0's auc: 0.979024
[1700]	valid_0's auc: 0.979377
[1800]	valid_0's auc: 0.979624
[1900]	valid_0's auc: 0.979849
[2000]	valid_0's auc: 0.98012
[2100]	valid_0's auc: 0.980398
[2200]	valid_0's auc: 0.980609
[2300]	valid_0's auc: 0.980729
[2400]	valid_0's auc: 0.980897
[2500]	valid_0's auc: 0.981038
[2600]	valid_0's auc: 0.981187
[2700]	valid_0's auc: 0.981242
[2800]	v

702

In [31]:
# Take the 'related' column of train_answer.
# NOTE(review): y_r is reassigned from `y` in the next cell, so this value
# looks unused - confirm before removing.
y_r = train_answer['related']

In [32]:
# Ground truth as a plain list of 0/1, then the share of positives.
y_r = (y >= 1).astype('int64').tolist()
sum(y_r)/len(y_r)

0.09947329774915982

In [33]:
len(y_r)

141636

In [34]:
import sklearn 
# Hard 0/1 predictions from the out-of-fold probabilities at a 0.5 cut-off.
y_pred = (oof >= 0.5).astype(int)
# AUC is computed on the probabilities, F1 on the hard predictions.
auc_value = sklearn.metrics.roc_auc_score(y_r, oof)
f1_vlaue = sklearn.metrics.f1_score(y_r, y_pred)


# The reported total is the plain average of AUC and F1.
print('total_score:', (auc_value+f1_vlaue)/2)
print("auc_score:", auc_value)
print("f1_score:", f1_vlaue)

total_score: 0.9001269980980433
auc_score: 0.9817119688831416
f1_score: 0.8185420273129449



total_score: 0.9001269980980433  
auc_score: 0.9817119688831416  
f1_score: 0.8185420273129449