# 训练全量数据高斯聚类  


In [1]:
import pandas as pd
import os
import gc
import lightgbm as lgb
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
import joblib

# Silence every warning category for the rest of the session.
# NOTE(review): this presumably also hides convergence warnings raised while
# fitting the mixture models further down — confirm that is intended.
warnings.filterwarnings('ignore')

### 导入数据，食物特征数据为训练数据+初赛AB榜单提供的测试数据

In [2]:
# Three disease feature tables shipped with the training set.
disease_feature1 = pd.read_csv("/home/mw/input/data8766/训练集/训练集/disease_feature1.csv")
disease_feature2 = pd.read_csv("/home/mw/input/data8766/训练集/训练集/disease_feature2.csv")
disease_feature3 = pd.read_csv("/home/mw/input/data8766/训练集/训练集/disease_feature3.csv")

# Training answers and the food feature table of the training set.
train_answer = pd.read_csv("/home/mw/input/data8766/训练集/训练集/train_answer.csv")
train_food = pd.read_csv("/home/mw/input/data8766/训练集/训练集/train_food.csv")

# Food feature tables released with the preliminary-round A and B test sets;
# they are stacked onto train_food later so clustering sees every known food.
preliminary_a_food = pd.read_csv("/home/mw/input/data9986/preliminary_a_food.csv")
preliminary_b_food = pd.read_csv("/home/mw/input/data9986/preliminary_b_food.csv")


In [3]:
# Stack the training foods and both preliminary-round food tables row-wise in
# a single pass (one concat instead of two avoids copying the training table
# twice) and renumber the rows 0..n-1.
food = pd.concat(
    [train_food, preliminary_a_food, preliminary_b_food],
    axis=0,
    ignore_index=True,
)

food.head()

Unnamed: 0,food_id,N_0,N_1,N_2,N_3,N_4,N_5,N_6,N_7,N_8,...,N_202,N_203,N_204,N_205,N_206,N_207,N_208,N_209,N_210,N_211
0,food_0,,,,,0.0,,,,,...,,,0.02,0.0,,,30.5,92.82,,0.92
1,food_1,,,,,0.0,,,,,...,,,23.9,0.0,,,0.0,2.41,,3.31
2,food_4,,,,,0.0,,,,,...,,,0.12,0.0,,,3.5,15.46,,0.36
3,food_5,,,,0.068,0.0,0.045,0.75,0.314,,...,,,0.89,0.0,,,3.3,86.35,,0.2
4,food_6,,,,0.115,0.0,0.091,0.58,0.508,,...,,,1.13,0.0,0.0,,41.6,93.22,,0.54


In [4]:
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler,MinMaxScaler

### 获取缺失率不超过10%的食物特征的名称

In [5]:
drop_rate = 0.1  # largest tolerated fraction of missing values per column

# Fraction of null cells in every column, in the column order of `food`.
null_counts = food.isnull().sum()
per_mis = list(null_counts / len(food))

# One row per column, ordered from the most to the least incomplete.
df_missing = pd.DataFrame({'列名': food.columns, '缺失率': per_mis})
df_missing = df_missing.sort_values('缺失率', ascending=False)

# Columns that pass the threshold. The selection is over all columns, so the
# identifier column `food_id` is part of the list when it is complete enough.
keep_mask = df_missing['缺失率'] <= drop_rate
miss_fea = list(df_missing.loc[keep_mask, '列名'])
print(len(miss_fea))
miss_fea

19


['N_197',
 'N_198',
 'N_33',
 'N_211',
 'N_82',
 'N_111',
 'N_165',
 'N_101',
 'N_177',
 'N_42',
 'N_146',
 'N_113',
 'N_17',
 'N_106',
 'N_14',
 'N_74',
 'N_209',
 'N_188',
 'food_id']

In [6]:
# miss_fea = ['N_197','N_198','N_33','N_211','N_82','N_111','N_165', 'N_101','N_177','N_42','N_146','N_113','N_17','N_106','N_14','N_74','N_209','N_188','food_id']


### 上述18个食物特征的缺失值填补为-1

In [7]:
# Keep the low-missing columns, discard the identifier, and mark every
# remaining gap with the sentinel value -1.
df_food = food[miss_fea].drop(columns='food_id').fillna(-1)

### 定义用轮廓系数确定最佳聚类效果的函数

In [8]:
def k_s(df,a,b):
    """Pick the Gaussian-mixture component count with the best silhouette score.

    For every ``n`` in ``range(a, b)`` a ``GaussianMixture`` (full covariance,
    ``random_state=7``) is fitted on ``df`` and its hard cluster labels are
    scored with the Euclidean silhouette coefficient.

    Args:
        df: Samples to cluster, in any form accepted by both
            ``GaussianMixture.fit_predict`` and ``silhouette_score``.
        a: First component count to try (inclusive).
        b: End of the search range (exclusive).

    Returns:
        A ``(best_n, best_score)`` tuple. On ties the smallest ``n`` wins.

    Raises:
        ValueError: If ``range(a, b)`` is empty.

    Side effects:
        Prints the list of tried component counts and the list of scores.
    """
    K_list = list(range(a, b))
    if not K_list:
        raise ValueError(
            'k_s needs a non-empty search range, got a={!r}, b={!r}'.format(a, b)
        )

    score_list = []
    for n in K_list:
        gmm = GaussianMixture(n_components=n, covariance_type='full',random_state= 7)
        # fit_predict both fits the model and returns the labels, so a
        # separate fit() beforehand would only repeat the same work.
        labels = gmm.fit_predict(df)
        # The silhouette score is a single number for the whole labelling;
        # it is computed once per candidate n.
        score_list.append(silhouette_score(df, labels, metric='euclidean'))

    best_score = max(score_list)
    print(K_list)
    print(score_list)
    return K_list[score_list.index(best_score)], best_score

### 搜索食物的最佳聚类数目，并保存标准化的均值和标准差


In [9]:
from sklearn.preprocessing import StandardScaler,MinMaxScaler

# Standardize the food features and persist the fitted per-column mean and
# scale so the same transformation can be reapplied to new foods later.
scaler = StandardScaler()
df_food_st = scaler.fit_transform(df_food)
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before the first artifact is written.
os.makedirs('best_model', exist_ok=True)
joblib.dump(scaler.mean_, 'best_model/food_mean.pkl')
joblib.dump(scaler.scale_, 'best_model/food_std.pkl')
# Search 2..14 mixture components for the best silhouette score.
k4,s4 = k_s(df_food_st,2,15)
print('K值为：{}'.format(k4),'最大轮廓系数为：{}'.format(s4))

[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
[0.23312555683028172, 0.2362965416429136, 0.1821664482670246, 0.21215502275883966, 0.2193289657499333, 0.23186873967378924, 0.23272314966824476, 0.2094420546701629, 0.2554301842025586, 0.25146161824625685, 0.23831526985090087, 0.24549180246118912, 0.24447660166876167]
K值为：10 最大轮廓系数为：0.2554301842025586


### 高斯聚类并保存模型

In [10]:
# Fit the final food mixture model once with the selected component count,
# persist it, and label the foods with that same fitted model. Using
# predict() instead of a second fit_predict() avoids refitting after the
# model has been saved, so the labels always come from the saved parameters.
gmm = GaussianMixture(n_components=k4, covariance_type='full',random_state= 7)
gmm.fit(df_food_st)
joblib.dump(gmm, 'best_model/gmm_food_model.pkl')
labels = gmm.predict(df_food_st)


In [11]:
# Store the per-food cluster index both as a new column of the food table and
# under the standalone name `food_clu`.
food['food_clu'] = food_clu = labels

In [12]:
# Display the distinct cluster labels that were assigned to foods.
food['food_clu'].unique()

array([3, 2, 4, 1, 5, 0, 6, 8, 7, 9])

### 对三种疾病特征进行PCA降维

In [13]:
from sklearn.decomposition import PCA,KernelPCA,TruncatedSVD,SparsePCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,MinMaxScaler
# Seed handed to the PCA step inside pca_fea as its random_state.
SEED = 7
def pca_fea(data, feats, n_components=10, name='tsvd', load=False):
    """Min-max scale the selected columns and project them with PCA.

    Args:
        data: DataFrame holding a ``disease_id`` column plus the columns
            listed in ``feats``.
        feats: Names of the columns fed into the scaler and the PCA.
        n_components: Number of principal components to keep.
        name: Prefix of the output columns, which are named
            ``{name}_0`` ... ``{name}_{n_components - 1}``.
        load: Unused; kept so existing callers that pass it keep working.

    Returns:
        A new DataFrame with a default integer index whose first column is
        ``disease_id`` (same row order as ``data``), followed by the
        ``n_components`` projected columns.
    """
    # TruncatedSVD and SparsePCA were previously tried in place of the PCA step.
    tsvd = Pipeline([
        ('std', MinMaxScaler()),
        ('pca', PCA(n_components=n_components, random_state=SEED))
    ])
    tsvd.fit(data[feats])
    deal_data = pd.DataFrame(tsvd.transform(data[feats]), columns=[f'{name}_{i}' for i in range(n_components)])
    # Insert the ids positionally. Passing the Series itself would align on
    # index labels and produce NaN ids whenever `data` does not carry the
    # default 0..n-1 index (for example after filtering rows).
    deal_data.insert(0, 'disease_id', data['disease_id'].values)
    return deal_data

In [14]:
# Number of principal components kept for each of the three disease tables.
n_disease_tsvd1 = 125
n_disease_tsvd2 = 220
n_disease_tsvd3 = 130

# Project every column except the identifier of each disease table.
feat1 = pca_fea(
    disease_feature1,
    [col for col in disease_feature1.columns if col != 'disease_id'],
    n_components=n_disease_tsvd1,
    name='disease1_pca',
)

feat2 = pca_fea(
    disease_feature2,
    [col for col in disease_feature2.columns if col != 'disease_id'],
    n_components=n_disease_tsvd2,
    name='disease2_pca',
)

feat3 = pca_fea(
    disease_feature3,
    [col for col in disease_feature3.columns if col != 'disease_id'],
    n_components=n_disease_tsvd3,
    name='disease3_pca',
)

In [15]:
# Strip the identifier column so only the PCA components go into clustering.
df_dis_1, df_dis_2, df_dis_3 = (
    projected.drop(columns='disease_id')
    for projected in (feat1, feat2, feat3)
)

### 搜索三个疾病特征集的最佳K值

In [16]:
# Disease feature set 1: compare 2 and 3 mixture components.
print('搜索疾病特征第一类的K值')
k1, s1 = k_s(df_dis_1, 2, 4)
print(f'K值为：{k1}', f'最大轮廓系数为：{s1}')

# Disease feature set 2: compare 2 and 3 mixture components.
print('搜索疾病特征第二类的K值')
k2, s2 = k_s(df_dis_2, 2, 4)
print(f'K值为：{k2}', f'最大轮廓系数为：{s2}')

# Disease feature set 3: compare 52 through 59 mixture components.
print('搜索疾病特征第三类的K值')
k3, s3 = k_s(df_dis_3, 52, 60)
print(f'K值为：{k3}', f'最大轮廓系数为：{s3}')

搜索疾病特征第一类的K值
[2, 3]
[0.6728047237088757, 0.6111912271642198]
K值为：2 最大轮廓系数为：0.6728047237088757
搜索疾病特征第二类的K值
[2, 3]
[0.670575887290354, 0.6495452074998634]
K值为：2 最大轮廓系数为：0.670575887290354
搜索疾病特征第三类的K值
[52, 53, 54, 55, 56, 57, 58, 59]
[0.24875909148809552, 0.2467493101371044, 0.24959745942587758, 0.2668225302211355, 0.2665954374842014, 0.266778375202422, 0.26371941615295186, 0.26575457757033455]
K值为：55 最大轮廓系数为：0.2668225302211355


### 聚类并保存模型

In [17]:
# Fit the final mixture for disease feature set 1 with the selected component
# count, then persist the fitted model.
gmm_d1 = GaussianMixture(
    n_components=k1,
    covariance_type='full',
    random_state=7,
).fit(df_dis_1)
joblib.dump(gmm_d1, 'best_model/gmm_d1_model.pkl')

['best_model/gmm_d1_model.pkl']

In [18]:
# Fit the final mixture for disease feature set 2 with the selected component
# count, then persist the fitted model.
gmm_d2 = GaussianMixture(
    n_components=k2,
    covariance_type='full',
    random_state=7,
).fit(df_dis_2)
joblib.dump(gmm_d2, 'best_model/gmm_d2_model.pkl')

['best_model/gmm_d2_model.pkl']

In [19]:
# Fit the final mixture for disease feature set 3 with the selected component
# count, then persist the fitted model.
gmm_d3 = GaussianMixture(
    n_components=k3,
    covariance_type='full',
    random_state=7,
).fit(df_dis_3)
joblib.dump(gmm_d3, 'best_model/gmm_d3_model.pkl')

['best_model/gmm_d3_model.pkl']