In [21]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance
from eli5 import explain_prediction
import shap

In [22]:
#数据预处理
def dataprocessing(data):
    dummies_Sex = pd.get_dummies(data['Sex'], prefix='Sex')
    dummies_Sex = pd.DataFrame(dummies_Sex)
    
    ECOG = {'ECOG 0': 0, 'ECOG 1': 1, 'ECOG 2': 2, 'ECOG 3': 3, 'ECOG 4': 4}
    data["ECOG_Performance"] = data["ECOG_Performance"].map(ECOG)
    
    smoker = {'Non-smoker': 0, 'Ex-smoker': 1, 'Current': 2}
    data["Smoking"] = data["Smoking"].map(smoker)
    
    drinker = {'non-drinker': 0, 'light': 1, 'Moderate': 2, 'ex-drinker': 3, 'heavy': 4, 'unknown': 5}
    data["Drinking"] = data["Drinking"].map(drinker)
    
    sub = {'post wall': 1, 'Tonsillar Fossa': 2, 'Base of Tongue': 3, 'Tonsil ': 4,
           'Tonsil Pillar ': 5, 'Soft Palate ': 6, 'Vallecula': 7, 'lat wall': 0}
    data["Subsite"] = data["Subsite"].map(sub)
    
    T = {'T1': 1, 'T2': 2, 'T3': 3, 'T3 (2) ': 4, 'T4a': 5, 'T4b': 0}
    data["T"] = data["T"].map(T)
    
    N = {'N0': 0, 'N1': 1, 'N2a': 2, 'N2b': 3, 'N2c': 4, 'N3': 5}
    data["N"] = data["N"].map(N)
    
    M = {'M0': 0}
    data["M"] = data["M"].map(M)
    
    stage = {'I': 0, 'II': 1, 'III': 2, 'IVA': 3, 'IVB': 4}
    data["Stage"] = data["Stage"].map(stage)
    
    Hpv = {'  Negative': 0, '  positive': 1}
    data["HPVp16status"] = data["HPVp16status"].map(Hpv)
    
    Che = {'none': 0, 'Yes': 1}
    data["Chemotherapy"] = data["Chemotherapy"].map(Che)
    
    RT = {'IMRT': 0, 'IMRT-ipsilat': 1}
    data["RT_Tech"] = data["RT_Tech"].map(RT)
    
    Status = {'Dead': 1, 'Alive': 0}
    data["Status"] = data["Status"].map(Status)
    
    data.drop(['PatientID', 'Sex','Ds_Site', 'Path', 'Primary_Treatment', 'Cause_of_Death',
               'Local_Failure', 'local_failure(days)', 'Regional_Failure', 'regional_failure(days)',
               'Distant_Failure', 'distant_failure (days)', 'Second_Primary', '2nd_cancer_(days)',
               'PMID', 'diagnostics_Versions_PyRadiomics', 'diagnostics_Versions_Numpy',
               'diagnostics_Versions_SimpleITK', 'diagnostics_Versions_PyWavelet', 'diagnostics_Versions_Python',
               'diagnostics_Configuration_Settings', 'diagnostics_Configuration_EnabledImageTypes',
               'diagnostics_Image-original_Hash', 'diagnostics_Image-original_Dimensionality',
               'diagnostics_Image-original_Spacing', 'diagnostics_Image-original_Size',
               'diagnostics_Image-original_Mean',
               'diagnostics_Image-original_Minimum', 'diagnostics_Image-original_Maximum',
               'diagnostics_Mask-original_Hash', 'diagnostics_Mask-original_Spacing',
               'diagnostics_Mask-original_Size',
               'diagnostics_Mask-original_BoundingBox', 'diagnostics_Mask-original_VoxelNum',
               'diagnostics_Mask-original_VolumeNum', 'diagnostics_Mask-original_CenterOfMassIndex',
               'diagnostics_Mask-original_CenterOfMass'

               ], axis=1, inplace=True)
    
    data_df=pd.concat([data,dummies_Sex],axis=1)
    return data_df

In [23]:
#标准化
def standerscaler(df):
    X = df.drop(['FU', 'Status'], axis=1)
    y = df.pop('Status').values
    z = df.pop('FU').values

    # print(X_status)
    # print(y_status)
    colNames = X.columns
    X = X.astype(np.float64)
    y=y.astype(np.float64)
    z=z.astype(np.float64)

    X = StandardScaler().fit_transform(X)
    X = pd.DataFrame(X)
    X.columns = colNames
    return X, y, z

In [24]:
train_data = 'train_data.csv'
data_train = pd.read_csv(train_data)
#print(data_train)
dataprocessing(data_train)
# print("------------------------------------------------")
df = data_train.fillna(0)
print("----------------------------------------")
X_status, y_status, z_status = standerscaler(df)
X_fu = X_status
z_fu = z_status
X_train_lifetime, X_test_lifetime, y_train_lifetime, y_test_lifetime = train_test_split(X_status, z_status, test_size=0.3)

----------------------------------------


Columns (27) have mixed types.Specify dtype option on import or set low_memory=False.


In [25]:
#训练XGboost
model_xgb = xgb.XGBRegressor(max_depth=4, learning_rate=0.05, n_estimators=150)
model_xgb.fit(X_train_lifetime, y_train_lifetime)
print(model_xgb.predict(X_test_lifetime))

[2217.0686  2269.0115  2218.5657   817.3191  1937.7062  2699.697
 2593.8884  3139.3926  2599.6743  1167.7458  1069.9774  1583.7584
 2709.2805  2853.7375  2158.0789  1761.2373  2811.3232  2237.283
 1053.7239  2499.7664  1663.391    526.71204 2825.4268  1718.6898
 2492.7686  1902.2019  2342.134    744.08075 1708.5488  2729.2703
 1775.1615  1122.2705  1781.4652   905.49    1773.7832  2295.0176
 2366.7727  1349.8007  1584.3988  2429.127   1689.5358  1348.2145
 2561.652   2120.5208  1245.8477  2240.2117  1299.9618  1826.0444
 2372.381   1957.8806  2362.117   2241.7854  2286.6367  1866.3285
 1290.9694  1970.2616  2671.3699  2231.6409  2349.0361  1926.1965
 1597.6115  1814.8529  1531.5183  1338.407   2398.5122  1886.2268
 1735.0256  1863.1796  2107.4524  1500.5007  2785.986   2302.0322
 2205.08    2248.8708  2556.1267  2212.5217  1831.5461  2426.5088
 1492.066   1791.3877  1448.5591  1235.7523  1647.877   1480.996
 2322.015   1899.1892  2045.1909  2277.929   1286.3276  1609.0219
 2081.975   2

In [None]:
#置换特征重要性
perm = PermutationImportance(model_xgb, random_state=1).fit(X_test_lifetime, y_test_lifetime)
eli5.show_weights(perm, feature_names=X_test_lifetime.columns.tolist(),top=1600)