In [1]:
import pandas as pd
from numpy import loadtxt
import xgboost as xgb
from matplotlib import pyplot

In [2]:
# load data
fn = "LFS_clean.tsv"
df = pd.read_csv(fn,sep="\t")
df.head()

Unnamed: 0,Family_code,Country,Population,Region,Development,Class,Generations_analyzed,WT_nucleotide,Mutant_nucleotide,WT_codon,...,Generation,Sex,Germline_carrier,Mode_of_inheritance,Dead,Age,Age_at_diagnosis,Ref_ID,Short_topo_combined,Short_topo_combined_count
0,Bard94,USA,Northern America,Americas,More developed regions,LFL,2.0,A,G,ATG,...,3.0,F,Confirmed,M,False,,6.0,1,HEMATOP. SYSTEM,1
1,Bard94,USA,Northern America,Americas,More developed regions,LFL,2.0,A,G,ATG,...,2.0,F,Confirmed,,True,35.0,30.0,1,CERVIX UTERI,1
2,Bard94,USA,Northern America,Americas,More developed regions,LFL,2.0,A,G,ATG,...,3.0,F,,,True,,1.0,1,ADRENAL GLAND,1
3,Bir16,UK,Northern Europe,Europe,More developed regions,LFS,2.0,A,G,TAT,...,1.0,F,Confirmed,,True,27.0,27.0,2,BREAST,1
4,Bir16,UK,Northern Europe,Europe,More developed regions,LFS,2.0,A,G,TAT,...,2.0,F,Confirmed,M,True,,1.0,2,ADRENAL GLAND,1


In [3]:
df.describe()

Unnamed: 0,Generations_analyzed,REVEL,BayesDel,COSMIClink,CLINVARlink,cBioportalCount,Generation,Age,Age_at_diagnosis,Ref_ID,Short_topo_combined_count
count,1942.0,1542.0,1542.0,1946.0,1702.0,2192.0,1785.0,623.0,2192.0,2192.0,2192.0
mean,2.942327,0.843647,0.425752,73795.67,131490.653937,81.620894,2.173669,33.995185,29.247719,174.964872,1.019161
std,1.141133,0.11878,0.172896,504724.1,141030.220743,131.813933,0.95003,19.546259,20.102686,110.555628,0.137121
min,1.0,0.272,-0.2018,6482.0,12347.0,0.0,1.0,1.0,0.0,1.0,1.0
25%,2.0,0.754,0.2624,10663.0,12374.0,3.0,1.0,18.0,12.0,86.0,1.0
50%,3.0,0.896,0.5174,11089.0,127816.0,12.0,2.0,33.0,29.0,153.0,1.0
75%,4.0,0.927,0.5536,43882.0,182970.0,140.0,3.0,47.0,42.0,265.0,1.0
max,6.0,0.981,0.6071,6904034.0,528263.0,493.0,5.0,92.0,91.0,384.0,2.0


In [4]:
# Number of each type of column
df.dtypes.value_counts()

object     32
float64     8
int64       3
bool        1
dtype: int64

In [5]:
# Number of unique classes in each object column
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

Family_code              896
Country                   44
Population                16
Region                     6
Development                2
Class                      6
WT_nucleotide              7
Mutant_nucleotide          4
WT_codon                  54
Mutant_codon              57
CpG_site                   2
Splice_site                5
Context_coding_3          17
WT_AA                     21
Mutant_AA                 21
Effect                     8
AGVGDClass                 7
SIFTClass                  2
Polyphen2                  3
TransactivationClass       4
DNEclass                   3
ProtDescription          244
Hotspot                    2
Domain_function            9
Residue_function          15
Individual_code         2192
FamilyCase                50
FamilyCase_group           6
Sex                        2
Germline_carrier           4
Mode_of_inheritance        4
Short_topo_combined       87
dtype: int64

In [6]:
# Remove individul code
df = df.drop(['Individual_code'],axis=1)
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

Family_code             896
Country                  44
Population               16
Region                    6
Development               2
Class                     6
WT_nucleotide             7
Mutant_nucleotide         4
WT_codon                 54
Mutant_codon             57
CpG_site                  2
Splice_site               5
Context_coding_3         17
WT_AA                    21
Mutant_AA                21
Effect                    8
AGVGDClass                7
SIFTClass                 2
Polyphen2                 3
TransactivationClass      4
DNEclass                  3
ProtDescription         244
Hotspot                   2
Domain_function           9
Residue_function         15
FamilyCase               50
FamilyCase_group          6
Sex                       2
Germline_carrier          4
Mode_of_inheritance       4
Short_topo_combined      87
dtype: int64

In [7]:
# Find correlations with the target and sort
correlations = df.corr()['Age_at_diagnosis'].sort_values()

# Display correlations
print('Most Positive Correlations:\n', correlations.tail(15))
print('\nMost Negative Correlations:\n', correlations.head(15))


Most Positive Correlations:
 Generation                  -0.311825
REVEL                       -0.100563
cBioportalCount             -0.093932
BayesDel                    -0.086488
Short_topo_combined_count   -0.059012
Ref_ID                       0.003699
COSMIClink                   0.055529
Dead                         0.057207
CLINVARlink                  0.060734
Generations_analyzed         0.226798
Age                          0.957766
Age_at_diagnosis             1.000000
Name: Age_at_diagnosis, dtype: float64

Most Negative Correlations:
 Generation                  -0.311825
REVEL                       -0.100563
cBioportalCount             -0.093932
BayesDel                    -0.086488
Short_topo_combined_count   -0.059012
Ref_ID                       0.003699
COSMIClink                   0.055529
Dead                         0.057207
CLINVARlink                  0.060734
Generations_analyzed         0.226798
Age                          0.957766
Age_at_diagnosis            

In [8]:
# caluclate the % of missing value
df.isnull().sum()/ len(df)

Family_code                  0.000000
Country                      0.017792
Population                   0.017792
Region                       0.017792
Development                  0.017792
Class                        0.095803
Generations_analyzed         0.114051
WT_nucleotide                0.000000
Mutant_nucleotide            0.135493
WT_codon                     0.098084
Mutant_codon                 0.186131
CpG_site                     0.000000
Splice_site                  0.000000
Context_coding_3             0.000000
WT_AA                        0.098084
Mutant_AA                    0.186131
Effect                       0.002281
AGVGDClass                   0.294708
SIFTClass                    0.270073
Polyphen2                    0.296533
REVEL                        0.296533
BayesDel                     0.296533
TransactivationClass         0.296533
DNEclass                     0.391423
ProtDescription              0.000000
COSMIClink                   0.112226
CLINVARlink 

In [9]:
# one-hot encoding of categorical variables
df_onehot = pd.get_dummies(df)
print(df.shape, df_onehot.shape)

(2192, 43) (2192, 1620)


In [12]:
# imputer for handling missing values
from sklearn.preprocessing import Imputer
fill_NaN = Imputer(strategy = 'median', axis=1)
imputed_df = pd.DataFrame(fill_NaN.fit_transform(df_onehot))
imputed_df.columns = df_onehot.columns
imputed_df.index = df_onehot.index


Generations_analyzed                             0.0
REVEL                                            0.0
BayesDel                                         0.0
COSMIClink                                       0.0
CLINVARlink                                      0.0
cBioportalCount                                  0.0
Generation                                       0.0
Dead                                             0.0
Age                                              0.0
Age_at_diagnosis                                 0.0
Ref_ID                                           0.0
Short_topo_combined_count                        0.0
Family_code_AGA08-1                              0.0
Family_code_AKO16                                0.0
Family_code_AND17-12                             0.0
Family_code_AND17-25                             0.0
Family_code_AND17-28                             0.0
Family_code_AND17-35                             0.0
Family_code_AND17-36                          

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Make the random forest classifier
random_forest = RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -1)
# Train on the training data
random_forest.fit(imputed_df.loc[:, imputed_df.columns != 'Age_at_diagnosis'],imputed_df.loc[:, 'Age_at_diagnosis'])


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.6s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=50, verbose=1, warm_start=False)

In [16]:
# Extract feature importances
feature_importance_values = random_forest.feature_importances_
feature_importance_values

array([1.98979071e-02, 8.97602384e-03, 9.70905910e-03, ...,
       2.32996246e-04, 2.05647247e-03, 4.55932968e-05])