In [None]:
import sys
from common import commons
home = commons.home
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from features_selection import Feature_Selection as FS
from log import Logger
from hyperopt import fmin,tpe,hp, STATUS_OK,Trials
import math
from features_selection import  WilcoxonRankSums
from sklearn.externals import joblib
from features_selection import feature_selection_commons as fsc

In [3]:
dataset = 'AD_CpG'
type_name = commons.type_name  ## amyloid, cerad, tangles
with_cell_type = commons.with_cell_type ## with or without
dataset = dataset+'/'+type_name+with_cell_type
log_dir = home+'logs/'
logger = Logger.Logger(log_dir,False).get_logger()
with pd.HDFStore(home+'data/'+dataset+'/all_features','r') as h5s:
    all_data = h5s['all_features']
all_data['beta_sign'] = all_data['label']
#all_data['coordinate'] = all_data['coordinate'].astype('i8')
all_data.drop(['coordinate','chr'],axis=1,inplace=True)
all_data['dist_to_nearest_tss'] = all_data['dist_to_nearest_tss'].astype('i8')
all_data = fsc.data_selection(all_data,classes=[0,1,-1],combine=True)
#all_data = fsc.data_selection(all_data,classes=[0,1,-1],combine=True)
all_features = all_data
#all_features = fsc.subset_control(all_data,30)
#all_features = all_data.query('beta_sign>0') ##only for hypermethylated sites in RICHS dataset, for AD dataset, hyper/hypo status can't be determined from beta
#logger.info('only keep heypermethylated sites')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [4]:
if type_name == 'cerad':
    type_weight_factor = 0.23
elif type_name == 'amyloid':
    type_weight_factor = 0.3
elif type_name == 'cogdec':
    type_weight_factor = 0.4
elif type_name == 'gpath':
    type_weight_factor = 0.3
elif type_name == 'braak':
    type_weight_factor = 0.3
elif type_name == 'tangles':
    type_weight_factor = 0.3
else:
    type_weight_factor = 0.3

In [5]:
#split train test data and scaling on train data
scaler_type='MinMax'
all_features.drop(['id','winid','beta','beta_sign'],axis=1,inplace=True)
train_x,train_label,test_x,test_label,_ = commons.train_test_split(all_features,scaler=scaler_type)
train_x.reset_index(drop=True,inplace=True)
train_label.reset_index(drop=True,inplace=True)
test_x.reset_index(drop=True,inplace=True)
test_label.reset_index(drop=True,inplace=True)

In [6]:
sample_weights_train = commons.sample_weights(train_x,train_label,factor=1)
sample_weights_test = commons.sample_weights(test_x,test_label,factor=1)
weight_min_max_ratio = sample_weights_train.max()/sample_weights_train.min()
logger.info('weight max ratio: %f',weight_min_max_ratio)

In [7]:
weight_min_max_ratio

36602.50799176876

In [8]:
train_x.drop(['pvalue'],axis=1,inplace=True)
test_x.drop(['pvalue'],axis=1,inplace=True)

In [9]:
fs_sample_weights = np.power(sample_weights_train, type_weight_factor) 

In [10]:
fs_sample_weights.max()/fs_sample_weights.min()

23.39124044730199

In [11]:
methods = ['random_forest','xgboost','logistic_regression','linear_SVC']
all_intersect = False
fs_params = fsc.method_params(methods)
fs = FS.FeatureSelection(class_num=2,methods=methods,all_intersect=all_intersect,**fs_params)
logger.info('Feature selection methods are: '+str(methods))
logger.info('All intersected features: '+str(all_intersect))
fs.fit(sample_weight=fs_sample_weights)
selected_features = fs.transform(train_x,train_label)
logger.info('selected features number is: %d\n',selected_features.shape[0])
logger.info(selected_features)
reduced_train_x = train_x[selected_features['feature']]
reduced_test_x = test_x[selected_features['feature']]
total_x = pd.concat([reduced_train_x,reduced_test_x],ignore_index=True)
total_label = pd.concat([train_label,test_label],ignore_index=True)
total_weights = pd.concat([sample_weights_train,sample_weights_test],ignore_index=True)

feature_diff_stats = fsc.selected_feature_analysis(selected_features['feature'],total_x,total_label)
feature_diff_stats = pd.merge(feature_diff_stats,selected_features,on='feature')
selected_features_100 = feature_diff_stats if len(feature_diff_stats) <=50 else feature_diff_stats.sort_values(['n','pvalue'],ascending=[False,True])[:60]
reduced_train_x = train_x[selected_features_100['feature']]
reduced_test_x = test_x[selected_features_100['feature']]
total_x = pd.concat([reduced_train_x,reduced_test_x],ignore_index=True)
total_label = pd.concat([train_label,test_label],ignore_index=True)
total_weights = pd.concat([sample_weights_train,sample_weights_test],ignore_index=True)

[[ 0.          0.         -0.09924293 ...  0.          0.
   0.        ]]


In [12]:
selected_features_100

Unnamed: 0,diff(pos-neg),feature,pvalue,stats,n
269,-0.074488,Medullo,3.348335e-10,-6.281727,4.0
271,0.02571,iPS_CWRU1,3.552141e-09,5.903804,4.0
274,0.045386,Fetal_Muscle_Trunk-H3K4me1,6.096477e-08,5.415948,4.0
267,-0.091538,NCFF730NQT_WGBS_counts,2.798484e-06,-4.685082,4.0
272,0.016648,Brain_Cingulate_Gyrus-H3K4me1,5.190335e-06,4.556944,4.0
277,0.027953,Brain_Mid_Frontal_Lobe-H3K4me1,1.363633e-05,4.349637,4.0
278,0.140283,genocanyon_score,3.416859e-05,4.143741,4.0
268,-0.077227,ENCFF733EFJ_WGBS_counts,0.0001036776,-3.88182,4.0
270,0.034462,Monocytes-CD14+_RO01746-H3K4me1,0.0007560002,3.368421,4.0
276,0.039544,CD19_Primary_Cells-H3K4me1,0.001041419,3.279091,4.0


In [7]:
selected_features_100 = pd.read_csv(home+'data/'+dataset+'/feature_stats.csv')

In [8]:
_,_,_,_,scaler = commons.train_test_split(all_features[['label','pvalue']+list(selected_features_100['feature'])],scaler=scaler_type)
joblib.dump(scaler,home+'data/'+dataset+'/scaler.pkl')
print('Data scaler type is: %s'%scaler_type)

Data scaler type is: MinMax


In [33]:
selected_features_100

Unnamed: 0.1,Unnamed: 0,diff(pos-neg),feature,pvalue,stats,n
0,346,0.034585,Penis_Foreskin_Fibroblast_Primary_Cells-H3K36me3,1.273059e-07,5.282683,4.0
1,342,0.032203,Skeletal_Muscle-H3K4me1,4.644749e-06,4.580225,4.0
2,344,0.039454,Dnd41-H4K20me1,9.063674e-06,4.438384,4.0
3,348,0.067274,Penis_Foreskin_Keratinocyte_Primary_Cells-H3K4me1,2.514063e-05,4.213533,4.0
4,340,0.112865,genocanyon_score,4.173064e-05,4.097683,4.0
5,345,0.054581,Spleen-H3K4me1,5.232399e-05,4.044996,4.0
6,349,0.034248,Rectal_Mucosa-H3K4me1,6.228165e-05,4.003995,4.0
7,347,0.062983,NCFF692JTJ_WGBS_counts,0.01429415,2.449785,4.0
8,339,0.016642,Psoas_Muscle-H3K27ac,0.0188318,2.348844,4.0
9,343,0.011695,MCF-7-PML,0.02524536,2.237627,4.0


In [14]:
selected_features_100.to_csv(home+'data/'+dataset+'/feature_stats.csv')

In [None]:
#TSNEPlot(plot_data,class_labels=[0,1,2],param_map={0:('g','^',20),1:('b','*',20),2:('r','o',20)})
plot_data = reduced_train_x.copy()
plot_data['label'] = train_label
fsc.TSNEPlot(plot_data,class_labels=[0,1],param_map={0:('k','^',20),1:('r','*',20)})

In [15]:
with pd.HDFStore(home+'data/'+dataset+'/selected_features','w') as h5s:
    h5s['train_x'] = reduced_train_x
    h5s['train_label'] = train_label
    h5s['test_x'] = reduced_test_x
    h5s['test_label'] = test_label
    h5s['sample_weights_train'] = sample_weights_train
    h5s['sample_weights_test'] = sample_weights_test