In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score,f1_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Imputer
from sklearn.externals import joblib
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import lightgbm as lgb

# Columns removed from the feature table before it is used as model input
# (see load_data). 'label' is in this list because load_data reads it
# separately as the target.
DROP_COL = ['chromStart','chromEnd','id','chrom-Enh','TSS','label']

  from numpy.core.umath_tests import inner1d


In [3]:
def load_data(features, test_size=0.33, random_state=42):
	"""
	Split a complete feature table into train and test sets.

	The target is the 'label' column of `features`; the model inputs are
	what remains after the columns listed in DROP_COL are dropped.

	@features: pandas DataFrame holding the feature columns together with
	           'label' and the other columns named in DROP_COL
	@test_size: forwarded to train_test_split (default 0.33)
	@random_state: seed forwarded to train_test_split (default 42), so the
	               split is reproducible

	return: X_train, X_test, y_train, y_test

	raises: KeyError if 'label' is missing from `features`
	"""
	y = features['label']
	X = features.drop(DROP_COL, axis=1)

	X_train, X_test, y_train, y_test = train_test_split(
		X, y, test_size=test_size, random_state=random_state)

	return X_train, X_test, y_train, y_test

In [74]:
def draw_feature_importance(feature_importance, features_name_vec):
	"""
	Draw a horizontal bar chart of relative feature importance.

	Scores are rescaled so the largest one is 100, sorted in ascending
	order, plotted, written to "feature_importance.png" and then shown.

	@feature_importance: 1-D array-like of numeric importance scores
	@features_name_vec: array-like of feature names, aligned with the scores
	"""
	# Coerce to ndarrays so plain lists work too: the fancy indexing with
	# sorted_idx below is not supported by Python lists.
	feature_importance = np.asarray(feature_importance, dtype=float)
	features_name_vec = np.asarray(features_name_vec)

	# Rescale to a 0-100 range. Skip the division when every score is zero,
	# otherwise 0/0 would fill the chart with NaN.
	top_score = feature_importance.max()
	if top_score != 0:
		feature_importance = 100.0 * (feature_importance / top_score)

	sorted_idx = np.argsort(feature_importance)
	# Centre each bar on k + 0.5 so the tick labels line up with the bars.
	pos = np.arange(sorted_idx.shape[0]) + .5
	# NOTE(review): this draws into the right-hand cell of a 1x2 grid; this
	# function leaves the left-hand cell empty.
	plt.subplot(1, 2, 2)
	plt.barh(pos, feature_importance[sorted_idx], align='center')
	plt.yticks(pos, features_name_vec[sorted_idx])
	plt.xlabel('relative importance')
	plt.title('features importance')
	plt.savefig("feature_importance.png")
	plt.show()

In [4]:
# Load the training table and forward-fill missing values: each NaN takes the
# last valid value above it in the same column. NaNs in the first rows have
# nothing above them and are left as they are.
features = pd.read_csv('../data/k562/midfile/train.csv')
# .ffill() is the equivalent of fillna(method='pad'), whose `method` argument
# is deprecated in current pandas.
features = features.ffill()

In [5]:
# Split the loaded feature table into train/test inputs and labels.
X_train, X_test, y_train, y_test = load_data(features)

In [None]:
# (rows, columns) of the training features.
# NOTE(review): the original cell was the unfinished expression `X_train.s`,
# which raises AttributeError on a full run; `.shape` is assumed to be the intent.
X_train.shape

In [98]:
# Preview the first 100 rows of the training features.
X_train.head(100)

Unnamed: 0,ENCFF408YVC_avg_sigval,ENCFF408YVC_max_sigval,ENCFF408YVC_min_sigval,ENCFF408YVC_std_sigval,ENCFF408YVC_var_sigval,ENCFF408YVC_avg_peak,ENCFF408YVC_max_peak,ENCFF408YVC_min_peak,ENCFF408YVC_std_peak,ENCFF408YVC_var_peak,...,ENCFF895BJS_promoter_2k_max_sigval,ENCFF895BJS_promoter_2k_min_sigval,ENCFF895BJS_promoter_2k_std_sigval,ENCFF895BJS_promoter_2k_var_sigval,ENCFF895BJS_promoter_2k_avg_peak,ENCFF895BJS_promoter_2k_max_peak,ENCFF895BJS_promoter_2k_min_peak,ENCFF895BJS_promoter_2k_std_peak,ENCFF895BJS_promoter_2k_var_peak,ENCFF895BJS_promoter_2k_peak_density
30509,3.849510,3.84951,3.84951,4.982159,24.821910,290.000000,290.0,290.0,166.304941,27657.333333,...,8574.0,11.0,592.943342,351581.806811,75.0,75,75,0.0,0.0,36052
55339,6.391090,6.49409,6.28809,0.145664,0.021218,289.000000,515.0,63.0,319.612265,102152.000000,...,17334.0,2.0,609.580877,371588.845739,75.0,75,75,0.0,0.0,314754
54419,6.134710,6.13471,6.13471,0.691805,0.478594,281.000000,281.0,281.0,0.707107,0.500000,...,17334.0,2.0,609.580877,371588.845739,75.0,75,75,0.0,0.0,314754
73783,4.381930,4.38193,4.38193,3.813243,14.540822,115.000000,115.0,115.0,53.033009,2812.500000,...,17334.0,4.0,606.124265,367386.624116,75.0,75,75,0.0,0.0,265934
7334,3.618340,3.61834,3.61834,2.409884,5.807539,73.000000,73.0,73.0,115.258405,13284.500000,...,17334.0,7.0,600.952887,361144.372464,75.0,75,75,0.0,0.0,218532
36461,2.566340,2.56634,2.56634,0.505369,0.255398,501.000000,501.0,501.0,33.941125,1152.000000,...,17334.0,2.0,609.580877,371588.845739,75.0,75,75,0.0,0.0,314754
22341,5.247556,6.46994,3.83115,1.154335,1.332489,109.000000,176.0,65.0,41.683330,1737.500000,...,17334.0,2.0,609.580877,371588.845739,75.0,75,75,0.0,0.0,314754
76047,4.399440,4.39944,4.39944,0.944405,0.891902,396.000000,396.0,396.0,145.445179,21154.300000,...,17334.0,2.0,609.580877,371588.845739,75.0,75,75,0.0,0.0,314754
359,2.993870,2.99387,2.99387,1.595968,2.547115,247.000000,247.0,247.0,10.606602,112.500000,...,17334.0,7.0,601.230439,361478.040930,75.0,75,75,0.0,0.0,231234
91862,4.628180,4.62818,4.62818,0.996816,0.993641,109.000000,109.0,109.0,116.672619,13612.500000,...,17334.0,2.0,609.580877,371588.845739,75.0,75,75,0.0,0.0,314754


In [99]:
# Persist every split next to the model so a saved model can be re-checked
# later against exactly the same data.
# NOTE(review): the features are read from a k562 path but stored under
# IMR90 - confirm the target folder is the intended one.
for split, target in (
    (y_test, "../model_file/IMR90/y_test.pkl"),
    (X_test, "../model_file/IMR90/X_test.pkl"),
    (y_train, "../model_file/IMR90/y_train.pkl"),
    (X_train, "../model_file/IMR90/X_train.pkl"),
):
    split.to_pickle(target)

In [100]:
# Hold out the first 5000 rows of the training split for validation and fit
# on the rest. `.iloc` makes the positional slicing explicit (the index holds
# shuffled row ids, not positions).
train_data = lgb.Dataset(X_train.iloc[5000:], label=y_train.iloc[5000:])
# Tie the validation set to the training set so both are binned identically,
# as the LightGBM documentation recommends for validation data.
valid_data = lgb.Dataset(X_train.iloc[:5000], label=y_train.iloc[:5000],
                         reference=train_data)

In [101]:
# LightGBM settings: binary classification, scored by classification error.
# `num_trees` is an alias of `num_iterations`; when it is present in the
# params LightGBM uses it in preference to the `num_boost_round` argument
# of `lgb.train`.
param = {
    'num_leaves': 31,
    'num_trees': 200,
    'objective': 'binary',
    'metric': 'binary_error',
}

In [102]:
# param['num_trees'] = 200 (an alias of num_iterations) takes precedence over
# the num_boost_round argument, so the 300 previously written here was never
# used. Keep the argument in agreement with the number of rounds actually run.
num_round = 200
bst = lgb.train(param, train_data, num_round, valid_sets=[valid_data])



[1]	valid_0's binary_error: 0.337
[2]	valid_0's binary_error: 0.337
[3]	valid_0's binary_error: 0.2692
[4]	valid_0's binary_error: 0.2226
[5]	valid_0's binary_error: 0.2054
[6]	valid_0's binary_error: 0.196
[7]	valid_0's binary_error: 0.1902
[8]	valid_0's binary_error: 0.185
[9]	valid_0's binary_error: 0.1812
[10]	valid_0's binary_error: 0.1794
[11]	valid_0's binary_error: 0.1772
[12]	valid_0's binary_error: 0.1754
[13]	valid_0's binary_error: 0.1732
[14]	valid_0's binary_error: 0.1732
[15]	valid_0's binary_error: 0.1732
[16]	valid_0's binary_error: 0.171
[17]	valid_0's binary_error: 0.1698
[18]	valid_0's binary_error: 0.1696
[19]	valid_0's binary_error: 0.169
[20]	valid_0's binary_error: 0.168
[21]	valid_0's binary_error: 0.166
[22]	valid_0's binary_error: 0.1646
[23]	valid_0's binary_error: 0.1634
[24]	valid_0's binary_error: 0.162
[25]	valid_0's binary_error: 0.1608
[26]	valid_0's binary_error: 0.1604
[27]	valid_0's binary_error: 0.1608
[28]	valid_0's binary_error: 0.159
[29]	valid_

In [103]:
# Write the trained booster to disk so it can be reloaded without retraining.
bst.save_model('../model_file/IMR90/bst.txt')

<lightgbm.basic.Booster at 0x7f67fe800198>

In [104]:
# Score the held-out test split, turn each score into a hard 0/1 label
# (strictly above 0.5 -> 1) and report the F1 score against the true labels.
ypred = bst.predict(X_test)
y_pred = [1 if score > 0.5 else 0 for score in ypred]
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f1)

0.9396744720759969


In [34]:
# Export the booster's per-feature importance scores, least important first.
feature_importance = bst.feature_importance()
print(feature_importance)

feature_name = np.array(X_train.columns.tolist())
sorted_idx = np.argsort(feature_importance)
sorted_name = feature_name[sorted_idx]

# Build the frame from separate columns so 'score' keeps its numeric dtype.
# Stacking scores and names with np.c_ coerces the scores to strings, and a
# later sort on that column orders them lexicographically ('10' before '2').
# The rows are already in ascending numeric order thanks to the argsort above.
df_draw = pd.DataFrame(
    {'score': feature_importance[sorted_idx], 'fname': sorted_name},
    columns=['score', 'fname'],
)
# NOTE(review): the output goes to a GM12878 path while the features are read
# from a k562 path - confirm the destination.
df_draw.to_csv('../data/GM12878/midfile/feature_importance_gm12878_categoty.csv', index=False)


[1 4 1 ... 0 0 0]


### chrom in tad and not in tad