In [1]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib 
from collections import Counter

In [2]:
data = pd.read_csv('C:/Users/Henry Lee/Desktop/Direct/project/PDB_data/newsummary_1000.csv', index_col = 'Protein')
df1 = pd.DataFrame.dropna(data)
# Rearrange our dataset 
df1 = df1[['has_missing_residues', 'Sequence Length', 'resolution', 'b_factor_gt50', 'b_factor_max',
           'Electrically Charged', 'Hydrophobic', 'Nonpolar Side Chains', 'Special']]
df1.head()

Unnamed: 0_level_0,has_missing_residues,Sequence Length,resolution,b_factor_gt50,b_factor_max,Electrically Charged,Hydrophobic,Nonpolar Side Chains,Special
Protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2y39,True,110.0,1.41,0,46.46,0.38,0.42,0.11,0.09
2o73,True,992.0,1.8,4,65.24,0.3,0.41,0.18,0.08
3d5m,True,1116.0,2.2,3225,107.96,0.23,0.41,0.22,0.11
1gey,True,335.0,2.3,190,87.44,0.21,0.45,0.2,0.11
4y79,True,287.0,2.1,282,98.34,0.28,0.34,0.21,0.13


In [3]:
# dataset
x, y = df1.iloc[:, 1:].values, df1.iloc[:, 0] # x = features, y = predict goal(has_missing_residues)
# Random Forest Model
RF_model = RandomForestClassifier(n_estimators=500) # set 500 decision trees
RF_model.fit(x, y) # fitting our training data
# output model
joblib.dump(RF_model, 'RandomForest_model.pkl')

['RandomForest_model.pkl']

In [4]:
# List importance of each feature in the model
# Print accuracy of predicted test data
#RF_model = joblib.load('RandomForest_model.pkl') #load model
feature_name = df1.columns[1:] # name of features
importances = RF_model.feature_importances_ # get importances
indices = np.argsort(importances)[::-1] # index of sort of importance
for i in range(x.shape[1]):
    print("%2d) %-*s %f" % (i + 1, 30, feature_name[indices[i]], importances[indices[i]]))

# 5-fold cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(RF_model, x, y, cv = 5)
print('Accuracy of the prediction in 5-fold cross-validation = {:.2%}'.format(scores.mean()))

 1) b_factor_gt50                  0.183283
 2) Sequence Length                0.155884
 3) b_factor_max                   0.143815
 4) Electrically Charged           0.123692
 5) resolution                     0.116614
 6) Hydrophobic                    0.099938
 7) Nonpolar Side Chains           0.090007
 8) Special                        0.086767
Accuracy of the prediction in 5-fold cross-validation = 70.87%


In [5]:
# The path of single input data goes through the random forest model
feature_count_accum = []
for j, tree in enumerate(RF_model.estimators_):
    # matrix of nodes that input data go through(boolean)
    dense_matrix = tree.decision_path(x[150].reshape(1, -1)).todense() 
    #transform to array
    dense_sample = np.array(dense_matrix)[0] 
    # extract number of nodes that input data goes through
    node_position = np.where(dense_sample == 1)[0] 
    feature_count = []

    for i in range(len(node_position)):
        number = node_position[i]
        feature_count.append(feature_name[tree.tree_.feature[number]]) # feature name of specific node got from node_position
    feature_count_accum.extend(feature_count)
feature_order = Counter(feature_count_accum).most_common(8) # list 
print('Most used features of nodes that input data went through', feature_order)

Most used features of nodes that input data went through [('Nonpolar Side Chains', 1092), ('b_factor_gt50', 997), ('Sequence Length', 921), ('b_factor_max', 881), ('resolution', 813), ('Electrically Charged', 800), ('Hydrophobic', 696), ('Special', 547)]
