In [6]:
# decision tree
import pandas as pd
def splitting_function(data,motif):
    number_of_records = len(data.index)
    left_node_data = data[data[motif] == 1]
    right_node_data = data[data[motif] == 0]
    
    number_of_records_left = len(left_node_data.index)
    number_of_records_right = len(right_node_data.index)
    if number_of_records_left == 0 or number_of_records_right == 0:
        return 0.0
    
    P_l = number_of_records_left/float(number_of_records)
    P_r = 1 - P_l
    P_j1_left = len(left_node_data[left_node_data['Class'] == 1].index)/float(number_of_records_left)
    P_j1_right = len(right_node_data[right_node_data['Class'] == 1].index)/float(number_of_records_right)
    P_j_1_left = 1 - P_j1_left
    P_j_1_right = 1 - P_j1_right
   
    Phi = 2*P_l*P_r*(abs(P_j1_left-P_j1_right)+abs(P_j_1_left-P_j_1_right))
    return Phi
def find_best_motif(data):
    my_dict = {}
    for i in data.columns[:-1]:
        my_dict[i] = splitting_function(data,i)
    sorted_dict = sorted(my_dict, key=my_dict.get,reverse=True)
#     print sorted_dict
    return sorted_dict[0],my_dict[sorted_dict[0]]
class Node:
    def __init__(self,motif,data):
        self.left = False
        self.right = False
        self.motif = motif
        self.data = data
        self.leaf = False
        self.label = ""
    def insert_left(self):
        left_data = self.data[self.data[self.motif] == 1]
        best_motif,best_phi = find_best_motif(left_data)
        # check if this node will be a leaf, meaning that we will stop insert node
        if best_phi == 0 or len(left_data.index) <= 1:
            label = find_label_leaf(left_data)
            leaf_node = Node("leaf",left_data)
            leaf_node.leaf = True
            leaf_node.label = label
            self.left = leaf_node
        else:
            left_node = Node(best_motif,left_data)
            self.left = left_node
    def insert_right(self):
        right_data = self.data[self.data[self.motif] == 0]
        best_motif,best_phi = find_best_motif(right_data)
        # check if this node will be a leaf, meaning that we will stop insert node
        if best_phi == 0 or len(right_data.index) <= 1:
            label = find_label_leaf(right_data)
            leaf_node = Node("leaf",right_data)
            leaf_node.leaf = True
            leaf_node.label = label
            self.right = leaf_node
        else:
            right_node = Node(best_motif,right_data)
            self.right = right_node     
def inorder_traversal(node):
    print node.motif
    if node.leaf:
        print node.label    
    if node.left:
        inorder_traversal(node.left)
    if node.right:
        inorder_traversal(node.right)
def decision_tree_prediction(node,data,seq):
    # check leaf node
    if node.leaf:
#         print "node",node.motif
#         print "node",node.label
        return node.label
#         if node.label == 1:
#             return 1
    value = data.at[seq,node.motif]
#     print "value",value
    if value == 1:
        next_node = node.left
    else:
        next_node = node.right
#     print "next_node",next_node.motif
    return decision_tree_prediction(next_node,data,seq)
def find_label_leaf(data):
    pos_num = len(data[data['Class'] == 1].index)
    neg_num = len(data[data['Class'] == -1].index)
    if pos_num >= neg_num:
        return 1
    else:
        return -1
def grow_tree(node):
    if not node.leaf:
        node.insert_left()
        node.insert_right()
        grow_tree(node.left)
        grow_tree(node.right)    
    

In [9]:
# data
my_data = pd.DataFrame()
my_data['m1'] = [1,0,0,1,0,0]
my_data['m2'] = [0,1,0,0,0,0]
my_data['m3'] = [1,1,1,1,1,0]
my_data['class'] = [1,1,1,-1,-1,-1]
my_data.index = ['s1','s2','s3','s4','s5','s6']
my_data.ix[['s1','s1']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


Unnamed: 0,m1,m2,m3,class
s1,1,0,1,1
s1,1,0,1,1


In [3]:
# train a decision tree
root = Node("m2",my_data)
grow_tree(root)
inorder_traversal(root)

m2
leaf
1
m3
leaf
1
leaf
-1


In [4]:
# predict using a trained decision tree
for i in my_data.index.tolist():
    print i,decision_tree_prediction(root,my_data,i)

s1 1
s2 1
s3 1
s4 1
s5 1
s6 -1


In [5]:
srebp = pd.read_csv("final_datasets/SREBP.training.csv",index_col = 0)

In [7]:
find_best_motif(srebp)

('SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_3',
 0.5665939297482403)

In [8]:
root = Node("SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_3",srebp)
grow_tree(root)
inorder_traversal(root)

SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_3
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MEME_1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MEME_2
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_3
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_1
leaf
-1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MDscan_1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MDscan_3
leaf
1
leaf
-1
leaf
1
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_2
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_3
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_2
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MEME_2
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_1
leaf
1
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_2
SREBP_SREBF1_HepG2_encode-Snyder_seq

In [13]:
from random import choice,sample
def bootstrap_sampling(data):
    my_pool = data.index.tolist()
    my_sample = [choice(my_pool) for _ in range(len(my_pool))]
    return data.ix[my_sample]
def train_a_tree_random_forest(data,num_motif):
    feature_list = data.columns.tolist()[:-1]
    my_sample = bootstrap_sampling(data)
    my_features = sample(feature_list,num_motif)
    my_training_set = my_sample[my_features+['Class']]
    best_motif = find_best_motif(my_training_set)[0]
    root = Node(best_motif,my_training_set)
    grow_tree(root)
    return root
  

In [20]:
# random forest main code
class random_forest_classifier:
    def __init__(self,num_tree,num_motif):
        self.num_tree = num_tree
        self.num_motif = num_motif
    def fit(self,data):
        self.trees = []
        for i in range(self.num_tree):
            self.trees.append(train_a_tree_random_forest(data,self.num_motif))  
    def predict(self,data):
        for i in data.index.tolist():
            tree_result = []
            for tree in self.trees:
                tree_result.append(decision_tree_prediction(tree,data,i))
            if sum(tree_result) >= 0:
                print i,1
            else:
                print i,-1
    

In [22]:
my_rf = random_forest_classifier(3,7)
my_rf.fit(srebp)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """


In [28]:
inorder_traversal(my_rf.trees[2])

SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_3
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_2
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MDscan_2
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_2
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_3
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MDscan_1
leaf
1
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MDscan_1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_1
leaf
-1
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_1
leaf
1
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_MDscan_2
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_AlignACE_1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_3
leaf
1
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_3
leaf
-1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_2
leaf
-1
leaf
1
SREBP_SREBF1_HepG2_encode-Snyder_seq_hsa_insulin_Trawler_2
leaf
1
SREBP_SREBF1_

In [29]:
my_rf.predict(srebp)

chr3:195163762-195164066 1
chr12:24720923-24721791 -1
chr15:101947492-101947562 -1
chr21:44394620-44394755 1
chr14:34801397-34801701 -1
chr3:50337142-50337418 1
chr8:126010439-126010709 1
chr1:11866042-11866312 1
chr2:207207793-207208132 1
chr19:2061553-2061829 1
chr20:382442-382663 -1
chr11:43701990-43702166 1
chr11:64879129-64879433 1
chr12:74234680-74234784 -1
chr18:52785077-52785785 -1
chr18:19072933-19073545 -1
chr1:149224375-149224645 1
chr8:13647881-13648234 -1
chr9:86323455-86323725 1
chr8:140847168-140847355 -1
chr6:43774150-43774420 -1
chr16:71598699-71598969 1
chr20:52380313-52380589 -1
chr2:132952317-132953081 -1
chr1:162760321-162760407 1
chr2:220094240-220094544 1
chr1:155079834-155080104 1
chr10:102106629-102106883 1
chr12:12545001-12545129 -1
chr1:161864640-161864746 -1
chr3:76134-76616 -1
chr2:239228919-239229223 1
chr7:91764082-91764192 1
chr6:53213772-53214652 1
chr2:120301861-120302165 1
chr6:35370438-35370708 1
chr14:23755323-23755627 1
chr2:223725592-223725862 1
c

chr1:171788651-171789064 -1
chr2:28592686-28592909 1
chr10:93068947-93069202 -1
chr8:17764825-17765077 1
chr13:87829864-87829987 -1
chr15:43538518-43538578 1
chr14:98236329-98236684 1
chr3:45883545-45883821 1
chr16:58529057-58529327 1
chr7:132317018-132317134 -1
chr15:76327096-76327220 -1
chr9:101633869-101634470 -1
chr2:33747170-33748204 -1
chr8:10587944-10588213 1
chr5:20594214-20594469 -1
chr17:18510023-18510316 -1
chr12:120826062-120826150 -1
chr13:19963271-19963431 -1
chr6:11537577-11537881 1
chr2:203453474-203453778 1
chr5:61621966-61622172 -1
chr19:2783277-2783581 1
chr18:4411986-4412165 -1
chr13:28485408-28485743 -1
chrX:8774327-8774767 -1
chrX:10343303-10343697 -1
chr2:239148712-239148819 1
chr5:43313724-43313959 1
chr7:29247971-29248275 1
chr10:99340927-99341072 -1
chr1:32671049-32671353 1
chr11:6502685-6502807 1
chr5:29978048-29978282 -1
chr12:1586310-1586420 -1
chr1:161193371-161193535 1
chr1:23119118-23119200 -1
chr4:629400-629668 -1
chr19:18058641-18058945 1
chr4:11653157

chr9:92117921-92117986 -1
chr7:71688029-71688606 1
chr6:52028734-52028922 -1
chr12:107301362-107302051 1
chr19:19634943-19635118 1
chr11:118559195-118559465 -1
chr15:96873697-96874001 1
chr21:47648734-47648955 1
chr16:30064297-30064601 1
chr3:133465038-133465168 1
chr4:166248520-166248822 1
chr6:28456723-28456999 -1
chr17:80477387-80477657 1
chr20:44718554-44718858 1
chr22:23487398-23487702 1
chr1:57362751-57363097 -1
chr20:8000334-8000638 1
chr2:232469470-232469597 1
chr11:24848960-24849078 -1
chr6:166796414-166796718 1
chr14:68161477-68161753 -1
chr16:69760273-69760577 1
chrX:53644408-53644714 -1
chr10:25013498-25013802 1
chr1:215060129-215060406 -1
chr10:91403645-91403915 -1
chr2:232063179-232063483 1
chr2:47261487-47261763 -1
chr14:21152025-21152301 -1
chr2:189740010-189740414 -1
chr18:75809561-75809889 -1
chr12:120933580-120933884 1
chr6:90348454-90348589 -1
chr19:1940681-1940957 1
chr16:27542465-27543379 1
chr18:72483600-72483985 -1
chr19:893265-893541 -1
chr6:85067986-85068198 -

In [30]:
srebp_test = pd.read_csv("final_datasets/SREBP.testing.csv",index_col=0)