In [15]:
from scipy.stats import entropy
import numpy as np

def calculate_relative_entropy(motif, predictedmotif):
  """Compute the per-row relative entropy between two PWM text files.

  The two files are read in lockstep. The first line of each file is treated
  as a header and skipped. Every following line is split on whitespace and
  its first four fields are parsed as floats. Reading stops at the first line
  of `motif` whose first field is "<", at a blank line, or at end of file.

  The row count declared in the header is not relied on; the number of rows
  actually present in `motif` decides the length of the result.

  Args:
    motif: path of the reference PWM file (used as `qk`).
    predictedmotif: path of the predicted PWM file (used as `pk`).

  Returns:
    A list with one value per row:
    entropy(predicted_row, qk=reference_row, base=2).

  Raises:
    IndexError: if a row in either file has fewer than four fields (this
      includes the predicted file running out of rows first).
    ValueError: if one of the first four fields of a row is not a number.
  """
  # Exact zeros are replaced by this tiny value so that the log-ratio inside
  # the relative entropy stays finite.
  zero_floor = 1.0e-16

  def _parse_row(fields):
    # Convert the first four whitespace-separated fields into a float row.
    values = [float(fields[i]) for i in range(4)]
    return np.array([zero_floor if value == 0.0 else value for value in values])

  entropy_array = []

  # `with` guarantees both handles are closed even if parsing fails midway.
  with open(motif, 'r') as motif_PWM, open(predictedmotif, 'r') as p_motif_PWM:
    # Skip the header line of each file.
    motif_PWM.readline()
    p_motif_PWM.readline()

    for raw_line in motif_PWM:
      line = raw_line.split()
      # Consume the matching predicted line so both files stay aligned.
      p_line = p_motif_PWM.readline().split()

      # A "<" line closes the matrix; a blank line is treated the same way
      # instead of raising IndexError on line[0].
      if not line or line[0] == "<":
        break

      reference_row = _parse_row(line)
      predicted_row = _parse_row(p_line)
      entropy_array.append(entropy(predicted_row, qk=reference_row, base=2, axis=0))

  return entropy_array

In [16]:
def calculate_result_overlap(sites, predictedsites, motif_length):
  """Print how well predicted site positions overlap the reference positions.

  Each line of `sites` and `predictedsites` holds one integer position; the
  i-th line of one file is compared with the i-th line of the other. Blank
  lines are skipped. The motif length is the first whitespace-separated
  field on the first line of `motif_length`.

  For each pair, an exact match contributes `length` overlapping positions;
  otherwise the contribution is `length - |site - predicted|` when that is
  positive, and nothing when it is not. The ratio is the summed overlap
  divided by `length * number_of_sites`, or 0.0 when that product is zero.

  Args:
    sites: path of the file with the reference positions.
    predictedsites: path of the file with the predicted positions.
    motif_length: path of the file whose first field is the motif length.

  Returns:
    None. The summary is printed to stdout.

  Raises:
    IndexError: if `predictedsites` has fewer positions than `sites`, or
      if the first line of `motif_length` is empty.
    ValueError: if a position or the motif length is not an integer.
  """
  def _read_positions(path):
    # One integer per non-blank line; a trailing empty line is not an error.
    with open(path) as handle:
      return [int(row) for row in handle if row.strip()]

  # `with` closes the length file, which was previously left open.
  with open(motif_length, 'r') as length_handle:
    length = int(length_handle.readline().split()[0])

  sites_list = _read_positions(sites)
  predictedsites_list = _read_positions(predictedsites)

  list_length = len(sites_list)
  number_perfect_matches = 0
  best_case_overlapping_pos = length * list_length
  overlap_counter = 0

  for i, site_i in enumerate(sites_list):
    position_diff = abs(site_i - predictedsites_list[i])

    if position_diff == 0:
      number_perfect_matches += 1
      overlap_counter += length
    elif length > position_diff:
      overlap_counter += length - position_diff

  # With no sites (or a zero motif length) there is nothing to overlap;
  # report 0.0 rather than dividing by zero.
  if best_case_overlapping_pos:
    overlap_ratio = overlap_counter / best_case_overlapping_pos
  else:
    overlap_ratio = 0.0

  #Uncomment 1 of the 2 print statements below according to your needs

  print("{0}/{1} perfect matches, ratio of overlap = {2}".format(number_perfect_matches, list_length, overlap_ratio))
  #print(overlap_ratio)

In [17]:
from collections import OrderedDict

def organize(filename, num_datasets=100):
    """Print one column of a results file, ordered by dataset number.

    Only the first `num_datasets` lines of the file are read. Each non-blank
    line is split on whitespace; its first field is used as the key and its
    fourth field as the value. The values for "data_set_1" through
    "data_set_<num_datasets>" are then printed, one per line, in that order.

    Args:
        filename: path of the whitespace-separated results file.
        num_datasets: how many lines to read and how many datasets to print.
            Defaults to 100.

    Returns:
        None. The values are printed to stdout.

    Raises:
        IndexError: if a non-blank line has fewer than four fields.
        KeyError: if one of the expected "data_set_<n>" keys was not found.
    """
    # Named to avoid shadowing the built-in `dict`.
    value_by_dataset = {}

    # `with` closes the file, which was previously left open.
    with open(filename, 'r') as file_handle:
        for line_number, data_line in enumerate(file_handle):
            if line_number >= num_datasets:
                break
            fields = data_line.split()
            if not fields:
                continue
            value_by_dataset[fields[0]] = fields[3]

    for j in range(1, num_datasets + 1):
        print(value_by_dataset["data_set_" + str(j)])
        
# Print the fourth field recorded in run2.txt for data_set_1 .. data_set_100,
# one value per line in dataset order.
organize("run2.txt")

1456.4828859000008
1429.9108263000016
1426.4832652000005
1497.3192337
1517.2834447999994
1618.760872499999
1617.091994800001
1615.7746905999993
1561.3533486000015
1644.5240597999982
1635.0659563000008
1647.1807657000008
1560.9438923999987
1621.4442813999995
1632.8075968000012
1633.4372311999996
1613.0735912000018
1647.6980773999985
1642.1950927999987
1628.8069895999997
1598.7705666000002
1615.9024327999978
1621.8386991000007
1611.3261669999993
1623.5828908000003
1627.103529600001
1628.3239255000008
1631.5726976999995
1625.0125322999993
1608.496390299999
1622.6518265999985
1606.7555046999987
1621.9739238000002
1632.6511617999986
1626.1240992000003
1614.0794243
1609.554666
1616.7116975000008
1621.0006660000017
1610.9649556999993
1290.3500554999991
1292.089393100001
1295.7693788999995
1294.9764186000002
1300.4109461000007
1299.7770120999994
1291.1595129999987
1296.5365613999993
1296.0363439000012
1307.9193745000011
1436.1528311000002
1429.9402831999996
1455.3567857999988
1450.545986799999

In [18]:
# An automated way to calculate relative entropy for all 100 datasets.
# For every folder data_set_1 .. data_set_100 this compares motif.txt with
# predictedmotif.txt and reports the per-row values, their sum and their mean.

for i in range(1, 101):
    current_folder = "data_set_"+str(i)
    raw_val = calculate_relative_entropy(current_folder+'/motif.txt', current_folder+'/predictedmotif.txt')

    # Accumulate under the name `total`: binding `sum` at notebook scope would
    # replace the built-in sum() for every cell executed afterwards.
    total = 0
    for v in raw_val:
        total += v

    length = len(raw_val)
    # An empty result has no mean; report NaN instead of dividing by zero.
    average = total / length if length else float("nan")

    #Uncomment 1 of the 4 print statements below according to your needs

    print("Average = {0}, Sum = {1}, Raw Values = {2}".format(average, total, raw_val))
    #print(total)
    #print(average)
    #print(raw_val)

Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 0.0, Sum = 0.0, Raw Values = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Average = 2.5421871404864778, Sum = 20.337497123891822, Raw Values = [3.5150803000741484, 3.984075893663425, 3.984075893663426, 3.013125299208761, 1.9985402777643975, 0.5262139888732773, 3.01312529920

In [19]:
# Score the predicted sites of all 100 datasets against the reference sites.
# Each folder data_set_<i> supplies the three input files.

for i in range(1, 101):
    current_folder = "data_set_" + str(i)
    # The callee prints the per-dataset summary itself; nothing is returned.
    calculate_result_overlap(
        sites=current_folder + '/sites.txt',
        predictedsites=current_folder + '/predictedsites.txt',
        motif_length=current_folder + '/motiflength.txt',
    )

10/10 perfect matches, ratio of overlap = 1.0
10/10 perfect matches, ratio of overlap = 1.0
9/10 perfect matches, ratio of overlap = 0.9
10/10 perfect matches, ratio of overlap = 1.0
10/10 perfect matches, ratio of overlap = 1.0
10/10 perfect matches, ratio of overlap = 1.0
10/10 perfect matches, ratio of overlap = 1.0
10/10 perfect matches, ratio of overlap = 1.0
10/10 perfect matches, ratio of overlap = 1.0
10/10 perfect matches, ratio of overlap = 1.0
0/10 perfect matches, ratio of overlap = 0.0875
0/10 perfect matches, ratio of overlap = 0.0
0/10 perfect matches, ratio of overlap = 0.0
9/10 perfect matches, ratio of overlap = 0.9
0/10 perfect matches, ratio of overlap = 0.025
3/10 perfect matches, ratio of overlap = 0.475
0/10 perfect matches, ratio of overlap = 0.0
0/10 perfect matches, ratio of overlap = 0.0
10/10 perfect matches, ratio of overlap = 1.0
6/10 perfect matches, ratio of overlap = 0.6
0/10 perfect matches, ratio of overlap = 0.0
10/10 perfect matches, ratio of overla