In [None]:
import os
import numpy as np
import pandas as pd
import glob

In [None]:
from Bio import SeqIO

# Map every contig ID in the reference FASTA to the length of its sequence
sequence_lengths = {
    record.id: len(record)
    for record in SeqIO.parse('ref/dm6.fasta', "fasta")
}

# Ranking contigs based on G4Hunter scores

In [None]:
# These loops build a dictionary keyed by contig. Each value starts as a per-base
# array (one slot per position of the contig) that is incremented by 1 over every
# G4Hunter hit, so a position is 0 where no G4 was reported.

threshold = 1
master_dict_contig = {}
for n, contig_length in sequence_lengths.items():
    fns = glob.glob('results/Results_dm6_threshold_{}/{}.split.txt'.format(threshold, n))
    for fn in fns:
        # Contigs without a result file never get an entry in the dictionary
        if n not in master_dict_contig:
            master_dict_contig[n] = np.zeros((contig_length,))
        arr = master_dict_contig[n]

        df = pd.read_csv(fn, delimiter='\t')

        # Walk the two coordinate columns directly instead of iterating `df.iloc`:
        # row-wise access builds one Series per row and can upcast integer
        # coordinates to float when the other columns are floats, which would
        # make them unusable as slice bounds.
        for start, end in zip(df['Start'], df['End']):
            # End is treated as inclusive, hence the + 1
            arr[int(start):(int(end) + 1)] += 1

# Replace each per-base array with its mean (average per-base G4 coverage of the contig)
master_dict_contig = {key: values.mean() for key, values in master_dict_contig.items()}

# Convert to a list of [contig, mean] pairs
master_list_contig = [[key, value] for key, value in master_dict_contig.items()]


In [None]:
import csv

# Sort the averages, highest value first (sorts master_list_contig in place)
master_list_contig.sort(key=lambda x: x[1], reverse=True)

# field names
fields = ['Name', 'Density of G4s per Contig']

# data rows of csv file
rows = master_list_contig

# newline='' hands line-ending control to the csv module; without it every row
# is terminated with '\r\r\n' on Windows, which shows up as blank lines.
with open('rank_contigs_with_G4Hunter_threshold_{}.csv'.format(threshold), 'w', newline='') as f:
    # using csv.writer method from CSV package
    write = csv.writer(f)

    write.writerow(fields)
    write.writerows(rows)

# Determining enrichment

In [None]:
# Change the centromere to analyze the different centromeres
from scipy.stats import ks_2samp  # imported here so the cell runs on a fresh kernel

centromere = 'Y'
cen_len = {'2': 155000, '3': 103827, '4': 109000, 'X': 158600, 'Y': 139957}
master_dict = {}
cen_master_dict = {}
threshold = 2


def _add_g4_coverage(coverage, g4_hits, offset=0):
    """Add 1 to `coverage` (in place) over every G4 hit in `g4_hits`.

    coverage: 1-D numpy array; index 0 corresponds to coordinate `offset`.
    g4_hits:  DataFrame with 'Start' and 'End' columns (End treated as inclusive).
    offset:   coordinate subtracted from both Start and End before indexing.
    """
    for g4_start, g4_end in zip(g4_hits['Start'], g4_hits['End']):
        inner_start = int(g4_start) - offset
        inner_end = int(g4_end) - offset
        coverage[inner_start:(inner_end + 1)] += 1


# For controls
for n in os.listdir("control/long_dna/C{}".format(centromere)):
    # File names are split on '-': field 2 is "<contig>_<start>", field 3 is "<end>"
    name_parts = n.split('-')
    contig_and_start = name_parts[2].split('_')
    contig = '_'.join(contig_and_start[:-1])
    start = int(contig_and_start[-1])
    end = int(name_parts[3])

    contig_identifier = '-'.join([contig, str(start)])

    # The G4 file to open for each control contig
    fn = 'results/Results_dm6_threshold_{}/{}.split.txt'.format(threshold, contig)

    if contig_identifier not in master_dict:
        master_dict[contig_identifier] = np.zeros((cen_len[centromere],))

    # For contigs where G4Hunter found no G4s
    if not os.path.isfile(fn):
        continue

    df = pd.read_csv(fn, delimiter='\t')
    # Keep only hits that lie entirely inside the control window
    df = df[(df['Start'] >= start) & (df['End'] <= end)]

    # Both ends are shifted by the window START. Subtracting the window end from
    # 'End' gave a zero or negative stop index, which numpy reads relative to the
    # end of the array and which drops hits that finish at the window edge.
    _add_g4_coverage(master_dict[contig_identifier], df, offset=start)


# For the centromere
for n in [n.replace('\uf03a', ':') for n in os.listdir('cen/dna/C{}'.format(centromere))]:
    if '.fai' in n:
        continue
    contig = n.split('-')[1]

    contig_identifier = contig

    # The G4 file to open for the centromere contig
    fn = 'results/Results_dm6_threshold_{}/{}.split.txt'.format(threshold, contig)

    if contig_identifier not in cen_master_dict:
        cen_master_dict[contig_identifier] = np.zeros((cen_len[centromere],))

    # Same convention as the controls: no result file means no G4s, keep the zeros
    if not os.path.isfile(fn):
        continue

    df = pd.read_csv(fn, delimiter='\t')
    # Coordinates are used as-is (no offset) to index the centromere array
    _add_g4_coverage(cen_master_dict[contig_identifier], df)


# Two-sample KS test of the centromere coverage array against each control array;
# only the p-value (index 1 of the result) is kept.
all_ks_avgs = []
cen_iden = list(cen_master_dict.keys())[0]

for key, values in master_dict.items():
    all_ks_avgs.append(ks_2samp(cen_master_dict[cen_iden], values)[1])
print('Average P Value for centromere', centromere, ":", np.mean(np.array(all_ks_avgs)))


hue = []
data = []
names = []
t_cen = []
t_con = []
minl = 0

# Mean coverage of the centromere, saved as a one-value CSV
sdata = np.mean(cen_master_dict[cen_iden])
t_cen.append(sdata)
data.append(sdata)
hue.append(1)
names.append('Cen')
np.savetxt(
    "control/csv-results/centromere-{}-G4Hunter-threshold-{}-.csv".format(centromere, threshold),
    data,
    delimiter=",",
)

# data and names are restarted for the controls; hue keeps the centromere entry
data = []
names = []

# Mean coverage of every control window, saved one value per line
for key, values in master_dict.items():
    sdata = np.mean(values)
    t_con.append(sdata)
    data.append(sdata)
    hue.append(0)
    names.append('Control')
np.savetxt(
    "control/csv-results/long-control-{}-G4Hunter-threshold-{}-long-.csv".format(centromere, threshold),
    data,
    delimiter=",",
)

In [None]:
# One sample T test
from scipy.stats import ttest_1samp
import pandas as pd
cen_len_index = 'Y'
# NOTE: must match the threshold that was used when the CSV files were written
threshold = 1

# Control means, one value per line; kept as the (n, 1) array read_csv returns
control_means = pd.read_csv(
    "control/csv-results/long-control-{}-G4Hunter-threshold-{}-long-.csv".format(cen_len_index, threshold),
    header=None,
).values

# The centromere mean is saved by the enrichment cell as
# "centromere-<cen>-G4Hunter-threshold-<t>-.csv" (no "long-" prefix or suffix).
# .item() extracts the single value; calling float() on a 2-D array is deprecated in NumPy.
centromere_mean = pd.read_csv(
    "control/csv-results/centromere-{}-G4Hunter-threshold-{}-.csv".format(cen_len_index, threshold),
    header=None,
).values.item()

# One-sample t-test: do the control means differ from the centromere mean?
ttest_1samp(control_means, centromere_mean)