In [None]:
import os
import numpy as np
import pandas as pd
import glob

In [None]:
from Bio import SeqIO

# Map every record ID in the FASTA file to the length of its sequence
sequence_lengths = {
    record.id: len(record)
    for record in SeqIO.parse('File.S8.Chang_et_al.fasta', "fasta")
}

# Spot-check the length recorded for a single contig
sequence_lengths['3R_5']

In [None]:
# Annotation table used in this analysis; it is parsed as tab-separated text
gff_file = './File.S9.Chang_et_al.gff.txt'
df_tes = pd.read_csv(gff_file, sep='\t')

In [None]:
# Work on a copy so the raw annotation table stays untouched
df = df_tes.copy()

# simple_name: the second ':'-separated field of the first double-quoted
# string found in the "name" column
df['simple_name'] = [raw.split('"')[1].split(':')[1] for raw in df['name']]
# len: annotated span of each row, computed as end - start
df['len'] = [int(stop) - int(begin) for begin, stop in zip(df['start'], df['end'])]

df

In [None]:
# Score added to every base covered by a hit, keyed by its "likeliness" stars.
LIKELINESS_SCORES = {'*': 1, '**': 2, '***': 3}
# Score used when a result file has no "likeliness" column or the value is
# not one of the keys above.
DEFAULT_LIKELINESS_SCORE = 2
# Annotations spanning fewer than this many bases (end - start) are skipped.
MIN_TE_LENGTH = 100


def _load_gquad_tables(contig, cache):
    """Return the usable result tables for ``contig``, reading each CSV only once.

    Parameters
    ----------
    contig : str
        Contig name; every file matching ``R_gquad_results/*/*<contig>.csv``
        is loaded.
    cache : dict
        Maps contig -> list of DataFrames. Filled on first use so the same
        files are not re-read for every annotation on the contig.

    Returns
    -------
    list of pandas.DataFrame
        One table per result file. Files whose ``sequence_position`` column
        contains a ``'-'`` entry are left out.
    """
    if contig not in cache:
        tables = []
        # glob.escape keeps glob metacharacters in a contig name literal.
        # NOTE(review): the leading '*' also matches files whose name merely
        # ends with this contig name - confirm against the file naming scheme.
        pattern = 'R_gquad_results/*/*{}.csv'.format(glob.escape(str(contig)))
        for fn in glob.glob(pattern):
            hits = pd.read_csv(fn)
            if (hits['sequence_position'] == '-').any():
                continue
            tables.append(hits)
        cache[contig] = tables
    return cache[contig]


def _score_track(hits, start, end):
    """Build the per-base score track of one result table over ``start``..``end``.

    Parameters
    ----------
    hits : pandas.DataFrame
        Needs ``sequence_position`` and ``sequence_length`` columns;
        ``likeliness`` is optional.
    start, end : number
        Bounds of the annotation on the contig. Hits whose
        ``sequence_position`` lies within ``[start, end]`` are used.

    Returns
    -------
    numpy.ndarray
        Float array of length ``int(end - start)``.
    """
    track = np.zeros((int(end - start),))
    in_range = (hits['sequence_position'] >= start) & (hits['sequence_position'] <= end)
    inside = hits[in_range]

    if 'likeliness' in inside.columns:
        scores = inside['likeliness'].map(LIKELINESS_SCORES).fillna(DEFAULT_LIKELINESS_SCORE)
    else:
        scores = [DEFAULT_LIKELINESS_SCORE] * len(inside)

    for s_pos, s_len, s_score in zip(inside['sequence_position'], inside['sequence_length'], scores):
        offset = int(s_pos - start)  # Puts it in the range of track
        # Slices running past the end of the track are truncated by numpy.
        # NOTE(review): the + 1 covers sequence_length + 1 bases - confirm
        # this matches the coordinate convention of the result files.
        track[offset: offset + int(s_len) + 1] += s_score
    return track


master_dict_tes = {}
master_dict_tes_means = {}
_gquad_cache = {}

for raw_name, contig, start, end in zip(df_tes['name'], df_tes['Contig'], df_tes['start'], df_tes['end']):
    name = raw_name.split('"')[1].split(':')[1]

    # Filter so that the sequence is at least MIN_TE_LENGTH long
    if (end - start) < MIN_TE_LENGTH:
        continue

    # This is to remove the satellite repeats
    if ')n' in name:
        continue

    # A fresh track sized for THIS annotation; reusing an array sized for an
    # earlier copy of the same TE would truncate hits or dilute the mean.
    te_track = np.zeros((int(end - start),))
    for hits in _load_gquad_tables(contig, _gquad_cache):
        # NOTE(review): scores from all result files of a contig are summed,
        # which is what the previous np.maximum-based code effectively did -
        # confirm a per-base maximum across files was not intended.
        te_track += _score_track(hits, start, end)

    # Record the mean for this copy. A copy without any usable result file
    # contributes 0.0 rather than raising KeyError.
    master_dict_tes_means.setdefault(name, []).append(te_track.mean())
    # Kept for compatibility with later cells: a zeroed array sized like the
    # most recently processed copy of this TE.
    master_dict_tes[name] = np.zeros((int(end - start),))

In [None]:
# One (TE name, mean of its per-copy means) tuple per entry of master_dict_tes_means
master_te_total_averages = [
    (te_name, np.mean(copy_means))
    for te_name, copy_means in master_dict_tes_means.items()
]


In [None]:
def _average_of(entry):
    """Return the average stored in the second slot of a (name, average) tuple."""
    return entry[1]


# Rank the entries from the highest average to the lowest
master_te_total_averages.sort(key=_average_of, reverse=True)

The final DataFrame: one row per TE name with its average score, sorted from highest to lowest.

In [None]:
# Build the ranked table from the (name, average) tuples, save it as CSV,
# then display it as the cell output
total_te_df = pd.DataFrame(data=master_te_total_averages)
total_te_df.to_csv(path_or_buf='tes_averages_ranked.csv')
total_te_df