In [1]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np
import glob, os
import itertools

In [2]:
ccp = {'a_alpina' : 60.02,
    'a_thaliana' : 60.02, 
 'b_gravinae': 32.24,
 'b_oleracea' : 50.52,
 'b_tournefortii': 47.46,
 'c_annua' : 55.33,
 'c_gynandra' : 4.28,
 'd_erucoides': 33.04,
 'd_tenuifolia' : 12.68,
 'd_viminea' : 51.08,
 'h_incana1' : 50.50,
 'h_incana3' : 38.37,
 'm_arvensis' : 24.42,
 'm_moricandioides' : 51.59,
 'm_nitens' : 21.04,
 'm_suffruticosa' : 24.87}

In [46]:
#make a dict with every csv in folder
os.chdir("<output_files_from_03_annotations/>")
files = {}
for file in glob.glob("*_save.csv"):
    raw = pd.read_csv(file)
    csv = raw[raw['location'].isin(['upstream'])] #delete all rows containing TIR TSD and repeat info
    files['_'.join(file.split('_')[:2])] = csv

In [47]:
ogs = []
for i in files.keys():
    ogs.append(list(files[i]['orthogroups']))
ogs = list(set(list(itertools.chain.from_iterable(ogs))))

In [48]:
#loop over species and OGS and write, how many TEs are present in a certain OG

df_ogs = pd.DataFrame()
df_ogs['orthogroup'] = ogs
for f in files.keys():
    hits = []
    for i in ogs:
        hits.append(len(files[f][files[f]['orthogroups'] == i]))
    df_ogs[f] = hits

In [52]:
df1_transposed = df_ogs.T 
df1_transposed.columns = df1_transposed.iloc[0]
df1_transposed = df1_transposed.drop(df1_transposed.index[0])
ccp_list = []

for i in list(df1_transposed.index):
    ccp_list.append(ccp[i])
df1_transposed['CCP'] = ccp_list

c34 = []
for i in ccp_list:
    if i <= 40:
        c34.append(1)
    else:
        c34.append(0)
df1_transposed['C34'] = c34

In [54]:
df1_transposed

orthogroup,OG0032136,OG0014429,OG0003817,OG0001837,OG0017459,OG0002369,OG0018644,OG0008954,OG0003077,OG0015745,...,OG0007151,OG0011815,OG0003186,OG0024212,OG0067887,OG0048571,OG0007369,OG0013283,CCP,C34
d_erucoides,1,0,0,1,0,0,1,1,0,0,...,0,1,0,0,0,0,1,0,33.04,1
d_viminea,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,1,51.08,0
b_gravinae,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,32.24,1
m_moricandioides,0,0,1,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,51.59,0
m_arvensis,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,24.42,1
d_tenuifolia,0,0,2,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,12.68,1
h_incana1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,50.5,0
m_nitens,0,0,1,0,1,1,0,0,1,0,...,1,0,0,0,0,0,0,1,21.04,1
h_incana3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,1,38.37,1
a_thaliana,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,60.02,0


### perform one-way-ANOVA: presence of TE versus CCP:

In [None]:
p_values = {}
rsq_values = {}
for i in list(df1_transposed.columns)[:-1]:
    model = ols('CCP ~' + i, data = df1_transposed).fit()
    anova_table = sm.stats.anova_lm(model, typ = 1)
    d, s, d, f, p = anova_table.loc[i]
    p_values[i] = p
    rsq_values[i] = model.rsquared
    


In [None]:
df_p = pd.DataFrame()
df_p['OG'] = p_values.keys()
df_p['p'] = p_values.values()

df_p = df_p.sort_values(by = ['p'])
df_p[df_p['p']<=0.035]
