In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv('49_KRAB-ZFP_peaks_200bp_TE.interval', sep='\t', header=None, names=['Chr', 'Start', 'End', 'Length', 'Summit', 'Tags', 'Enrichment', 'FDR', 'ZFP', 'TE'])
unique_ZFPs = set(df.ZFP.tolist())

for zfp in unique_ZFPs:
    
    df_zfp = df[df['ZFP'] == zfp]
    TE_count = df_zfp.TE.value_counts()
    ZFP_count = df_zfp.ZFP.value_counts()
    Zfp_number = ZFP_count[0]
    percent_TE = TE_count /Zfp_number * 100
    merged = pd.concat([TE_count, percent_TE], axis=1)
    merged.columns = ['TE_count', 'TE_percentage']
    merged = merged[merged['TE_count'] >= 3]
    merged = merged[merged['TE_percentage'] >= 2]
    print(zfp)
    print(merged, '\n')




Zfp617
        TE_count  TE_percentage
ORR1A2       261      30.243337
.            252      29.200463
ORR1A0        63       7.300116
ORR1A1        58       6.720742
ORR1A3        58       6.720742
ORR1B1        57       6.604867
ORR1A4        33       3.823870 

Zfp58
        TE_count  TE_percentage
.             73      49.324324
MTC           33      22.297297
MTA_Mm        11       7.432432
MTD            9       6.081081
MTB            3       2.027027 

Zfp938
        TE_count  TE_percentage
.            104      51.231527
RMER6C        22      10.837438
RMER6A        10       4.926108 

Zfp46
     TE_count  TE_percentage
.         332      91.712707
L2b         9       2.486188
L2c         8       2.209945 

Gm14399
           TE_count  TE_percentage
.               141      43.788820
IAPEz-int        86      26.708075
L1Md_A            7       2.173913
L1Md_F2           7       2.173913 

Zfp738
           TE_count  TE_percentage
.               138      78.409091
MERVL-int   

In [30]:

# set read numbers to calculate RPM
KAP1_ChIP = 61808121      
KAP1_Input = 115085987
H3K9me3_ChIP = 43340454
H3K9me3_Input = 46710977

# load data with KAP1 and H3K9me3 enrichment as dataframe
df = pd.read_csv('49_KRAB-ZFP_peaks_200bp_TE_KAP1_H3K9me3.interval', sep='\t', header=None, names=['Chr', 'Start', 'End', 'Enrichment', 'ZFP', 'TE', 'KAP1_ChIP', 'KAP1_Input', 'H3K9me3_ChIP', 'H3K9me3_Input'])

# generate set with unique ZFP names
unique_ZFPs = set(df.ZFP.tolist())     # 
print(unique_ZFPs)

# Load data with fisher bed values as dataframe
matrix = pd.read_csv('fisher_matrix.txt', sep='\t')

# calculate RPM values
df['KAP1_ChIP'] = df['KAP1_ChIP'] / KAP1_ChIP * 1000000
df['KAP1_Input'] = df['KAP1_Input'] / KAP1_Input * 1000000
df['H3K9me3_ChIP'] = df['H3K9me3_ChIP'] / H3K9me3_ChIP * 1000000
df['H3K9me3_Input'] = df['H3K9me3_Input'] / H3K9me3_Input * 1000000

# Iterate through unique ZFP set
for zfp in unique_ZFPs:
    
    df_zfp = df[df['ZFP'] == zfp]                    # Filter read count table for ZFP name
    matrix_zfp_sign = matrix[matrix[zfp] < 10e-5]    # Filter fisher bed table for significant overlapping TEs     
    TE_list = matrix_zfp_sign.Repeat.tolist()        # Make list of TEs that are significantly bound by ZFP
    TE_list.append('.')                              # Add '.' to list to include peaks that do not overlap with TEs
    print(TE_list)
    
    # Iterate through list of TEs that are significantly bound by ZFP
    for TE in TE_list:
        df_zfp_TE = df_zfp[df_zfp['TE'] == TE]                  # Filter read count table for TE name
        KAP1_ChIP_mean = df_zfp_TE['KAP1_ChIP'].mean()          # Calculate Mean RPM values
        KAP1_Input_mean = df_zfp_TE['KAP1_Input'].mean()
        H3K9me3_ChIP_mean = df_zfp_TE['H3K9me3_ChIP'].mean()
        H3K9me3_Input_mean = df_zfp_TE['H3K9me3_Input'].mean()
        
        KAP1_enrichment = KAP1_ChIP_mean / KAP1_Input_mean
        H3K9me3_enrichment = H3K9me3_ChIP_mean / H3K9me3_Input_mean
        print(zfp, 'binding sites in:',TE)
        print('KAP1_ChIP enrichment:', KAP1_enrichment)
        print('H3K9me3_ChIP enrichment:', H3K9me3_enrichment, '\n')
    
    


{'Zfp617', 'Zfp58', 'Zfp938', 'Zfp46', 'Gm14399', 'Zfp738', 'Zfp92', 'Zfp429', 'Zfp599', 'Gm13139', 'Zfp759', 'Gm4631', 'Gm14420', 'Gm13154', 'Zfp458', 'Zfp961', 'LOC102', 'Gm30910', 'AW146154', 'Gm14401', 'Zfp953', 'Zfp661', 'Zfp808', 'Zfp810', 'Rex2', 'Gm14295', 'Zfp493', 'Zfp595', 'Zfp882', 'Gm14419', 'Zfp534', 'Gm21082', 'Zfp935', 'Zfp600', 'Zfp712', 'Zfp931', 'Gm13051', 'Gm14443', 'Gm13152', 'Gm8893', 'Zfp273', 'Zfp874a', 'Gm13157', 'Zfp874b', 'Zfp456', 'Zfp457', 'Gm13251', 'Gm13150', 'Gm13225'}
['ORR1A0', 'ORR1A1', 'ORR1A2', 'ORR1A3', 'ORR1A4', 'ORR1B1', 'ORR1B2', 'ORR1C1', 'ORR1C2', '(CATTC)n', '.']
Zfp617 binding sites in: ORR1A0
KAP1_ChIP enrichment: 3.5275555334630164
H3K9me3_ChIP enrichment: 1.461135847041265 

Zfp617 binding sites in: ORR1A1
KAP1_ChIP enrichment: 2.7333821182526212
H3K9me3_ChIP enrichment: 1.0411097276116075 

Zfp617 binding sites in: ORR1A2
KAP1_ChIP enrichment: 2.780646676396391
H3K9me3_ChIP enrichment: 0.7075654832090206 

Zfp617 binding sites in: ORR1A3


['RLTR25A', 'RLTR25B', '.']
Zfp458 binding sites in: RLTR25A
KAP1_ChIP enrichment: 7.335104713779759
H3K9me3_ChIP enrichment: 2.9393686931928227 

Zfp458 binding sites in: RLTR25B
KAP1_ChIP enrichment: 3.9373290446803373
H3K9me3_ChIP enrichment: 1.7117500036828794 

Zfp458 binding sites in: .
KAP1_ChIP enrichment: nan
H3K9me3_ChIP enrichment: nan 

['ETnERV-int', 'ETnERV2-int', 'ETnERV3-int', 'MMETn-int', 'RLTR3_Mm', 'RLTR45-int', 'RLTR45', 'RLTR9E', 'RLTRETN_Mm', 'TSS', '.']
Zfp961 binding sites in: ETnERV-int
KAP1_ChIP enrichment: 8.059918000734731
H3K9me3_ChIP enrichment: 6.666369945696658 

Zfp961 binding sites in: ETnERV2-int
KAP1_ChIP enrichment: 7.710331756886551
H3K9me3_ChIP enrichment: 7.956810920983462 

Zfp961 binding sites in: ETnERV3-int
KAP1_ChIP enrichment: 9.834862780926715
H3K9me3_ChIP enrichment: 4.311074083349473 

Zfp961 binding sites in: MMETn-int
KAP1_ChIP enrichment: 9.880245275880466
H3K9me3_ChIP enrichment: 6.160067482528703 

Zfp961 binding sites in: RLTR3_Mm

Gm21082 binding sites in: GA-rich
KAP1_ChIP enrichment: nan
H3K9me3_ChIP enrichment: nan 

Gm21082 binding sites in: (CAGG)n
KAP1_ChIP enrichment: nan
H3K9me3_ChIP enrichment: nan 

Gm21082 binding sites in: (CCTG)n
KAP1_ChIP enrichment: nan
H3K9me3_ChIP enrichment: nan 

Gm21082 binding sites in: (TCTCTG)n
KAP1_ChIP enrichment: nan
H3K9me3_ChIP enrichment: nan 

Gm21082 binding sites in: .
KAP1_ChIP enrichment: 3.9506352172396997
H3K9me3_ChIP enrichment: 0.9477066831498583 

['RLTR10', '.']
Zfp935 binding sites in: RLTR10
KAP1_ChIP enrichment: 3.578508417455774
H3K9me3_ChIP enrichment: 5.963652481966771 

Zfp935 binding sites in: .
KAP1_ChIP enrichment: 2.126099909673315
H3K9me3_ChIP enrichment: 1.5088759291723162 

['.']
Zfp600 binding sites in: .
KAP1_ChIP enrichment: 1.2281198236001136
H3K9me3_ChIP enrichment: 4.926941809542256 

['MTE2a', 'MTE2b', 'MTEa', 'MTEb', 'TSS', '.']
Zfp712 binding sites in: MTE2a
KAP1_ChIP enrichment: 2.976462760611248
H3K9me3_ChIP enrichment: 0.750588791