# Enrichment Score for specific k-mer 

Input: 
1. binsize 
2. specific k-mer
3. Fragment Annotation and chromosome size file

Output:
1. Counts for 100 randomly placed genome-wide k-mers (the permutation background)
2. Summary of the k-mer counts, with the observed/expected (O/E) enrichment score

In [None]:
import pandas as pd
import numpy as np
import datatable as dt
import random
import gc
import os,time
import math

In [None]:
# Loading
def LoadMatrixReader(filename, Chunksize, usecols,  sepstr="\t"):
    '''
    Open a delimited text file for chunk-wise reading with pandas.

    filename  : path (or file-like object) handed to pandas.read_table
    Chunksize : number of rows per chunk
    usecols   : columns to load
    sepstr    : field delimiter (tab by default)

    Returns the pandas reader object; iterating over it yields one
    DataFrame per chunk.
    '''
    read_options = {
        "sep": sepstr,
        "chunksize": Chunksize,
        "iterator": True,
        "index_col": None,
        "usecols": usecols,
    }
    return pd.read_table(filename, **read_options)


# Binsize
def Binsize(binsize, p_array):
    '''
    Snap positions to the start coordinate of their bin.

    binsize : bin width
    p_array : array of positions

    Each position is divided by binsize, the quotient is truncated to an
    integer, and the result is multiplied back by binsize.
    '''
    bin_index = np.divide(p_array, binsize).astype("int")
    return bin_index * binsize

# Remove the fragments with the same bins
def Deduplicate(df, binsize):
    '''
    Collapse fragments of one read that fall into the same genomic bin.

    Every fragment is assigned to the bin holding its midpoint,
    (start + end) / 2, snapped to a multiple of binsize by Binsize().
    One row is kept per (read_name, chrom, posbin) combination.

    df      : DataFrame with columns "read_name", "chrom", "start", "end".
              A "posbin" column is written into it in place.
    binsize : bin width, in the same units as start/end.

    Returns a new DataFrame with columns ["read_name", "chrom", "posbin"],
    sorted by those columns and carrying a fresh 0..n-1 index. Rows with a
    missing read_name or chrom are discarded.
    '''
    df.loc[:, "posbin"] = Binsize(binsize, (df["start"].values + df["end"].values)/2 )
    # When several fragments of a read share a bin, keep only one of them.
    # chrom has to be part of the key: bin coordinates restart on every
    # chromosome, so equal posbin values on two chromosomes are different
    # bins and must not be merged.
    keys = ["read_name", "chrom", "posbin"]
    Dedup_df = df.loc[:, keys].dropna(subset=["read_name", "chrom"]).drop_duplicates()
    Dedup_df = Dedup_df.sort_values(by=keys, ignore_index=True)
    return(Dedup_df)
 
def RandomKmers(inputBins, randombins, randomchroms, chromlens=None):
    '''
    Build randomly placed k-mers with the same bin spacing as the input k-mer.

    inputBins    : bin coordinates of the observed k-mer
    randombins   : start bins for the random k-mers
    randomchroms : chromosome of each entry in randombins (same order)
    chromlens    : optional mapping chromosome -> length; when omitted the
                   module-level Genenomelen table is used

    The offsets of inputBins relative to their smallest bin are added to each
    random start bin. A random k-mer is kept only if its largest bin does not
    exceed the chromosome length, so fewer k-mers than start bins may be
    returned.

    Returns {"chrom": [...], "Kmer": [[bin, bin, ...], ...]} with one entry
    per kept k-mer, and prints how many were kept.
    '''
    if chromlens is None:
        chromlens = Genenomelen
    startbin = min(inputBins)
    Kmerdist = [pb - startbin for pb in inputBins]  # offsets of the input k-mer bins
    span = max(Kmerdist)  # distance from the first to the last bin; same for every candidate

    randKmer = {"chrom": [], "Kmer": []}
    for chrom, posbin in zip(randomchroms, randombins):
        # skip candidates whose last bin would run past the chromosome end
        if posbin + span <= chromlens[chrom]:
            randKmer["chrom"].append(chrom)
            randKmer["Kmer"].append([posbin + kdist for kdist in Kmerdist])
    print("Export %d Random Kmers"%(len(randKmer["Kmer"])) )
    return(randKmer)

def FragmentFilter(df, ObsKmer):
    '''
    Keep only the fragments that lie in the randomly selected bins.

    df      : DataFrame with columns "read_name", "chrom", "posbin"
    ObsKmer : observed k-mer; only its length is used here

    Reads the module-level Allchrbins collection of "<chrom>_<posbin>" labels.
    Returns the surviving rows sorted by chrom, read_name, posbin with a
    fresh index.
    '''
    # Step 1: keep fragments whose "<chrom>_<posbin>" label is a selected bin
    bin_labels = df["chrom"].str.cat(df["posbin"].astype(str), sep="_")
    in_selected = bin_labels.isin(Allchrbins)
    df = df.loc[in_selected, :]
    # Step 2: keep reads that still carry at least len(ObsKmer) fragments
    frags_per_read = df.groupby("read_name")["chrom"].count()
    has_enough = frags_per_read >= len(ObsKmer)
    kept_reads = frags_per_read[has_enough].index
    df = df.loc[df.read_name.isin(kept_reads), :]
    df = df.sort_values(by=["chrom", "read_name", "posbin"], ignore_index=True)
    return(df)

def CountKmers(df, ObsKmer, kmer_counts=None):
    '''
    Count reads that match one of the random k-mers exactly.

    df          : DataFrame with columns "read_name", "chrom", "posbin"
    ObsKmer     : observed k-mer; only its length is used here
    kmer_counts : optional dict mapping "<chrom>_<bin1>_<bin2>_..." keys to
                  counts; when omitted the module-level RandomKmerCount
                  table is used

    For every (read_name, chrom) group holding exactly len(ObsKmer) rows, the
    sorted bins are joined into a key; if that key is present in the counter
    table its count is incremented. Keys are never added. The table is
    updated in place and nothing is returned.
    '''
    if kmer_counts is None:
        kmer_counts = RandomKmerCount
    Kmerbins = len(ObsKmer)
    for (read_name, chrom), gdf in df.groupby(["read_name", "chrom"]):
        posbins = gdf.posbin.to_list()
        if len(posbins) != Kmerbins:
            continue
        kmerkey = "%s_%s"%(chrom, "_".join([str(pb) for pb in sorted(posbins)]) )
        # Explicit membership test instead of a bare except: a group that is
        # not one of the random k-mers is skipped, while real errors (such as
        # a missing counter table) are no longer hidden.
        if kmerkey in kmer_counts:
            kmer_counts[kmerkey] += 1
    return

In [None]:
# Kmers: binning resolution and the observed k-mer under test
binsize = 25000                                   # bin width used by Binsize()
ObsKmerChrom = "chr8"                             # chromosome of the observed k-mer
ObsKmer = [132850000, 133200000, 133450000]       # bins of the observed k-mer
Obscount = 26                                     # observed count, numerator of O/E


# Random Select Kmers
## Table of candidate start bins across the genome, plus a chromosome-length lookup
chromlen_file = "/data1/ZJY/Pore-C/HiGlass/data/hg38.chromosomesize.csv"
chromlen_df = pd.read_csv(
    chromlen_file,
    sep="\t",
    header=None,
    index_col=None,
    names=["chrom", "len"],
)
Genenomeposbin = {"chrom": [], "posbin": []}
Genenomelen = {}
for n, rowvalue in chromlen_df.iterrows():
    chrom = rowvalue["chrom"]
    if chrom != "chrY":  # chrY contributes neither bins nor a length entry
        s_pbs = binsize  # carried over unchanged; nothing in the visible cells reads it
        # every bin start on the chromosome except the 10 bins at each end
        posbins = list(np.arange(10*binsize, rowvalue["len"] - 10*binsize, binsize))
        chroms = [chrom] * len(posbins)
        Genenomeposbin["chrom"].extend(chroms)
        Genenomeposbin["posbin"].extend(posbins)
        # chromosome length, looked up later by RandomKmers
        Genenomelen[chrom] = rowvalue["len"]
Genenomeposbin = pd.DataFrame(Genenomeposbin)

# Random select bins
SelectNum = 100  # number of start bins to draw

# Draw SelectNum distinct rows of the candidate table, then shift the observed
# k-mer onto each drawn start bin. RandomKmers drops k-mers that would run
# past the chromosome end, so randKmer can hold fewer than SelectNum entries.
rindex = random.sample(Genenomeposbin.index.to_list(), SelectNum)
randKmer = RandomKmers(
    inputBins=ObsKmer,
    randombins=Genenomeposbin.loc[rindex, "posbin"].to_list(),
    randomchroms=Genenomeposbin.loc[rindex, "chrom"].to_list(),
)

## Bin labels used for filtering, and a zero-initialised counter per random k-mer
Allchrbins = []
RandomKmerCount = {}
for chrom, posbins in zip(randKmer["chrom"], randKmer["Kmer"]):
    # one "<chrom>_<bin>" label for every bin of this k-mer
    chrbin = ["%s_%d"%(chrom, pb) for pb in posbins]
    Allchrbins += chrbin

    # "<chrom>_<bin1>_<bin2>_..." identifies the k-mer as a whole
    kmerkey = "%s_%s"%(chrom, "_".join(map(str, posbins)))
    RandomKmerCount[kmerkey] = 0

In [None]:
# Loading: stream the fragment table chunk by chunk and count k-mer matches
filename = "/data1/ZJY/Pore-C/Analysis/NA12878/Merge_NA12878/vdFAnnotation/Merge_Align_Fragment_RvdF_5Reps.csv"
Chunksize = 10_000_000  # rows per chunk

usecols = ["read_name", "chrom", "start", "end"]
df_reader = LoadMatrixReader(filename, Chunksize, usecols, sepstr=",")

# NOTE(review): chunks are cut by row count, and reads are grouped within a
# chunk only, so a read whose rows straddle a chunk boundary is handled as two
# partial reads. Confirm whether that matters for this input.
for df in df_reader:
    print("Loading %d records"%len(df) )
    df = FragmentFilter(Deduplicate(df, binsize), ObsKmer)
    if len(df) > 0:
        CountKmers(df, ObsKmer)

In [None]:
# Export: write one row per random k-mer with its read count
Count_df = pd.DataFrame({"Random_Kmers": list(RandomKmerCount.keys()),
                         "Count": list(RandomKmerCount.values())})
Exportdir = "/data1/ZJY/Pore-C/Analysis/Jupyter/Figures/Cluster/Fig6_7_0410/EnrichScore"
# Create the output directory (and any missing parents) directly instead of
# building a shell command; an already existing directory is not an error.
os.makedirs(Exportdir, exist_ok=True)
kmercount_file = "%s/%s_%s_permutated_kmers_count.csv"%(Exportdir, ObsKmerChrom, "_".join([str(pb) for pb in ObsKmer]) )
Count_df.to_csv(kmercount_file, header=True, index=False, sep="\t" )

In [None]:
## Calculate  OE
# Expected count = mean count over the random k-mers; O/E = observed / expected.
ExpertCount = Count_df["Count"].mean()
print(Count_df["Count"].describe() )
print("Expected K-mer counts:%.3f"%ExpertCount)
OE = Obscount/ExpertCount
print("O/E=%.3f"%OE)

# Summary file: distribution of the random counts, observed count, and O/E
summary_result = [
    str(Count_df["Count"].describe()),
    "Observed_K-mer_Count=%d"%Obscount,
    "O/E=%.3f"%OE,
]
summaryfile = "%s/%s_%s_enrichment_score.summary.txt"%(Exportdir, ObsKmerChrom, "_".join([str(pb) for pb in ObsKmer]) )
# the with-block closes the file, so no explicit close() is needed
with open(summaryfile, "w") as fileID:
    fileID.write( "\n".join(summary_result) )