In [9]:
import pandas as pd

In [10]:
def left_merge_intervals(bedfile, write=True, outfile='left_merge.bed'):
    """Collapse BED intervals that share the same chromosome and start.

    The first three tab-separated columns of ``bedfile`` (read without a
    header row) are taken as ``chr``, ``start`` and ``end``. Every group of
    rows with identical ``(chr, start)`` becomes one row whose ``end`` is the
    largest end in the group and whose ``count`` is the number of rows that
    were collapsed. Rows come out ordered by ``chr`` and then ``start``.

    Parameters
    ----------
    bedfile : str or path-like
        Tab-separated interval file with at least three columns.
    write : bool, default True
        If True, write the result to ``outfile`` and return None.
        If False, return the result and write nothing.
    outfile : str or path-like, default 'left_merge.bed'
        Destination used when ``write`` is True. Written tab-separated with
        no header and no index, columns ``chr, start, end, count``.

    Returns
    -------
    pandas.DataFrame or None
        The merged intervals when ``write`` is False, otherwise None.
    """
    # Only the first three columns are used. Restricting the read keeps files
    # that carry extra BED fields (name, score, strand, ...) from failing the
    # column assignment below.
    bed = pd.read_csv(bedfile, sep="\t", header=None, usecols=[0, 1, 2])
    bed.columns = ['chr', 'start', 'end']

    # groupby sorts by its keys, so no explicit sort is needed beforehand.
    # 'max' gives the merged end, 'size' the number of collapsed rows.
    merged = (
        bed.groupby(['chr', 'start'])['end']
        .agg(['max', 'size'])
        .rename(columns={'max': 'end', 'size': 'count'})
        .reset_index()
    )

    if write:
        merged.to_csv(outfile, sep="\t", header=False, index=False)
        return None
    return merged

In [11]:
# Collapse rows that share (chr, start) in the input BED file. With write=True
# the result is written to 'left_merge.bed' in the working directory and
# nothing is returned.
left_merge_intervals(
    bedfile="cigars_lindley_deletions_gte12_lte600.bed",
    write=True,
)

In [12]:
# NOTE(review): no cell visible here reads this global; both functions take
# `bedfile` as a parameter and the later call passes its own path. Presumably
# a leftover from interactive testing; confirm before removing.
bedfile="test.bed"


In [13]:

# This function merges consecutive intervals (in a sorted BED file) whose start coordinates and whose end coordinates are each close together (within some fraction of the first interval's length).
# For messy intervals this is dangerous, because merged intervals can keep growing until every chromosome/scaffold is a single interval.
# However, for relatively simple intervals it should hopefully work OK.
# Several passes are necessary to allow this sort of interval 'growth' to settle out.
def merge_overlapping_intervals(bedfile, fraction=0.1, passes=2):
    """Merge neighbouring intervals whose starts and ends nearly coincide.

    The input is first collapsed with ``left_merge_intervals`` (one row per
    ``(chr, start)``, ordered by ``chr`` then ``start``, with a ``count``
    column). Each pass then walks adjacent row pairs; the earlier row is
    folded into the later one when

    * both rows are on the same chromosome, and
    * the two starts differ by at most ``fraction`` times the earlier
      interval's length (``end - start + 1``), and
    * the two ends differ by at most that same tolerance.

    The surviving row takes the earlier start, the larger end and the summed
    count. Because the grown row is compared with its own successor next,
    merges can cascade within a pass.

    Parameters
    ----------
    bedfile : str or path-like
        Tab-separated interval file passed to ``left_merge_intervals``.
    fraction : float, default 0.1
        Tolerance as a fraction of the earlier interval's length.
    passes : int, default 2
        Maximum number of sweeps over the table.

    Returns
    -------
    None
        The result is written to ``'merge_overlapping_f<fraction>.bed'`` in
        the working directory (tab-separated, no header, no index).
    """
    bed = left_merge_intervals(bedfile, write=False)

    # Work on plain lists: per-element DataFrame access and a frame copy on
    # every drop made the original loop quadratic.
    chroms = bed['chr'].tolist()
    starts = bed['start'].tolist()
    ends = bed['end'].tolist()
    counts = bed['count'].tolist()

    for _ in range(passes):
        absorbed = [False] * len(starts)

        for i in range(len(starts) - 1):
            nxt = i + 1

            # Rows are ordered by chromosome, so the last interval of one
            # chromosome sits next to the first of the following one; those
            # must never be merged however close their coordinates are.
            if chroms[i] != chroms[nxt]:
                continue

            tolerance = (ends[i] - starts[i] + 1) * fraction
            if (abs(starts[i] - starts[nxt]) <= tolerance
                    and abs(ends[i] - ends[nxt]) <= tolerance):
                counts[nxt] += counts[i]
                starts[nxt] = starts[i]
                ends[nxt] = max(ends[i], ends[nxt])
                absorbed[i] = True

        if not any(absorbed):
            # Nothing changed, so any remaining pass would be a no-op.
            break

        survivors = [k for k, gone in enumerate(absorbed) if not gone]
        chroms = [chroms[k] for k in survivors]
        starts = [starts[k] for k in survivors]
        ends = [ends[k] for k in survivors]
        counts = [counts[k] for k in survivors]

    merged = pd.DataFrame(
        {'chr': chroms, 'start': starts, 'end': ends, 'count': counts}
    )
    merged.to_csv('merge_overlapping_f'+str(fraction) + '.bed', sep="\t", header=False, index=False)

In [15]:
# Run the fuzzy merge with the function's default settings spelled out
# (fraction=0.1, passes=2); the output goes to 'merge_overlapping_f0.1.bed'.
# NOTE(review): absolute, user-specific input path; consider a configurable
# data directory so the notebook runs on other machines.
merge_overlapping_intervals(
    bedfile='/home/vcth2/rostoch_cigars_deletions_gte12_lte60.all.bed',
    fraction=0.1,
    passes=2,
)