In [5]:
import pandas as pd

In [25]:
def generate_score(tf, idr_peaks, scores):
    '''
        Attach a ChIP-seq signal score to every IDR peak of one transcription factor.

        A score row is matched to a peak when it lies on the same chromosome and
        its interval is fully contained in the peak interval
        (score.start >= peak.start and score.end <= peak.end).

        @Parameters:
        tf: String, name of transcription factor, like "col2"
        idr_peaks: dataframe, the 1-3 fields of IDR output, containing chr, start, end.
        scores: dataframe, the 1-3,7 fields of narrowPeak file, containing chr, start, end, score.

        @Returns:
        A new dataframe with columns TF, chr, start, end, score (float) that keeps
        the row index of idr_peaks. The score is:
          - the matching score when exactly one score row matches,
          - the mean of the matching scores rounded to 5 decimals when several match,
          - NaN when nothing matches.
        The input dataframes are not modified.
    '''
    # Build the result on a copy so the caller's frame does not gain extra columns.
    result = idr_peaks[['chr', 'start', 'end']].copy()
    result.insert(0, 'TF', tf)

    # Split the score table by chromosome once instead of rescanning the
    # whole table for every peak.
    scores_by_chr = {chrom: group for chrom, group in scores.groupby('chr', sort=False)}

    peak_scores = []
    for chrom, start, end in zip(result['chr'], result['start'], result['end']):
        candidates = scores_by_chr.get(chrom)
        if candidates is None:
            # No score rows at all on this chromosome.
            peak_scores.append(float('nan'))
            continue

        contained = candidates.loc[
            (candidates['start'] >= start) & (candidates['end'] <= end),
            'score'
        ]

        if len(contained) == 0:
            # No matching rows
            peak_scores.append(float('nan'))
        elif len(contained) == 1:
            # Only one matching row, take the score directly
            peak_scores.append(float(contained.iloc[0]))
        else:
            # Multiple matching rows, take the average score. Keep it numeric
            # (rounded) rather than a formatted string so the column stays float.
            peak_scores.append(round(float(contained.mean()), 5))

    # Assign the whole column at once; an explicit dtype keeps the column
    # float even when there are no peaks.
    result['score'] = pd.Series(peak_scores, index=result.index, dtype='float64')

    return result

In [26]:
# Numeric ids of the transcription factors to process; id N reads its
# peaks from 'colN.peak' and its scores from 'colN.score'.
tf_ids = [2, 7, 8, 13, 18]

# Collect one scored dataframe per transcription factor and concatenate once
# at the end instead of growing a dataframe inside the loop.
per_tf_scores = []

for i in tf_ids:
    tf_name = 'col' + str(i)
    idr_name = tf_name + '.peak'
    score_name = tf_name + '.score'
    # Load intersected peaks
    idr_peaks = pd.read_csv(idr_name, sep='\t', header=None)
    idr_peaks.columns = ['chr', 'start', 'end']
    # Load peak scores for this transcription factor (previously 'col2.score'
    # was read for every factor, leaving score_name unused)
    scores = pd.read_csv(score_name, sep='\t', header=None)
    scores.columns = ['chr', 'start', 'end', 'score']
    # Calculate peak scores
    per_tf_scores.append(generate_score(tf_name, idr_peaks, scores))

# Create a dataframe to save peaks and scores
peak_scores = pd.concat(per_tf_scores, axis=0, ignore_index=True)

# Output file (written without header row or index column)
peak_scores.to_csv("gene_score.csv", header=False, index=False)