In [8]:
import pandas as pd
import json

In [9]:
#Sticky end ligation frequency matrix from Potapov et al 2018 
# https://pubmed.ncbi.nlm.nih.gov/30335370/
sticky_end_file = "Sticky_end_ligation_matrix.xlsx"
sticky_end_output_file = "Ligation_fidelity_dictionary.json"

sticky_end_df = pd.read_excel(sticky_end_file, index_col="Overhang")
sticky_end_df.head()

Unnamed: 0_level_0,AAAA,AAAC,AAAG,AAAT,AACA,AACC,AACG,AACT,AAGA,AAGC,...,TTCG,TTCT,TTGA,TTGC,TTGG,TTGT,TTTA,TTTC,TTTG,TTTT
Overhang,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TTTT,3804,4,96,57,0,0,1,1,5,0,...,0,0,0,0,0,0,1,0,0,0
GTTT,2,5322,3,565,1,44,2,6,0,27,...,0,0,0,0,0,0,0,2,0,0
CTTT,3,2,4742,3,1,0,59,0,0,0,...,0,0,0,0,0,0,0,0,1,0
ATTT,1,3,2,5152,0,0,0,5,0,1,...,0,0,0,0,0,0,0,0,0,0
TGTT,0,0,0,0,5482,11,768,322,5,1,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Create an empty dictionary to store the results
hf_low = {}
hf_medium = {}
hf_high = {}

low = 0.8   #determines the threshold to filter misligations, low means will only check those known misligations > 20% of the time
med = 0.9   #medium means will check known misligations that happen >10% of the time
high = 0.99 #high means will check misligations that happen >1% of the time

# Iterate through columns and find indices where values are greater than 10% of the maximum
for column in sticky_end_df.columns[1:]:  # Exclude the 'Overhang' column
    max_value = sticky_end_df[column].max()
    low_threshold = (1 - low) * max_value
    medium_threshold = (1 - med) * max_value
    high_threshold = (1 - high) * max_value
    max_threshold = 0.9 * max_value
    
    low_indices = sticky_end_df[(sticky_end_df[column] > low_threshold) & (sticky_end_df[column] < max_threshold)].index.tolist()
    medium_indices = sticky_end_df[(sticky_end_df[column] > medium_threshold) & (sticky_end_df[column] < max_threshold)].index.tolist()
    high_indices = sticky_end_df[(sticky_end_df[column] > high_threshold) & (sticky_end_df[column] < max_threshold)].index.tolist()

    hf_low[column] = low_indices
    hf_medium[column] = medium_indices
    hf_high[column] = high_indices

hf_high

{'AAAC': ['GTTG'],
 'AAAG': ['TTTT', 'CTTG', 'CTTA'],
 'AAAT': ['TTTT', 'GTTT'],
 'AACA': [],
 'AACC': ['GGTG'],
 'AACG': ['CTTT', 'TGTT', 'CGTG', 'CGTC', 'CGTA'],
 'AACT': ['TGTT', 'GGTT'],
 'AAGA': [],
 'AAGC': ['GGTT', 'GCTG'],
 'AAGG': ['CGTT', 'TCTT', 'CCTG'],
 'AAGT': ['GCTT'],
 'AATA': [],
 'AATC': ['GGTT'],
 'AATG': ['TATT', 'CATG', 'CATA'],
 'AATT': ['TATT', 'GATT'],
 'ACAA': [],
 'ACAC': ['GGGT', 'GTGG', 'GTGC', 'GTGA'],
 'ACAG': ['TTGT', 'CTGG', 'CTGC'],
 'ACAT': ['GTGT', 'ATGG', 'ATGC'],
 'ACCA': [],
 'ACCC': ['GGGG', 'GGGC'],
 'ACCG': ['TGGT', 'CGGG', 'CGGC'],
 'ACCT': ['TGGT', 'GGGT', 'AGGG'],
 'ACGA': [],
 'ACGC': ['GTGT', 'GGGT', 'GCGG', 'GCGC'],
 'ACGG': ['CTGT', 'TCGT', 'ACGT', 'CCGG', 'CCGC', 'CCTA', 'CCGA'],
 'ACGT': ['AGGT', 'TCGT', 'GCGT', 'ACGG'],
 'ACTA': ['TGGT'],
 'ACTC': ['GTGT', 'GGGT', 'GCGT', 'GAGG', 'GAGC'],
 'ACTG': ['CTGT', 'CGGT', 'TAGT', 'CAGG', 'CAGC'],
 'ACTT': ['AGGT', 'TAGT', 'GAGT'],
 'AGAA': [],
 'AGAC': ['GTTT', 'GTGT', 'GGCT', 'GTAT', 'GTCG', 

In [11]:
combined_dict = {"high": hf_high, "medium": hf_medium, "low": hf_low}

# Write the combined dictionary to the file in JSON format
with open(sticky_end_output_file, "w") as json_file:
    json.dump(combined_dict, json_file)

In [13]:
df = pd.read_json(sticky_end_output_file)
df["high"]

AAAC                [GTTG]
AAAG    [TTTT, CTTG, CTTA]
AAAT          [TTTT, GTTT]
AACA                    []
AACC                [GGTG]
               ...        
TTGT          [ACAG, GCAA]
TTTA                [TAAG]
TTTC          [GAAT, GAAG]
TTTG          [CAAT, CAAG]
TTTT          [AAAT, AAAG]
Name: high, Length: 255, dtype: object