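The aim of this notebook is to select the conditions (mutation categories) whose data will be used for the fits.
I will separate the exons into the two main groups, LS and HS; the CS exons that become regulatable after a removal are selected separately further down.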
    

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.colors as mcolors
import re
import seaborn as sns
# Input workbooks: file1 holds the exon tier annotation (sheet "TIER"),
# file2 is the workbook read into df_alldata_ in the next cell.
file1 = "./2024_01_06_data/Sharing_ES_PSI_Rosa.xlsx"
file2 = "./2024_02_23_data/ROSA_FULL_T1234_Feb_22_2024.xlsx"
# Lower-case the headers so the columns can be addressed as "event" / "final_tier".
exon_types = pd.read_excel(file1, sheet_name="TIER").rename(columns=str.lower)
exon_types

Unnamed: 0,event,final_tier
0,HsaEX1045708,CR
1,HsaEX1011407,CR
2,HsaEX0043902,CR
3,HsaEX7001225,CS
4,HsaEX6063998,CS
5,HsaEX6041122,CS
6,HsaEX6093676,CS
7,HsaEX6024969,CS
8,HsaEX6040016,CS
9,HsaEX6077184,CS


In [2]:
# Read the first sheet of the full data workbook (sheet 0 is the read_excel default).
df_alldata_ = pd.read_excel(file2, sheet_name=0)


In [3]:
# Inspect the LME column (values such as T1 ... T4).
df_alldata_.loc[:, "LME"]

0        T1
1        T1
2        T1
3        T1
4        T1
         ..
23610    T4
23611    T4
23612    T4
23613    T4
23614    T4
Name: LME, Length: 23615, dtype: object

In [4]:
# Number of U1cons variants with LME == "T1" (labelled U1cons_a further down).
# Series.sum() counts the True values without a Python-level loop over every row.
((df_alldata_["LME"] == "T1") & (df_alldata_["Cat"] == "U1cons")).sum()

41

In [5]:
# Number of U1cons variants with LME == "T4" (labelled U1cons_b further down).
# Series.sum() counts the True values without a Python-level loop over every row.
((df_alldata_["LME"] == "T4") & (df_alldata_["Cat"] == "U1cons")).sum()

46

In [6]:
# Split every VARIANT name into three parts: the text before the exon id, the
# exon id itself ("HsaEX" followed by digits) and everything after it.
exon_pat = re.compile(r"(.*?)(HsaEX[0-9]*)(.*)")
# str.extract returns exactly one column per capture group, so there is no need
# to rely on str.split producing five pieces and dropping the empty edge columns.
df_alldata_[["gene_or_species", "event", "mutation"]] = (
    df_alldata_["VARIANT"].str.extract(exon_pat.pattern, expand=True)
)
# The prefix keeps the separator that preceded the exon id; remove it.
df_alldata_["gene_or_species"] = df_alldata_["gene_or_species"].str.strip("-")

In [7]:
# Double-check that an exon id was extracted for every row.
# na=False makes a missing event count as a failure instead of propagating NaN
# into the reduction, so the cell shows a plain True/False.
df_alldata_["event"].str.startswith("HsaEX", na=False).all()

True

In [8]:
# NOTE: this is a second name for the same DataFrame object, not a copy;
# columns added to df_alldata later are therefore also visible on df_alldata_.
df_alldata=df_alldata_

In [9]:
# Categories ESE_ESS_3 / ESE_ESS_5 contain splicing silencers and splicing
# enhancers; which element and which position is encoded in the mutation column,
# so for these rows the mutation string itself becomes the subcategory.
all_subcats = df_alldata["Cat"].copy()
for cat in ["ESE_ESS_5", "ESE_ESS_3"]:
    idxs = np.where(df_alldata["Cat"] == cat)[0]
    cat_mutations = df_alldata["mutation"].values[idxs]
    # np.where yields row positions, so write through .iloc: plain [] on a Series
    # with an integer index is label-based and only matches for a default RangeIndex.
    all_subcats.iloc[idxs] = cat_mutations
    print(np.unique(cat_mutations))
    

['-ESE2-5' '-ESE3-5' '-ESS2-5' '-ESS3-5' '-rem-15-ESE2-5' '-rem-15-ESE3-5'
 '-rem-15-ESS2-5' '-rem-15-ESS3-5' '-rem-27-ESE2-5' '-rem-27-ESE3-5'
 '-rem-27-ESS2-5' '-rem-27-ESS3-5' '-rem-9-ESE2-5' '-rem-9-ESE3-5'
 '-rem-9-ESS2-5' '-rem-9-ESS3-5']
['-ESE2-3' '-ESE3-3' '-ESS2-3' '-ESS3-3' '-rem-15-ESE2-3' '-rem-15-ESE3-3'
 '-rem-15-ESS2-3' '-rem-15-ESS3-3' '-rem-27-ESE2-3' '-rem-27-ESE3-3'
 '-rem-27-ESS2-3' '-rem-27-ESS3-3' '-rem-9-ESE2-3' '-rem-9-ESE3-3'
 '-rem-9-ESS2-3' '-rem-9-ESS3-3']


In [10]:
# Other categories also merge mutations that are expected to increase with
# others that are expected to decrease, so they have to be split into elements.
# (?:...) is a non-capturing group: only the whole element name is captured.
pat1 = re.compile(r"(Py-(?:weak[123]|stg[123])|BP_(?:weak[123]|stg[123])|U1cons|U1weak)")

# Quick check of the pattern on an example mutation string.
mystring = "BP_weak2_AA_Py-stg3_AA__U1cons"
[hit.group(1) for hit in pat1.finditer(mystring)]

['BP_weak2', 'Py-stg3', 'U1cons']

In [11]:
# The pattern pat1 was double-checked beforehand to be suitable for splitting
# the merged categories listed here into their individual elements.
cats_to_split = ["Py_Stg_Weak", "BP_Stg_Weak", "Comb_BP_Py", "Comb_Py_5ss", "Comb_BP_5ss", "Comb_BP_Py_5ss"]

# Notes on the mutation names seen in these categories:
# Py_Stg_Weak:
#   Py-weak[123]
#   Py-stg[123]
#   rem-[0-9]+ in combination with the previous two
# BP_Stg_Weak:
#   BP_weak[23]
#   BP_stg[123]
#   rem-[0-9]+ in combination with the previous two
# Comb_BP_Py: combinations of the individual mutations in BP_Stg_Weak and
#   Py_Stg_Weak, e.g. rem-15_Py-weak3_TT...BP_weak3
#   --> pick those that have both strong, both weak, one of each, with or without removing
# Comb_BP_Py_5ss: combination of Py-weak/stg, BP_weak/stg, U1weak or U1cons, with or without rem

for cat in cats_to_split:
    idxs_cat = np.where(df_alldata["Cat"] == cat)[0]
    mutations = df_alldata["mutation"].values[idxs_cat]
    print("=======cat", cat)
    # Order the elements by first letter so the label always reads BP, then Py,
    # then U1. sorted() is stable, so elements sharing a first letter keep the
    # order in which they appear in the mutation string.
    subcats = [
        ";".join(sorted(pat1.findall(mutation), key=lambda element: element[0]))
        for mutation in mutations
    ]
    # idxs_cat holds row positions (np.where), hence .iloc rather than label-based [].
    all_subcats.iloc[idxs_cat] = subcats

# U1cons is split by the LME column: T1 rows become U1cons_a, T4 rows U1cons_b.
idxs_U1consa = np.where((df_alldata["Cat"] == "U1cons") & (df_alldata["LME"] == "T1"))[0]
idxs_U1consb = np.where((df_alldata["Cat"] == "U1cons") & (df_alldata["LME"] == "T4"))[0]
all_subcats.iloc[idxs_U1consa] = "U1cons_a"
all_subcats.iloc[idxs_U1consb] = "U1cons_b"
df_alldata["Subcat"] = all_subcats
    



In [13]:
# Inspect the derived subcategory of every row.
df_alldata.loc[:, "Subcat"]

0                   ADD_in_EXON
1                   ADD_in_EXON
2                   ADD_in_EXON
3                   ADD_in_EXON
4                   ADD_in_EXON
                  ...          
23610                  TGC_Walk
23611    Comb_BP_Py_ESE_ESS_5ss
23612                    Evo_WT
23613              Deep_mut_5ss
23614              Deep_mut_5ss
Name: Subcat, Length: 23615, dtype: object

In [14]:
# Variant names side by side with the subcategory assigned to them.
df_alldata.loc[:, ["VARIANT", "Subcat"]]

Unnamed: 0,VARIANT,Subcat
0,ABI1-HsaEX0000641-add_0-12,ADD_in_EXON
1,ABI1-HsaEX0000641-add_0-15,ADD_in_EXON
2,ABI1-HsaEX0000641-add_0-18,ADD_in_EXON
3,ABI1-HsaEX0000641-add_0-21,ADD_in_EXON
4,ABI1-HsaEX0000641-add_0-24,ADD_in_EXON
...,...,...
23610,COL4A2-HsaEX6041122-TGC13,TGC_Walk
23611,COL4A2-HsaEX6041122_Py-stg3_CTCTGAT_tactaac_BP...,Comb_BP_Py_ESE_ESS_5ss
23612,MEF2A-HsaEX0038690-allMis1,Evo_WT
23613,SNAP25-HsaEX6077184-5ssA-2t,Deep_mut_5ss


In [15]:
# Distinct original categories, in order of first appearance.
pd.unique(df_alldata["Cat"])

array(['ADD_in_EXON', 'addTCTC', 'AdML', 'AGHtoAGG', 'AGHtoGAC', 'BPmut',
       'BPshift', 'BPshift_TGC', 'TGCadd', 'IgM', 'REMinEXON', 'TGCrem',
       'TGCoptimzed', 'U1cons', 'U6cons', 'WT', 'mutTCTC', 'full',
       'full-int-hsa', 'full-ss5', 'Evo_WT', 'TGCmut', 'int-hsa',
       'int-HSAEX', 'intss5-hsa', 'intss5-HsaEX', 'ss5-hsa',
       '3_nts_mut_slide_1_nt', 'Deep_mut_5ss', 'GTtoGC5ss', 'Comb_BP_5ss',
       'U1weak', 'Comb_BP_Py_5ss', 'Comb_Py_5ss', 'Py_Stg_Weak',
       'Comb_BP_Py', 'Comb_BP_Py_ESE_ESS_5ss', 'Swap5ssHsatoHsa', 'SNP',
       'Deep_mut_3ss', 'TGC_Walk', 'BP_Stg_Weak', 'EvoShufflingMICHsa',
       'ESE_ESS_3', 'ESE_ESS_5'], dtype=object)

In [16]:
# Distinct subcategories after splitting, in order of first appearance.
pd.unique(df_alldata["Subcat"])

array(['ADD_in_EXON', 'addTCTC', 'AdML', 'AGHtoAGG', 'AGHtoGAC', 'BPmut',
       'BPshift', 'BPshift_TGC', 'TGCadd', 'IgM', 'REMinEXON', 'TGCrem',
       'TGCoptimzed', 'U1cons_a', 'U6cons', 'WT', 'mutTCTC', 'full',
       'full-int-hsa', 'full-ss5', 'Evo_WT', 'TGCmut', 'int-hsa',
       'int-HSAEX', 'intss5-hsa', 'intss5-HsaEX', 'ss5-hsa',
       '3_nts_mut_slide_1_nt', 'Deep_mut_5ss', 'GTtoGC5ss',
       'BP_weak3;U1weak', 'U1weak', 'BP_weak3;Py-weak1;U1weak',
       'Py-weak1;U1weak', 'BP_weak3;Py-weak2;U1weak', 'Py-weak2;U1weak',
       'Py-weak3', 'BP_weak3;Py-weak3', 'BP_weak3;Py-weak3;U1weak',
       'Comb_BP_Py_ESE_ESS_5ss', 'Py-weak3;U1weak', 'Swap5ssHsatoHsa',
       'BP_stg1;Py-stg3;U1cons', 'SNP', 'Deep_mut_3ss', 'TGC_Walk',
       'BP_weak3;Py-weak2', 'BP_weak3', 'Py-weak2', 'EvoShufflingMICHsa',
       'BP_weak3;Py-weak1', '-ESS2-3', 'Py-weak1', '-ESE2-5', '-ESE3-5',
       '-ESS2-5', '-ESS3-5', 'BP_weak2', '-ESE3-3', '-ESS3-3', 'Py-stg1',
       'BP_stg1;Py-stg1', 'BP_st

In [17]:
# The categories for the LS and HS mutants that we want to fit for the paper are:

# On 28/06 I had forgotten Py-stg3; added as of July 1st.
# U1cons split into U1cons_a and U1cons_b as of July 4th.
categories_fit_HSLS = ['WT','U1cons_a','U1cons_b','U6cons','GTtoGC5ss', 'BP_stg1', 'Py-stg3','BP_weak3', 'Py-weak3','BP_stg1;Py-stg3','BP_weak3;Py-weak3','BP_stg1;Py-stg3;U1cons','-ESS2-3','-ESE2-5']

# Report every requested category that does not occur in the data. The set is
# built once so the whole column is not scanned again for each category.
available_subcats = set(df_alldata["Subcat"])
for category in categories_fit_HSLS:
    if category not in available_subcats:
        print(category, "not found")


In [18]:
# Exons whose final tier is LS or HS.
events_HSLS = exon_types[exon_types["final_tier"].isin(["LS", "HS"])]

# Keep only the rows of those exons that belong to one of the categories to fit.
keep_HSLS = df_alldata[
    df_alldata["Subcat"].isin(categories_fit_HSLS)
    & df_alldata["event"].isin(events_HSLS["event"].values)
]
print(len(keep_HSLS))
# Attach the tier of each exon. validate="many_to_one" makes pandas raise if an
# event is listed more than once in events_HSLS, instead of silently duplicating
# rows; the two printed lengths must therefore be equal.
keep_HSLS = pd.merge(keep_HSLS, events_HSLS, on="event", how="left", validate="many_to_one")
print(len(keep_HSLS))
keep_HSLS = keep_HSLS[["VARIANT","gene_or_species","event","mutation","Subcat","GFP","LOW","MID","HIGH","final_tier"]]
keep_HSLS.head()

556
556


Unnamed: 0,VARIANT,gene_or_species,event,mutation,Subcat,GFP,LOW,MID,HIGH,final_tier
0,ABI1-HsaEX0000641-U1cons,ABI1,HsaEX0000641,-U1cons,U1cons_a,9.4,8.64,43.48,95.09,HS
1,ABI1-HsaEX0000641-U6cons,ABI1,HsaEX0000641,-U6cons,U6cons,16.24,16.11,45.8,97.37,HS
2,ABI1-HsaEX0000641-WT,ABI1,HsaEX0000641,-WT,WT,2.7,5.39,32.39,92.3,HS
3,ACVR2A-HsaEX0002213-U1cons,ACVR2A,HsaEX0002213,-U1cons,U1cons_a,3.79,4.99,38.16,92.12,LS
4,ACVR2A-HsaEX0002213-U6cons,ACVR2A,HsaEX0002213,-U6cons,U6cons,5.13,7.89,40.64,95.18,LS


In [19]:
# The CS that become regulatable upon reduction are:
CS_variants = ["ARHGAP23-HsaEX7108093-rem-15","EPS15L1-HsaEX6093676-rem-27","EXOC7-HsaEX6024969-rem-15","POSTN-HsaEX6040016-rem-15","SNAP25-HsaEX6077184-rem-15","SNHG14-HsaEX1038166-rem-9"]
# Take the exon id by pattern rather than by position after splitting on "-",
# so a hyphen inside the gene name cannot shift the result.
CS_events = [re.search(r"HsaEX[0-9]+", variant).group(0) for variant in CS_variants]
CS_events

['HsaEX7108093',
 'HsaEX6093676',
 'HsaEX6024969',
 'HsaEX6040016',
 'HsaEX6077184',
 'HsaEX1038166']

In [20]:
# Rows of the selected CS exons, restricted to variants whose name contains "-rem".
cs_event_rows = df_alldata.loc[df_alldata["event"].isin(CS_events)]
keep_CS = cs_event_rows.loc[cs_event_rows["VARIANT"].str.contains("-rem")]
# List every distinct variant name that was kept.
for variant_name in keep_CS["VARIANT"].unique():
    print(variant_name)

EPS15L1-HsaEX6093676-rem-12
EPS15L1-HsaEX6093676-rem-15
EPS15L1-HsaEX6093676-rem-18
EPS15L1-HsaEX6093676-rem-21
EPS15L1-HsaEX6093676-rem-24
EPS15L1-HsaEX6093676-rem-27
EPS15L1-HsaEX6093676-rem-3
EPS15L1-HsaEX6093676-rem-30
EPS15L1-HsaEX6093676-rem-33
EPS15L1-HsaEX6093676-rem-36
EPS15L1-HsaEX6093676-rem-6
EPS15L1-HsaEX6093676-rem-9
EXOC7-HsaEX6024969-rem-12
EXOC7-HsaEX6024969-rem-15
EXOC7-HsaEX6024969-rem-18
EXOC7-HsaEX6024969-rem-21
EXOC7-HsaEX6024969-rem-24
EXOC7-HsaEX6024969-rem-27
EXOC7-HsaEX6024969-rem-3
EXOC7-HsaEX6024969-rem-30
EXOC7-HsaEX6024969-rem-33
EXOC7-HsaEX6024969-rem-36
EXOC7-HsaEX6024969-rem-6
EXOC7-HsaEX6024969-rem-9
POSTN-HsaEX6040016-rem-12
POSTN-HsaEX6040016-rem-15
POSTN-HsaEX6040016-rem-18
POSTN-HsaEX6040016-rem-21
POSTN-HsaEX6040016-rem-24
POSTN-HsaEX6040016-rem-27
POSTN-HsaEX6040016-rem-3
POSTN-HsaEX6040016-rem-30
POSTN-HsaEX6040016-rem-33
POSTN-HsaEX6040016-rem-36
POSTN-HsaEX6040016-rem-6
POSTN-HsaEX6040016-rem-9
SNAP25-HsaEX6077184-rem-12
SNAP25-HsaEX6077184-re

In [21]:
# Look at a single variant to see which subcategory it was assigned.
keep_CS.loc[keep_CS["VARIANT"].eq("EPS15L1-HsaEX6093676-rem-27_Py-weak1")]

Unnamed: 0,VARIANT,GFP,HIGH,LOW,MID,INPUT,CTR,CU1,CU2,C1,...,ES_CU1,ES_CU2,ES_C1,ES_C2,ES_C3,ES_C4,gene_or_species,event,mutation,Subcat
19000,EPS15L1-HsaEX6093676-rem-27_Py-weak1,3.66,3.37,2.1,2.98,92,69,33,19,25,...,0.871164,0.483476,0.645923,0.494867,0.2363,0.47114,EPS15L1,HsaEX6093676,-rem-27_Py-weak1,Py-weak1


In [22]:
# Independent snapshot of the CS selection before it is narrowed down in the next cell.
keep_CS_withmut = keep_CS.copy(deep=True)
# Optional export of that snapshot (disabled):
#keep_CS_withmut.to_csv("./2024_07_01_CSwmuttofit.csv",sep="\t")

In [23]:
# Keep a CS row when its subcategory starts with "-rem", is "WT", or is one of
# the categories selected for the HS/LS fit.
cs_subcat = keep_CS["Subcat"]
cs_keep_mask = (
    cs_subcat.str.startswith("-rem", na=False)  # a missing Subcat counts as "do not keep"
    | cs_subcat.isin([*categories_fit_HSLS, "WT"])
)
keep_CS = keep_CS.loc[
    cs_keep_mask,
    ["VARIANT","gene_or_species","event","mutation","Subcat","GFP","LOW","MID","HIGH"],
]
keep_CS

Unnamed: 0,VARIANT,gene_or_species,event,mutation,Subcat,GFP,LOW,MID,HIGH
13928,POSTN-HsaEX6040016-rem-15,POSTN,HsaEX6040016,-rem-15,WT,7.92,11.22,22.95,58.02
13942,EPS15L1-HsaEX6093676-rem-27,EPS15L1,HsaEX6093676,-rem-27,WT,41.77,44.80,45.46,58.63
14594,EXOC7-HsaEX6024969-rem-15,EXOC7,HsaEX6024969,-rem-15,WT,15.10,17.75,38.17,76.27
14922,SNAP25-HsaEX6077184-rem-15,SNAP25,HsaEX6077184,-rem-15,WT,29.78,31.52,47.08,81.66
14957,ARHGAP23-HsaEX7108093-rem-15,ARHGAP23,HsaEX7108093,-rem-15,WT,61.68,63.62,64.00,82.29
...,...,...,...,...,...,...,...,...,...
23472,EXOC7-HsaEX6024969-rem-15_Py-stg3_CTCTGAA_tact...,EXOC7,HsaEX6024969,-rem-15_Py-stg3_CTCTGAA_tactaac_BP_stg1,BP_stg1;Py-stg3,98.25,98.95,99.08,99.49
23476,EPS15L1-HsaEX6093676-rem-27_Py-stg3_TTTTGAC_ta...,EPS15L1,HsaEX6093676,-rem-27_Py-stg3_TTTTGAC_tactaac_BP_stg1,BP_stg1;Py-stg3,97.95,99.11,98.78,99.50
23477,EPS15L1-HsaEX6093676-rem-27_Py-stg3_TTTTGAC_ta...,EPS15L1,HsaEX6093676,-rem-27_Py-stg3_TTTTGAC_tactaac_BP_stg1_U1cons,BP_stg1;Py-stg3;U1cons,97.95,99.11,98.78,99.50
23520,SNAP25-HsaEX6077184-rem-15_Py-stg3_TCATAAA_tac...,SNAP25,HsaEX6077184,-rem-15_Py-stg3_TCATAAA_tactaac_BP_stg1,BP_stg1;Py-stg3,97.90,98.90,98.97,99.56


In [24]:
# Earlier exports, kept for reference (disabled):
#keep_HSLS.to_csv("./2024_06_28_HSLStofit.csv",sep="\t")
#keep_CS.to_csv("./2024_06_28_CStofit.csv",sep="\t")
# Current export of the HS/LS table, tab-separated: with Py-stg3, U1cons_a and U1cons_b.
hsls_out_path = "./2024_07_04_HSLStofit.csv"
keep_HSLS.to_csv(hsls_out_path, sep="\t")