# Imports & Modules

In [7]:
# General imports
import os
import sys
import pandas as pd
import scipy
pd.options.mode.chained_assignment = None  # default='warn'
import numpy as np
from tqdm import tqdm
sys.path.append('../')
# Other imports

tqdm.pandas()
import yaml
import json

# Figures imports
import matplotlib
from matplotlib.lines import Line2D   
import matplotlib.patches as mpatches
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from statannot import add_stat_annotation
import matplotlib.font_manager as font_manager
import string


# Font settings
font_dirs = ['/home/weber/Fonts', ]
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
font_list = font_manager.createFontList(font_files)
font_manager.fontManager.ttflist.extend(font_list)

pvalues_cutoff = json.load(open('config/config_files.json'))['pvalues_cutoff']
pvalues_cutoff = [[e, str(e)] for e in pvalues_cutoff]

import matplotlib.font_manager as font_manager

from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['Arial']
rcParams['font.weight'] = 'light'
rcParams['axes.facecolor'] = 'white'
rcParams['font.size'] = 18

The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.


# Load files

## Homo sapiens

### Update with count & write file

In [36]:
# H sapiens

hs_genes = pd.read_parquet("/gstock/GeneIso/V2/Genes.parquet")
hs_mrna = pd.read_parquet("/gstock/GeneIso/V2/mRNA.parquet")
hs_exons = pd.read_parquet("/gstock/GeneIso/V2/Exons.parquet")
hs_exons['ranges'] = hs_exons['Exon region start (bp)'].astype(str) + '-' + hs_exons['Exon region end (bp)'].astype(str)

hs_cds = pd.read_parquet("/gstock/GeneIso/V2/CDS.parquet")
hs_cds['ranges'] = hs_cds['Exon region start (bp)'].astype(str) + '-' + hs_cds['Exon region end (bp)'].astype(str)

# TODO > REMOVE
## Used to compute nb of completely different exon boundaries / gene => differ from exon events where overlap is possible
hs_cds = hs_cds.groupby('GeneID').progress_apply(overlap_apply)
hs_cds.to_parquet("/gstock/GeneIso/V2/CDS_update.parquet")
hs_exons = hs_exons.groupby('GeneID').progress_apply(overlap_apply)
hs_exons.to_parquet("/gstock/GeneIso/V2/Exons_update.parquet")

hs_five_UTR = pd.read_parquet("/gstock/GeneIso/V2/5_UTR.parquet")
hs_three_UTR = pd.read_parquet("/gstock/GeneIso/V2/3_UTR.parquet")

# TODO > REMOVE
hs_five_UTR['ranges']  = hs_five_UTR["5' UTR start"].astype(int).astype(str) + '-' +  hs_five_UTR["5' UTR end"].astype(int).astype(str)
hs_three_UTR['ranges'] = hs_three_UTR["3' UTR start"].astype(int).astype(str) + '-' + hs_three_UTR["3' UTR end"].astype(int).astype(str)
hs_five_UTR = hs_five_UTR.groupby('GeneID').progress_apply(overlap_apply)
hs_five_UTR.to_parquet("/gstock/GeneIso/V2/5_UTR_update.parquet")

hs_three_UTR = hs_three_UTR.groupby('GeneID').progress_apply(overlap_apply)
hs_three_UTR.to_parquet("/gstock/GeneIso/V2/3_UTR_update.parquet")

hs_introns = pd.read_parquet("/gstock/GeneIso/V2/Introns.parquet")
hs_introns.loc[hs_introns['Strand'] == 1, 'Ordinal_nb_inverted'] = hs_introns.loc[hs_introns['Strand'] == 1, 'Ordinal_nb_inverted'] + 1

100%|████████████████████████████████████████████████████████████████████████████| 13461/13461 [00:30<00:00, 440.79it/s]
100%|████████████████████████████████████████████████████████████████████████████| 13478/13478 [00:30<00:00, 438.89it/s]


## Mus Musculus

In [10]:
genes = pd.read_parquet("/gstock/GeneIso/V2/Genes_MM.parquet")
# mrna = pd.read_parquet("/gstock/GeneIso/V2/mRNA.parquet")
exons = pd.read_parquet("/gstock/GeneIso/V2/Exons_MM.parquet")
exons['ranges'] = exons['Exon region start (bp)'].astype(str) + '-' + exons['Exon region end (bp)'].astype(str)


cds = pd.read_parquet("/gstock/GeneIso/V2/CDS_MM.parquet")
cds['ranges'] = cds['Exon region start (bp)'].astype(str) + '-' + cds['Exon region end (bp)'].astype(str)

# TODO > REMOVE
cds = cds.groupby('GeneID').progress_apply(overlap_apply)
exons = exons.groupby('GeneID').progress_apply(overlap_apply)

cds.to_parquet("/gstock/GeneIso/V2/CDS_update_MM.parquet")
exons.to_parquet("/gstock/GeneIso/V2/Exons_update_MM.parquet")


five_UTR = pd.read_parquet("/gstock/GeneIso/V2/5_UTR_MM.parquet")
three_UTR = pd.read_parquet("/gstock/GeneIso/V2/3_UTR_MM.parquet")
introns = pd.read_parquet("/gstock/GeneIso/V2/Introns_MM.parquet")

# TODO > REMOVE
## Used to compute nb of completely different exon boundaries / gene => differ from exon events where overlap is possible

five_UTR['ranges'] = five_UTR['UTR_start'].astype(int).astype(str) + '-' + five_UTR['UTR_end'].astype(int).astype(str)
three_UTR['ranges'] = three_UTR['UTR_start'].astype(int).astype(str) + '-' + three_UTR['UTR_end'].astype(int).astype(str)
five_UTR = five_UTR.groupby('GeneID').progress_apply(overlap_apply)
three_UTR = three_UTR.groupby('GeneID').progress_apply(overlap_apply)

five_UTR.to_parquet("/gstock/GeneIso/V2/5_UTR_update_MM.parquet")
three_UTR.to_parquet("/gstock/GeneIso/V2/3_UTR_update_MM.parquet")




#TODO
# introns.loc[introns['Strand'] == 1, 'Ordinal_nb_inverted'] = introns.loc[introns['Strand'] == 1, 'Ordinal_nb_inverted'] + 1

100%|████████████████████████████████████████████████████████████████████████████| 12154/12154 [00:27<00:00, 437.20it/s]
100%|████████████████████████████████████████████████████████████████████████████| 12154/12154 [00:24<00:00, 504.86it/s]
100%|████████████████████████████████████████████████████████████████████████████| 12099/12099 [00:29<00:00, 404.02it/s]
100%|████████████████████████████████████████████████████████████████████████████| 12130/12130 [00:29<00:00, 404.95it/s]


# Descritive stats

## Genes

In [229]:
genes_new = genes[['Miso_siso', 'GeneID', 'Gene name', 'ensembl_gene_id', 'chromosome_name', 'start_position', 'end_position']].drop_duplicates()
genes_new['Gene_length'] = genes_new['end_position'] - genes_new['start_position']

In [132]:
describe = pd.concat(
    [
        hs_genes.groupby('Miso_siso')['Gene_length'].describe().round(0).astype(int),
        genes_new.groupby('Miso_siso')['Gene_length'].describe().round(0).astype(int),
    ],
    axis=0
)
describe.index = pd.MultiIndex.from_product([['HS', 'MM'], ['MISOG', 'SISOG']])
describe

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
HS,MISOG,9826,82044,143420,704,15084,36574,89612,2473538
HS,SISOG,3675,51655,90223,489,9536,23037,56162,1899593
MM,MISOG,8933,66287,130727,616,11958,27996,67630,2960898
MM,SISOG,3221,43801,83590,566,8437,19336,45084,2058378


## mRNA

In [231]:
mrna = genes[['Miso_siso', 'GeneID', 'Gene name', 'ensembl_gene_id', 'ensembl_transcript_id', 'chromosome_name', 'transcript_start', 'transcript_end', 'transcript_length']].drop_duplicates()

In [44]:
mrna_describe = pd.concat(
    [
        hs_mrna.groupby('Miso_siso')['Transcript length (including UTRs and CDS)'].describe().round(0).astype(int).T,
        mrna.groupby('Miso_siso')['transcript_length'].describe().round(0).astype(int).T,
    ],
    axis=1
)
mrna_describe.columns = pd.MultiIndex.from_product([['HS', 'MM'], ['MISOG', 'SISOG']])
mrna_describe

Unnamed: 0_level_0,HS,HS,MM,MM
Unnamed: 0_level_1,MISOG,SISOG,MISOG,SISOG
count,31111,3675,22589,5314
mean,3070,3728,3145,3116
std,2361,2772,2337,2297
min,267,303,132,215
25%,1537,1805,1652,1577
50%,2424,2979,2673,2578
75%,3925,4865,4058,3961
max,34626,30609,123179,24060


## Exons

### Length

In [70]:
hs_exons = hs_exons.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)', 'Length'])
hs_exons = hs_exons.loc[hs_exons['Length'] > 0]

exons = exons.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)', 'Length'])
exons = exons.loc[exons['Length'] > 0]

describe = pd.concat(
    [
        hs_exons.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
        exons.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
    ],
    axis=1
)
describe.columns = pd.MultiIndex.from_product([['HS', 'MM'], ['MISOG', 'SISOG']])
describe

Unnamed: 0_level_0,HS,HS,MM,MM
Unnamed: 0_level_1,MISOG,SISOG,MISOG,SISOG
count,183827,36173,137283,37128
mean,409,378,338,364
std,922,962,789,777
min,1,2,1,2
25%,95,97,94,99
50%,145,137,137,144
75%,269,203,220,229
max,33289,27302,122801,20715


### Nb

In [134]:
hs_exons = hs_exons.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)', 'Length'])
hs_exons = hs_exons.loc[hs_exons['Length'] > 0]

exons = exons.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)', 'Length'])
exons = exons.loc[exons['Length'] > 0]

describe = pd.concat(
    [
        hs_exons.groupby(['Miso_siso', 'GeneID'])['Exon region start (bp)'].nunique().groupby('Miso_siso').describe().round(0).astype(int).T,
        exons.groupby(['Miso_siso', 'GeneID'])['Exon region start (bp)'].nunique().groupby('Miso_siso').describe().round(0).astype(int).T,
    ],
    axis=1
)
describe.columns = pd.MultiIndex.from_product([['HS', 'MM'], ['MISOG', 'SISOG']])
describe

Unnamed: 0_level_0,HS,HS,MM,MM
Unnamed: 0_level_1,MISOG,SISOG,MISOG,SISOG
count,9826,3675,8933,3221
mean,16,10,14,11
std,11,10,10,10
min,2,2,1,1
25%,9,4,7,5
50%,13,7,12,8
75%,20,12,18,14
max,161,107,149,109


## CDS

In [71]:
hs_cds = hs_cds.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)', 'Length'])
hs_cds = hs_cds.loc[hs_cds['Length'] > 0]

cds = cds.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)', 'Length'])
cds = cds.loc[cds['Length'] > 0]

describe = pd.concat(
    [
        hs_cds.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
        cds.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
    ],
    axis=1
)
describe.columns = pd.MultiIndex.from_product([['HS', 'MM'], ['MISOG', 'SISOG']])
describe

Unnamed: 0_level_0,HS,HS,MM,MM
Unnamed: 0_level_1,MISOG,SISOG,MISOG,SISOG
count,161242,34776,125843,35075
mean,167,179,162,180
std,273,315,245,283
min,1,1,1,1
25%,77,89,81,88
50%,117,127,119,127
75%,168,174,167,176
max,17330,15792,16625,9658


## Introns length

In [72]:
hs_introns = hs_introns.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Introns'])
hs_introns = hs_introns.loc[hs_introns['Length'] > 0]

introns = introns.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Introns'])
introns = introns.loc[introns['Length'] > 0]

describe = pd.concat(
    [
        hs_introns.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
        introns.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
    ],
    axis=1
)
describe.columns = pd.MultiIndex.from_product([['HS', 'MM'], ['MISOG', 'SISOG']])
describe

Unnamed: 0_level_0,HS,HS,MM,MM
Unnamed: 0_level_1,MISOG,SISOG,MISOG,SISOG
count,142425,32498,114998,31460
mean,7703,5005,5872,4428
std,23815,14920,22222,14027
min,1,1,1,1
25%,581,458,482,420
50%,1818,1424,1442,1275
75%,5433,3807,3780,3211
max,1160410,654925,2908815,536970


## Introns length according ordinal

In [126]:
hs_introns = pd.read_parquet("/gstock/GeneIso/V2/Introns.parquet")
hs_introns.loc[hs_introns['Strand'] == 1, 'Ordinal_nb_inverted'] = hs_introns.loc[hs_introns['Strand'] == 1, 'Ordinal_nb_inverted'] + 1
hs_introns = hs_introns.loc[hs_introns['Ordinal_nb'] <= 5] 

introns = pd.read_parquet("/gstock/GeneIso/V2/Introns_MM.parquet")
introns.loc[introns['Strand'] == 1, 'Ordinal_nb_inverted'] = introns.loc[introns['Strand'] == 1, 'Ordinal_nb_inverted'] + 1
introns = introns.loc[introns['Ordinal_nb'] <= 5] 

pd.concat(
    [
        hs_introns.groupby(['Miso_siso', 'Ordinal_nb'])['Length'].describe().round(0).astype(int),
        introns.groupby(['Miso_siso', 'Ordinal_nb'])['Length'].describe().round(0).astype(int)

    ], axis=0
).T

Miso_siso,Miso,Miso,Miso,Miso,Miso,Siso,Siso,Siso,Siso,Siso,Miso,Miso,Miso,Miso,Miso,Siso,Siso,Siso,Siso,Siso
Ordinal_nb,1.0,2.0,3.0,4.0,5.0,1.0,2.0,3.0,4.0,5.0,1.0.1,2.0.1,3.0.1,4.0.1,5.0.1,1.0.1,2.0.1,3.0.1,4.0.1,5.0.1
count,43680,42115,39113,35472,31887,3675,3213,2854,2532,2244,22506,21827,20503,18884,17276,5275,4719,4232,3753,3317
mean,15887,11778,8735,6539,5543,13167,8010,5541,4339,3735,16394,10488,7634,5518,4659,11075,7016,4772,3712,3331
std,35141,31602,25903,20442,18006,29571,22329,14515,10874,8569,40173,35478,25013,17065,16075,26212,19838,12658,10007,8451
min,29,0,1,0,0,1,68,67,1,69,1,0,4,0,0,29,0,3,0,64
25%,987,802,642,579,525,1053,659,536,464,390,1000,727,569,523,457,918,635,502,432,381
50%,3998,2561,2070,1760,1643,3755,2046,1616,1434,1368,3896,2303,1755,1548,1393,3157,1813,1478,1257,1274
75%,15422,8326,6174,4778,4227,12537,5777,4386,3696,3490,14785,7098,4847,3851,3422,10344,4687,3718,3113,2853
max,956395,1160410,772518,1096452,1047317,654925,367620,253993,222578,130369,940551,2908815,785804,551318,551318,536970,362814,171562,224742,141021


## 5' UTR length

In [73]:
hs_five_UTR = hs_five_UTR.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)'])
hs_five_UTR = hs_five_UTR.loc[hs_five_UTR['Length'] > 0]

five_UTR = five_UTR.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)'])
five_UTR = five_UTR.loc[five_UTR['Length'] > 0]

describe = pd.concat(
    [
        hs_five_UTR.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
        five_UTR.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
    ],
    axis=1
)
describe.columns = pd.MultiIndex.from_product([['HS', 'MM'], ['MISOG', 'SISOG']])
describe

Unnamed: 0_level_0,HS,HS,MM,MM
Unnamed: 0_level_1,MISOG,SISOG,MISOG,SISOG
count,53935,4816,27771,6633
mean,156,143,154,166
std,221,169,192,227
min,1,1,1,1
25%,51,42,51,54
50%,100,92,104,108
75%,184,178,192,205
max,6565,2222,8961,7606


## 3' UTR length

In [74]:
hs_three_UTR = hs_three_UTR.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)'])
hs_three_UTR = hs_three_UTR.loc[hs_three_UTR['Length'] > 0]

three_UTR = three_UTR.drop_duplicates(subset=['Miso_siso', 'GeneID', 'Exon region start (bp)', 'Exon region end (bp)'])
three_UTR = three_UTR.loc[three_UTR['Length'] > 0]

describe = pd.concat(
    [
        three_UTR.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
        three_UTR.groupby('Miso_siso')['Length'].describe().round(0).astype(int).T,
    ],
    axis=1
)
describe.columns = pd.MultiIndex.from_product([['HS', 'MM'], ['MISOG', 'SISOG']])
describe

Unnamed: 0_level_0,HS,HS,MM,MM
Unnamed: 0_level_1,MISOG,SISOG,MISOG,SISOG
count,20869,5546,20869,5546
mean,1070,1130,1070,1130
std,1579,1411,1579,1411
min,1,1,1,1
25%,201,202,201,202
50%,594,592,594,592
75%,1455,1567,1455,1567
max,122582,18864,122582,18864


# MISOG SISOG category comparison for HS & MM

In [204]:
# Count nb of transcripts per MM gene
new_count = genes.groupby('ensembl_gene_id')['ensembl_transcript_id'].nunique().rename('transcript_count_mm').reset_index()
new_count.loc[new_count['transcript_count_mm'] > 1, 'Miso_siso_mm'] = 'Miso'
new_count.loc[new_count['transcript_count_mm'] == 1, 'Miso_siso_mm'] = 'Siso'

# Merge with previous DF and retrieve Miso Siso HS status
new_count = pd.merge(genes, new_count,on='ensembl_gene_id')[['GeneID', 'ensembl_gene_id', 'Miso_siso', 'Miso_siso_mm']].drop_duplicates()
# new_count.groupby('Miso_siso')['Miso_siso_mm'].value_counts()
new_count['Miso_siso_full'] = 'HS:' + new_count['Miso_siso'] + '-MM:' + new_count['Miso_siso_mm']
new_count

Unnamed: 0,GeneID,ensembl_gene_id,Miso_siso,Miso_siso_mm,Miso_siso_full
0,ENSG00000000003,ENSMUSG00000067377,Miso,Siso,HS:Miso-MM:Siso
1,ENSG00000000005,ENSMUSG00000031250,Siso,Siso,HS:Siso-MM:Siso
2,ENSG00000000419,ENSMUSG00000078919,Miso,Miso,HS:Miso-MM:Miso
4,ENSG00000000457,ENSMUSG00000026584,Miso,Miso,HS:Miso-MM:Miso
7,ENSG00000000460,ENSMUSG00000041406,Miso,Miso,HS:Miso-MM:Miso
...,...,...,...,...,...
27895,ENSG00000284194,ENSMUSG00000091780,Miso,Miso,HS:Miso-MM:Miso
27897,ENSG00000284308,ENSMUSG00000030030,Miso,Miso,HS:Miso-MM:Miso
27899,ENSG00000284753,ENSMUSG00000115219,Siso,Miso,HS:Siso-MM:Miso
27901,ENSG00000284770,ENSMUSG00000039233,Miso,Siso,HS:Miso-MM:Siso


# MM ENCODE Expression

In [194]:
encode_mm = pd.read_csv('/gstock/biolo_datasets/ENCODE/ENCODE_SRS_TPM_summary.tsv.gz', compression='gzip', sep='\t') 

# Ratio = % of 535 experiments of short read seq where ENST is expressed > 0.1 TPM
## Like GTEx, cutoff = expression > 0.1 TPM in >= 20% exp
encode_mm = encode_mm.loc[encode_mm['TPM_ratio'] >= 0.2]
encode_mm

Unnamed: 0,ENST,TPM_ratio
0,ENSMUST00000000001,1.000000
3,ENSMUST00000000028,0.874766
4,ENSMUST00000000033,0.500935
5,ENSMUST00000000049,0.297196
6,ENSMUST00000000058,0.912150
...,...,...
141759,ENSMUST00000238186,0.289720
141762,ENSMUST00000238189,0.596262
141763,ENSMUST00000238190,0.988785
141765,ENSMUST00000238192,0.218692


## Merge 

In [196]:
# Merge MM genes with MM genes expressed 

new_genes_mm = pd.merge(
    genes,
    encode_mm.rename({'ENST': 'ensembl_transcript_id'}, axis=1),
    on='ensembl_transcript_id'
)

## New count expressed

In [203]:
# Same as before but to retieve expressed genes

new_count = new_genes_mm.groupby('ensembl_gene_id')['ensembl_transcript_id'].nunique().rename('transcript_count_mm').reset_index()
new_count.loc[new_count['transcript_count_mm'] > 1, 'Miso_siso_mm'] = 'Miso'
new_count.loc[new_count['transcript_count_mm'] == 1, 'Miso_siso_mm'] = 'Siso'
new_count = pd.merge(genes, new_count,on='ensembl_gene_id')[['GeneID', 'ensembl_gene_id', 'Miso_siso', 'Miso_siso_mm']].drop_duplicates()
# new_count.groupby('Miso_siso')['Miso_siso_mm'].value_counts()
new_count['Miso_siso_full'] = 'HS:' + new_count['Miso_siso'] + '-MM:' + new_count['Miso_siso_mm']
new_count_expr = new_count.copy()

In [206]:
# Before ENCODE expr
new_count

Unnamed: 0,GeneID,ensembl_gene_id,Miso_siso,Miso_siso_mm,Miso_siso_full
0,ENSG00000000003,ENSMUSG00000067377,Miso,Siso,HS:Miso-MM:Siso
1,ENSG00000000005,ENSMUSG00000031250,Siso,Siso,HS:Siso-MM:Siso
2,ENSG00000000419,ENSMUSG00000078919,Miso,Miso,HS:Miso-MM:Miso
4,ENSG00000000457,ENSMUSG00000026584,Miso,Miso,HS:Miso-MM:Miso
7,ENSG00000000460,ENSMUSG00000041406,Miso,Miso,HS:Miso-MM:Miso
...,...,...,...,...,...
27895,ENSG00000284194,ENSMUSG00000091780,Miso,Miso,HS:Miso-MM:Miso
27897,ENSG00000284308,ENSMUSG00000030030,Miso,Miso,HS:Miso-MM:Miso
27899,ENSG00000284753,ENSMUSG00000115219,Siso,Miso,HS:Siso-MM:Miso
27901,ENSG00000284770,ENSMUSG00000039233,Miso,Siso,HS:Miso-MM:Siso


In [205]:
# After ENCODE expr
new_count_expr

Unnamed: 0,GeneID,ensembl_gene_id,Miso_siso,Miso_siso_mm,Miso_siso_full
0,ENSG00000000003,ENSMUSG00000067377,Miso,Siso,HS:Miso-MM:Siso
1,ENSG00000000005,ENSMUSG00000031250,Siso,Siso,HS:Siso-MM:Siso
2,ENSG00000000419,ENSMUSG00000078919,Miso,Miso,HS:Miso-MM:Miso
4,ENSG00000000457,ENSMUSG00000026584,Miso,Miso,HS:Miso-MM:Miso
7,ENSG00000000460,ENSMUSG00000041406,Miso,Miso,HS:Miso-MM:Miso
...,...,...,...,...,...
27024,ENSG00000284194,ENSMUSG00000091780,Miso,Miso,HS:Miso-MM:Miso
27026,ENSG00000284308,ENSMUSG00000030030,Miso,Miso,HS:Miso-MM:Miso
27028,ENSG00000284753,ENSMUSG00000115219,Siso,Miso,HS:Siso-MM:Miso
27030,ENSG00000284770,ENSMUSG00000039233,Miso,Siso,HS:Miso-MM:Siso


In [226]:
# Merge to compare MM genes that change of status

merge_merge = pd.merge(
    new_count[['GeneID', 'ensembl_gene_id', 'Miso_siso_mm']],
    new_count_expr[['GeneID', 'ensembl_gene_id', 'Miso_siso_mm']].rename({'Miso_siso_mm' : 'Miso_siso_mm_expr'}, axis=1),
    on=['GeneID', 'ensembl_gene_id']
)

# Export to reload later
merge_merge.to_csv('/gstock/GeneIso/V2_with_MM_expression/misog_sisog_mm_expr.tsv.gz', compression='gzip', sep='\t', index=False)
merge_merge

Unnamed: 0,GeneID,ensembl_gene_id,Miso_siso_mm,Miso_siso_mm_expr
0,ENSG00000000003,ENSMUSG00000067377,Siso,Siso
1,ENSG00000000005,ENSMUSG00000031250,Siso,Siso
2,ENSG00000000419,ENSMUSG00000078919,Miso,Miso
3,ENSG00000000457,ENSMUSG00000026584,Miso,Miso
4,ENSG00000000460,ENSMUSG00000041406,Miso,Miso
...,...,...,...,...
11688,ENSG00000284194,ENSMUSG00000091780,Miso,Miso
11689,ENSG00000284308,ENSMUSG00000030030,Miso,Miso
11690,ENSG00000284753,ENSMUSG00000115219,Miso,Miso
11691,ENSG00000284770,ENSMUSG00000039233,Siso,Siso


In [236]:
# Identify MM genes that change of status before and after ENCODE expr
changing_status_mm_genes = pd.merge(
    merge_merge.loc[(merge_merge['Miso_siso_mm'] == "Miso") & (merge_merge['Miso_siso_mm_expr'] == "Siso")],
    genes[['GeneID', 'Gene name']],
    on='GeneID'
).drop_duplicates()
changing_status_mm_genes

Unnamed: 0,GeneID,ensembl_gene_id,Miso_siso_mm,Miso_siso_mm_expr,Gene name
0,ENSG00000004848,ENSMUSG00000035277,Miso,Siso,ARX
2,ENSG00000005381,ENSMUSG00000009350,Miso,Siso,MPO
4,ENSG00000006128,ENSMUSG00000061762,Miso,Siso,TAC1
7,ENSG00000006611,ENSMUSG00000030838,Miso,Siso,USH1C
12,ENSG00000007047,ENSMUSG00000030397,Miso,Siso,MARK4
...,...,...,...,...,...
1472,ENSG00000268089,ENSMUSG00000031344,Miso,Siso,GABRQ
1474,ENSG00000272514,ENSMUSG00000028294,Miso,Siso,CFAP206
1476,ENSG00000273079,ENSMUSG00000030209,Miso,Siso,GRIN2B
1479,ENSG00000282608,ENSMUSG00000000562,Miso,Siso,ADORA3


## Count of genes that change after expression in MM

In [238]:
# MM genes before & after expr
merge_merge.groupby(['Miso_siso_mm'])['Miso_siso_mm_expr'].value_counts()

Miso_siso_mm  Miso_siso_mm_expr
Miso          Miso                 6608
              Siso                  588
Siso          Siso                 4497
Name: Miso_siso_mm_expr, dtype: int64

In [198]:
# Final distribution for Miso Siso HS & Miso Siso MM after expr
new_count.groupby('Miso_siso')['Miso_siso_mm'].value_counts()

Miso_siso  Miso_siso_mm
Miso       Miso            5490
           Siso            3213
Siso       Siso            1872
           Miso            1118
Name: Miso_siso_mm, dtype: int64

In [201]:
# Prepare for export to process with stringdb command of laetitia

new_count_to_export = pd.merge(new_count, genes[['GeneID', 'Gene name', 'mmusculus_homolog_associated_gene_name']].drop_duplicates(), on='GeneID')

for condition in new_count_to_export.Miso_siso_full.unique().tolist():
    print(condition)
    print('/gstock/GeneIso/V2_with_MM_expression/Gene_lists_STRING/{}_HS_gene_names.txt'.format(condition.replace(':', '-')))
    new_count_to_export.loc[new_count_to_export['Miso_siso_full'] == condition, 'Gene name'].to_csv('/gstock/GeneIso/V2_with_MM_expression/Gene_lists_STRING/{}_HS_gene_names.txt'.format(condition.replace(':', '-')), index=False, header=False)
#     new_count_to_export.loc[new_count_to_export['Miso_siso_full'] == condition, 'mmusculus_homolog_associated_gene_name'].to_csv('/gstock/GeneIso/V2/Gene_lists_STRING/{}_MM_gene_names.txt'.format(condition.replace(':', '-')), index=False, header=False)

HS:Miso-MM:Siso
/gstock/GeneIso/V2_with_MM_expression/Gene_lists_STRING/HS-Miso-MM-Siso_HS_gene_names.txt
HS:Siso-MM:Siso
/gstock/GeneIso/V2_with_MM_expression/Gene_lists_STRING/HS-Siso-MM-Siso_HS_gene_names.txt
HS:Miso-MM:Miso
/gstock/GeneIso/V2_with_MM_expression/Gene_lists_STRING/HS-Miso-MM-Miso_HS_gene_names.txt
HS:Siso-MM:Miso
/gstock/GeneIso/V2_with_MM_expression/Gene_lists_STRING/HS-Siso-MM-Miso_HS_gene_names.txt
