# Intron and Exon Gene Analysis

In [1]:
#### Packages
import pandas as pd

In [17]:
#### Load data
mart_file = pd.read_csv("../../shared/WBcel235_exon_mart.txt", sep="\t")
expr_file_ex = pd.read_csv("../processed/CUS000001_ExonicCounts.txt", sep="\t")
expr_file_in = pd.read_csv("../processed/CUS000001_IntronicCounts.txt", sep="\t")
eisa_file = pd.read_csv("../processed/CUS000001_eisaDE_D1_control.D1_30minHS.txt", sep="\t")

expr_file_in.head()

Unnamed: 0,D1_30minHS_rep1,D1_30minHS_rep2,D1_30minHS_rep3,D1_30minHS_rep4,D1_control_rep1,D1_control_rep2,D1_control_rep3,D1_control_rep4
WBGene00000001,0,0,0,0,0,0,0,1
WBGene00000002,0,0,0,0,0,0,0,0
WBGene00000003,0,0,0,0,0,0,0,0
WBGene00000006,0,0,3,1,0,1,0,0
WBGene00000008,0,0,0,0,0,0,0,0


In [18]:
# Find % of genes expressed in EISA with 0 counts in raw table
expr_genes = expr_file_in[expr_file_in.index.isin(eisa_file.index.values)]

zerocount_genes = expr_genes[expr_genes.sum(axis=1) == 0]
p_zerocount = len(zerocount_genes) / len(expr_genes) * 100

print(f"{len(zerocount_genes)} out of {len(expr_genes)} ({p_zerocount}%) EISA genes have 0 intronic counts.")

# Intersect with gene expression file
# mart_file = mart_file[mart_file["Gene stable ID"].isin(expr_file.index.values)]

0 out of 1128 (0.0%) EISA genes have 0 intronic counts.


In [4]:
# Calculate no. of exons and introns
inex_df = mart_file.groupby(["Transcript stable ID", "Gene stable ID"]).size().to_frame(name="Number of exons").reset_index()

print(f"Total no. of expressed transcripts: {len(inex_df)}")
inex_df["Number of introns"] = inex_df["Number of exons"] - 1
print(f"No. of transcripts with introns: {len(inex_df[inex_df['Number of introns'] != 0])}")

avg_exon = inex_df['Number of exons'].sum() / len(inex_df)
avg_intron = inex_df['Number of introns'].sum() / len(inex_df)

print(f"""
Average no. of exons: {avg_exon}
Mode of no. of exons: {inex_df.mode()['Number of exons'][0]}
Average no. of introns: {avg_intron}
Mode of no. of introns: {inex_df.mode()['Number of introns'][0]}""")


Total no. of expressed transcripts: 23082
No. of transcripts with introns: 22927

Average no. of exons: 6.9035612165323625
Mode of no. of exons: 4.0
Average no. of introns: 5.9035612165323625
Mode of no. of introns: 3.0


In [18]:
# Calculate gene and exon lengths
mart_file["Gene length (bp)"] = abs(mart_file["Gene end (bp)"] - mart_file["Gene start (bp)"])
mart_file["Gene region length (bp)"] = 0

for id in mart_file["Transcript stable ID"].unique():
    exons = list(mart_file.loc[mart_file["Transcript stable ID"] == id, "Exon rank in transcript"])
    exons.sort()
    
    gb_start = mart_file.loc[(mart_file["Transcript stable ID"] == id) & (mart_file["Exon rank in transcript"] == exons[0]), "Exon region start (bp)"].iloc[0]
    gb_end = mart_file.loc[(mart_file["Transcript stable ID"] == id) & (mart_file["Exon rank in transcript"] == exons[-1]), "Exon region end (bp)"].iloc[0]
    mart_file.loc[mart_file["Transcript stable ID"] == id, "Gene region length (bp)"] = abs(gb_end - gb_start)

mart_file["Transcript length (bp)"] = abs(mart_file["Transcript end (bp)"] - mart_file["Transcript start (bp)"])
mart_file["Exon length (bp)"] = abs(mart_file["Exon region end (bp)"] - mart_file["Exon region start (bp)"])

In [20]:
# Calculate total exonic and intronic sizes per transcript
length_df = mart_file.groupby(["Transcript stable ID", "Gene stable ID", "Gene region length (bp)"])["Exon length (bp)"].sum().to_frame(name="Total exon size (bp)").reset_index()
length_df["Total intron size (bp)"] = length_df["Gene region length (bp)"] - length_df["Total exon size (bp)"]

total_exon = length_df['Total exon size (bp)']
total_intron = length_df['Total intron size (bp)']

print(f"""Total exome size: {total_exon.sum()}
Average total exonic sizes: {total_exon.sum() / len(length_df)}
Median total exonic sizes: {total_exon.median()}
Average exon size: {total_exon.sum() / len(length_df) / avg_exon}

Total introme size: {total_intron.sum()}
Average total intronic sizes: {total_intron.sum() / len(length_df)}
Median total intronic sizes: {total_intron.median()}
Average intron size: {total_intron.sum() / len(length_df) / avg_intron}

Proportion of introme vs. exome sizes: {total_intron.sum() / total_exon.sum()}""")

Total total exonic sizes: 37272940
Average total exonic sizes: 1614.805476128585
Median total exonic sizes: 1311.0
Average exon size: 233.90905439666642

Total total intronic sizes: 42788589
Average total intronic sizes: 1853.7643618403952
Median total intronic sizes: 761.0
Average intron size: 314.00781559596675

Proportion of intronic vs. exonic sizes: 1.147979982260589
