### Mouse data (1st step of data cleaning: reshape the exon columns)

In [1]:
import pandas as pd
import os
import re
import matplotlib.pyplot as plt

In [2]:
# Directory the notebook is running from; later cells build their input/output paths relative to it.
current_dir=os.getcwd()

In [3]:
import import_ipynb
from utils.Data import MOUSE

importing Jupyter notebook from /Users/tyronchang/Desktop/exonic-microrna-analysis/github3/Mouse/utils/Data.ipynb


In [4]:
# Hand the gene-table CSV to the project's MOUSE helper (imported from utils/Data.ipynb).
# NOTE(review): presumably this reads the CSV into mouse.df — confirm in utils/Data.ipynb.
mouse=MOUSE(f'{current_dir}/UCSC_data_Mouse/mouse_genes_020324_NCBI.csv')

In [5]:
# NOTE(review): col_preprocess() is defined in utils/Data.ipynb; which columns it touches is not visible in this notebook.
mouse.col_preprocess()

In [6]:
# NOTE(review): drop() is defined in utils/Data.ipynb; what it removes is not visible in this notebook — confirm before relying on row counts.
mouse.drop()

In [7]:
### Rename the 'name2' column to 'gene_name' (modifies mouse.df in place)
mouse.df.rename(columns={'name2':'gene_name'},inplace=True)

In [8]:
# Function to calculate mRNA length
def calculate_mrna_length(row):
    """Return the summed exon length (mature mRNA length) for one transcript row.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'exonStarts' and 'exonEnds' as comma-separated coordinate
        strings (e.g. "100,300,"). A trailing comma is optional.

    Returns
    -------
    int
        Sum of ``end - start`` over the paired exon coordinates.
    """
    # Skip empty fields rather than unconditionally dropping the last element,
    # so the final exon is not lost when a string has no trailing comma.
    starts = [int(pos) for pos in row['exonStarts'].split(',') if pos.strip()]
    ends = [int(pos) for pos in row['exonEnds'].split(',') if pos.strip()]
    return sum(end - start for start, end in zip(starts, ends))

# Compute the exon-length sum row by row, then store it as the 'mature_mRNA_Length' column
mrna_lengths = mouse.df.apply(calculate_mrna_length, axis=1)
mouse.df['mature_mRNA_Length'] = mrna_lengths

In [9]:
# Columns kept for the exon-start half of the reshape
exon_start_columns = ['chrom', 'exonStarts', 'exonCount', 'gene_name', 'name']
mouse_gene_df = mouse.select(exon_start_columns)

In [10]:
### Strip the comma at the very end of each exonStarts string so the later split yields no empty last element
exon_starts_trimmed = mouse_gene_df['exonStarts'].str.replace(r',\Z', '', regex=True)
mouse_gene_df['exonStarts'] = exon_starts_trimmed

In [11]:
# Turn each comma-separated exonStarts string into a list with one element per exon
mouse_gene_df['exonStarts']=mouse_gene_df['exonStarts'].str.split(",")

In [12]:
# Expand the lists: each exon start gets its own row, with the other columns repeated
mouse_gene_df=mouse_gene_df.explode('exonStarts')

In [13]:
# explode() repeats the source row's index label on every new row; replace it with a fresh 0..n-1 index
mouse_gene_df=mouse_gene_df.reset_index(drop=True)

In [14]:
# Columns kept for the exon-end half of the reshape
exon_end_columns = [
    'exonEnds', 'cdsStart', 'cdsEnd', 'strand', 'exonCount',
    'txStart', 'txEnd', 'mature_mRNA_Length', 'gene_name', 'name',
]
mouse_gene_df_2 = mouse.select(exon_end_columns)

In [15]:
### Strip the comma at the very end of each exonEnds string so the later split yields no empty last element
exon_ends_trimmed = mouse_gene_df_2['exonEnds'].str.replace(r',\Z', '', regex=True)
mouse_gene_df_2['exonEnds'] = exon_ends_trimmed

In [16]:
# Turn each comma-separated exonEnds string into a list with one element per exon
mouse_gene_df_2['exonEnds']=mouse_gene_df_2['exonEnds'].str.split(",")

In [17]:
# Expand the lists: each exon end gets its own row, with the other columns repeated
mouse_gene_df_2=mouse_gene_df_2.explode('exonEnds')

In [18]:
# Display-only preview of the first 50 exploded rows with a clean index; the result is not assigned, so mouse_gene_df_2 is unchanged
mouse_gene_df_2.head(50).reset_index(drop=True)

Unnamed: 0,exonEnds,cdsStart,cdsEnd,strand,exonCount,txStart,txEnd,mature_mRNA_Length,gene_name,name
0,134203590,134202950,134234733,-,2,134199214,134234856,4570,Adora1,NM_001291928.1
1,134234856,134202950,134234733,-,2,134199214,134234856,4570,Adora1,NM_001291928.1
2,134203590,134202950,134234355,-,2,134199214,134235457,5819,Adora1,NM_001008533.3
3,134235457,134202950,134234355,-,2,134199214,134235457,5819,Adora1,NM_001008533.3
4,134203590,134202950,134234355,-,3,134199214,134235457,5038,Adora1,NM_001282945.1
5,134234446,134202950,134234355,-,3,134199214,134235457,5038,Adora1,NM_001282945.1
6,134235457,134202950,134234355,-,3,134199214,134235457,5038,Adora1,NM_001282945.1
7,134203590,134202950,134234355,-,3,134199214,134235457,5004,Adora1,NM_001039510.2
8,134234412,134202950,134234355,-,3,134199214,134235457,5004,Adora1,NM_001039510.2
9,134235457,134202950,134234355,-,3,134199214,134235457,5004,Adora1,NM_001039510.2


In [19]:
# Give this half distinct column labels so they do not clash with mouse_gene_df's columns of the same name
renamed_columns = {'gene_name': 'Name', 'exonCount': 'exon_count', 'name': 'ID'}
mouse_gene_df_2 = mouse_gene_df_2.rename(columns=renamed_columns)

In [20]:
# Replace the repeated index labels left by explode() with a fresh 0..n-1 index
mouse_gene_df_2 = mouse_gene_df_2.reset_index(drop=True)

### Concatenate the two dataframes (reshape the dataframe) and remove all miRNA rows

In [21]:
# Join the exon-start half and the exon-end half side by side.
# pd.concat(axis=1) aligns rows on index labels and would silently pad any
# mismatched rows with NaN, so check that both halves share the same index first.
assert mouse_gene_df.index.equals(mouse_gene_df_2.index), (
    "exonStarts and exonEnds frames are not index-aligned; "
    "re-run the explode/reset_index cells for both frames"
)
finalized_mouse_gene_df=pd.concat([mouse_gene_df,mouse_gene_df_2], axis=1)#### This frame contains all mouse genes (miRNAs included)

In [22]:
finalized_mouse_gene_df.shape

(1385818, 15)

In [23]:
finalized_mouse_gene_df.columns

Index(['chrom', 'exonStarts', 'exonCount', 'gene_name', 'name', 'exonEnds',
       'cdsStart', 'cdsEnd', 'strand', 'exon_count', 'txStart', 'txEnd',
       'mature_mRNA_Length', 'Name', 'ID'],
      dtype='object')

In [24]:
# These three columns are duplicated by 'exon_count', 'Name' and 'ID' from the exon-end half
redundant_columns = ['exonCount', 'gene_name', 'name']
finalized_mouse_gene_df = finalized_mouse_gene_df.drop(columns=redundant_columns)

In [25]:
finalized_mouse_gene_df.columns

Index(['chrom', 'exonStarts', 'exonEnds', 'cdsStart', 'cdsEnd', 'strand',
       'exon_count', 'txStart', 'txEnd', 'mature_mRNA_Length', 'Name', 'ID'],
      dtype='object')

In [26]:
#### This regex matches only microRNA gene symbols (Mir<N>..., Mirlet<N><letter>...), not lncRNAs:
#### the (?<!hg) lookbehinds reject names that end in "hg" (e.g. miR-210HG).
#### Groups are non-capturing so pandas str.contains does not emit its "has match groups" UserWarning.
mouse_p=r'^Mir\d+(?:-\d+)*$|^Mir\d+[a-z](?:-\d+)*$|^Mir\d+[a-z](?:[a-z]|\d)*$(?<!hg)$|^Mirlet\d[a-z]-*\d*$(?<!hg)$'

In [27]:
#### Remove every row whose gene name matches the miRNA pattern, so this frame has all mouse genes except miRNAs
is_mirna_row = finalized_mouse_gene_df['Name'].str.contains(mouse_p, flags=re.I, regex=True)
mouse_all_genes_no_miR_df = finalized_mouse_gene_df.loc[~is_mirna_row]

  mouse_all_genes_no_miR_df=finalized_mouse_gene_df.loc[~finalized_mouse_gene_df['Name'].str.contains(


In [28]:
mouse_all_genes_no_miR_df.shape

(1384536, 12)

In [29]:
mouse_all_genes_no_miR_df.columns

Index(['chrom', 'exonStarts', 'exonEnds', 'cdsStart', 'cdsEnd', 'strand',
       'exon_count', 'txStart', 'txEnd', 'mature_mRNA_Length', 'Name', 'ID'],
      dtype='object')

In [30]:
### Write the miRNA-free gene table as a tab-separated file, without the index column
no_mir_tsv_path = f'{current_dir}/Mouse_bash_script/mouse_all_genes_no_miR_df_NCBI.tsv'
mouse_all_genes_no_miR_df.to_csv(no_mir_tsv_path, sep='\t', index=False)

### Extract miRNA data

In [31]:
## Keep only the rows whose gene name matches the miRNA pattern
mirna_name_mask = finalized_mouse_gene_df['Name'].str.contains(mouse_p, flags=re.I, regex=True)
mouse_df_miR = finalized_mouse_gene_df.loc[mirna_name_mask]

  mouse_df_miR=finalized_mouse_gene_df.loc[finalized_mouse_gene_df['Name'].str.contains(mouse_p,flags=re.I,regex=True)]


In [32]:
mouse_df_miR.shape ### this dataset does not contain lncRNAs (e.g. miR-210HG); it has only miRNAs

(1282, 12)

In [33]:
mouse_df_miR.head(5)

Unnamed: 0,chrom,exonStarts,exonEnds,cdsStart,cdsEnd,strand,exon_count,txStart,txEnd,mature_mRNA_Length,Name,ID
56853,chr1,12425985,12426106,12426106,12426106,+,1,12425985,12426106,121,Mir6341,NR_105759.1
57475,chr1,20679009,20679082,20679082,20679082,+,1,20679009,20679082,73,Mir206,NR_029593.1
57476,chr1,20682768,20682887,20682887,20682887,+,1,20682768,20682887,119,Mir133b,NR_029902.1
57636,chr1,23272268,23272339,23272339,23272339,+,1,23272268,23272339,71,Mir30a,NR_029533.1
57637,chr1,23291700,23291784,23291784,23291784,+,1,23291700,23291784,84,Mir30c-2,NR_029717.1


In [34]:
### Write the microRNA-only table as a tab-separated file, without the index column
mirna_tsv_path = f'{current_dir}/Mouse_bash_script/df_mousemiR_NCBI.tsv'
mouse_df_miR.to_csv(mirna_tsv_path, sep="\t", index=False)

## 6. Before you run bedtools_intronic_miRs.sh, reshape the dataframe first and map intronic miRNA with the following steps

The reasons for reshaping the dataframe are the following:
* We need to use txStart, txEnd coordinates instead of exon coordinates to find intronic miRNAs. 
* The finalized TSV file will be "mouse_all_genes_no_miR_df_TX_loc_NCBI.tsv".
* We will use this new tsv file to overlap the coordinates of miR to map all the intronic miRNAs.
* Remember the new file has txStart and txEnd locations instead of exon locations.
* bedtools_intronic_miRs.sh will also map the rest of miRNAs that don't have mRNA host genes.