## Step#2
### This step is to organize your exonic microRNA files
* We only want to keep the primary miRNA names and their host genes.
(Run this section of code after running bedtools_mouse_exonic_miR.sh.)

## Organize the output file(exonic miRs)
* Assign gene types (noncoding vs. protein-coding genes).
* Map the regions of exonic microRNAs (5'UTR, 3'UTR, CDS).
* Map all the locations of miRs (exonic, intronic, no host mRNA, spanning an intron-exon boundary).

### Mouse

In [1]:
#bedtools will generate the overlapped exonic miR files--->mouse_exonic_miR_NCBI.tsv
import pandas as pd
import os
import re
# Names assigned to the fields of the header-less bedtools output:
# the host-gene fields come first, followed by the miRNA fields.
columns = [
    'hostgene_chrom',
    'hostgene_exonStarts',
    'hostgene_exonEnds',
    'hostgene_cdsStart',
    'hostgene_cdsEnd',
    'hostgene_strand',
    'hostgene_exonCounts',
    'hostgene_txStarts',
    'hostgene_txEnds',
    'mature_host_mRNA_Length',
    'hostgene_Name',
    'hostgene_NCBI_ID',
    'miR_chrom',
    'miR_exonStarts',
    'miR_exonEnds',
    'miR_type',
    'miR_Name',
    'miR_strand',
    'miR_ID',
]

In [2]:
# Working directory of the notebook; every input/output path below is built from it
current_dir = os.getcwd()

In [3]:
# Load the bedtools overlap result (tab-separated, no header row) and label
# its fields with the names listed in `columns`.
mouse_exonicmiR_df = pd.read_csv(
    f'{current_dir}/Mouse_tsv_file/mouse_exonic_miR_NCBI.tsv',
    sep='\t',
    names=columns,
    header=None,
)

In [4]:
mouse_exonicmiR_df.head()

Unnamed: 0,hostgene_chrom,hostgene_exonStarts,hostgene_exonEnds,hostgene_cdsStart,hostgene_cdsEnd,hostgene_strand,hostgene_exonCounts,hostgene_txStarts,hostgene_txEnds,mature_host_mRNA_Length,hostgene_Name,hostgene_NCBI_ID,miR_chrom,miR_exonStarts,miR_exonEnds,miR_type,miR_Name,miR_strand,miR_ID
0,chr1,195033822,195037908,195037908,195037908,+,8,195017398,195037908,5195,A330023F24Rik,NR_015566.2,chr1,195037040,195037120,miRNA_primary_transcript,Name=mmu-mir-29b-2,+,Alias=MI0000712
1,chr1,195033822,195037908,195037908,195037908,+,8,195017398,195037908,5195,A330023F24Rik,NR_015566.2,chr1,195037050,195037074,miRNA,Name=mmu-miR-29b-2-5p,+,Alias=MIMAT0017063
2,chr1,195033822,195037908,195037908,195037908,+,8,195017398,195037908,5195,A330023F24Rik,NR_015566.2,chr1,195037091,195037113,miRNA,Name=mmu-miR-29b-3p,+,Alias=MIMAT0000127
3,chr1,195033822,195037908,195037908,195037908,+,8,195017398,195037908,5195,A330023F24Rik,NR_015566.2,chr1,195037547,195037634,miRNA_primary_transcript,Name=mmu-mir-29c,+,Alias=MI0000577
4,chr1,195033822,195037908,195037908,195037908,+,8,195017398,195037908,5195,A330023F24Rik,NR_015566.2,chr1,195037562,195037583,miRNA,Name=mmu-miR-29c-5p,+,Alias=MIMAT0004632


In [5]:
# Keep only the rows annotated as primary miRNA transcripts;
# rows with any other 'miR_type' value are dropped.
mouse_exonicmiR_df = mouse_exonicmiR_df[
    mouse_exonicmiR_df['miR_type'].eq("miRNA_primary_transcript")
]

In [6]:
mouse_exonicmiR_df.shape

(339, 19)

In [7]:
# Clean the miRNA name column of the exonic miR table:
# strip the literal "Name=" text and store the result as "miR_name" at
# column position 12 (right after the host-gene columns), replacing the raw
# "miR_Name" column.

# Reset the index (rows were filtered in the previous step)
mouse_exonicmiR_df = mouse_exonicmiR_df.reset_index(drop=True)

# Guard: once "miR_Name" has been replaced the work is done, so running this
# cell a second time is a no-op instead of raising a KeyError.
if "miR_Name" in mouse_exonicmiR_df.columns:
    # regex=False: "Name=" is plain text, so the result does not depend on the
    # pandas-version-specific default of the `regex` argument.
    new_df = pd.DataFrame(
        {"miR_name": mouse_exonicmiR_df["miR_Name"].str.replace("Name=", "", regex=False)}
    )

    # Put the cleaned column at position 12, then remove the raw one
    # (reassignment instead of inplace=True keeps the step easy to follow)
    mouse_exonicmiR_df.insert(12, "miR_name", new_df['miR_name'])
    mouse_exonicmiR_df = mouse_exonicmiR_df.drop(columns=["miR_Name"])

In [8]:
mouse_exonicmiR_df.head()

Unnamed: 0,hostgene_chrom,hostgene_exonStarts,hostgene_exonEnds,hostgene_cdsStart,hostgene_cdsEnd,hostgene_strand,hostgene_exonCounts,hostgene_txStarts,hostgene_txEnds,mature_host_mRNA_Length,hostgene_Name,hostgene_NCBI_ID,miR_name,miR_chrom,miR_exonStarts,miR_exonEnds,miR_type,miR_strand,miR_ID
0,chr1,195033822,195037908,195037908,195037908,+,8,195017398,195037908,5195,A330023F24Rik,NR_015566.2,mmu-mir-29b-2,chr1,195037040,195037120,miRNA_primary_transcript,+,Alias=MI0000712
1,chr1,195033822,195037908,195037908,195037908,+,8,195017398,195037908,5195,A330023F24Rik,NR_015566.2,mmu-mir-29c,chr1,195037547,195037634,miRNA_primary_transcript,+,Alias=MI0000577
2,chr1,20682712,20682958,20682958,20682958,+,3,20669881,20682958,521,Lincmd1,NR_131249.1,mmu-mir-133b,chr1,20682769,20682887,miRNA_primary_transcript,+,Alias=MI0000821
3,chr1,86351980,86352127,86352127,86352127,-,1,86351980,86352127,147,Gm24148,NR_106184.1,mmu-mir-3535,chr1,86351981,86352127,miRNA_primary_transcript,-,Alias=MI0026036
4,chr1,134946233,134946299,134946299,134946299,-,1,134946233,134946299,66,Gm51265,NR_162775.1,mmu-mir-5104b,chr1,134946234,134946299,miRNA_primary_transcript,-,Alias=MI0040615


In [9]:
mouse_exonicmiR_df.shape

(339, 19)

In [10]:
# Folder (under the working directory) that receives the CSV output
csv_file_path = f'{current_dir}/Mouse_csv_file'

In [28]:
# Write the mouse exonic-miRNA table (read from the bedtools TSV) as a CSV file,
# leaving out the DataFrame index.
mouse_exonicmiR_df.to_csv(
    f'{csv_file_path}/mouse_exonic_miR_NCBI.csv',
    index=False,
)

## Before you run bedtools_intronic_miRs.sh, reshape the dataframe first and map intronic miRNA with the following steps

The reasons for reshaping the dataframe are as follows:
* We need to use the txStart and txEnd coordinates instead of the exon coordinates to find intronic miRNAs.
* The finalized tsv file will be "mouse_all_genes_no_miR_df_TX_loc_NCBI.tsv".
* We will overlap the miR coordinates with this new tsv file to map all the intronic miRNAs.
* Remember that the new file has txStart and txEnd locations instead of exon locations.
* bedtools_intronic_miRs.sh will also map the remaining miRNAs that don't have mRNA host genes.

In [12]:
import import_ipynb
from utils.Data import MOUSE

importing Jupyter notebook from /Users/tyronchang/Desktop/exonic-microrna-analysis/github3/Mouse/utils/Data.ipynb


### First, repeat the steps used for the exonic miRNAs to reshape the dataframe
* You don't have to use the exon coordinates of the host genes.

In [14]:
# Load the mouse gene-coordinate table through the project's MOUSE helper
mouse = MOUSE(
    f'{current_dir}/UCSC_data_Mouse/mouse_genes_020324_NCBI.csv'
)

In [16]:
mouse.df.columns

Index(['#"bin"', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart',
       'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'score', 'name2',
       'cdsStartStat', 'cdsEndStat', 'exonFrames'],
      dtype='object')

In [17]:
"""This function reorganize and clean the columns by doing the following:"""
mouse.col_preprocess()
# NOTE(review): col_preprocess() is defined in utils/Data.ipynb and is not visible here.
# Per the original notes it presumably cleans the column names by
#   - removing the "#" symbols, and
#   - removing the double-quote characters (e.g. '#"bin"' -> 'bin').
# TODO confirm against the helper's implementation.

In [18]:
# Drop the 'bin' column (per the original note; drop() lives in utils/Data.ipynb — TODO confirm)
mouse.drop()

In [19]:
# Rename the 'name2' column to 'gene_name' (modifies mouse.df in place)
mouse.df.rename(columns={'name2':'gene_name'},inplace=True)

In [20]:
# Function to calculate mature mRNA length
def calculate_mrna_length(row):
    """Return the mature transcript length: the summed length of all exons.

    Parameters
    ----------
    row : mapping (e.g. a pandas Series or a dict)
        Must provide 'exonStarts' and 'exonEnds' as comma-separated strings
        of integer coordinates. A trailing comma (as in "10,30,") is accepted
        but not required.

    Returns
    -------
    int
        Sum of ``end - start`` over the paired exon coordinates
        (0 when the coordinate strings are empty).

    Raises
    ------
    ValueError
        If a non-empty coordinate token is not an integer.
    """
    def _parse_coords(field):
        # Skip empty tokens instead of blindly dropping the last element, so
        # the final exon is kept when the string has no trailing comma.
        return [int(token) for token in field.split(',') if token.strip()]

    starts = _parse_coords(row['exonStarts'])
    ends = _parse_coords(row['exonEnds'])
    return sum(end - start for start, end in zip(starts, ends))

# Compute the mature mRNA length row by row (axis=1) and store it
# in a new 'mature_mRNA_Length' column
mouse.df['mature_mRNA_Length'] = mouse.df.apply(
    calculate_mrna_length,
    axis=1,
)

In [21]:
# Table holding the transcription start/end (txStart/txEnd) coordinates
# rather than the exon coordinates
mouse_gene_df_tx = mouse.select(
    [
        'chrom', 'txStart', 'txEnd',
        'cdsStart', 'cdsEnd', 'strand',
        'gene_name', 'name',
    ]
)

In [22]:
# Rename columns in place: 'gene_name' -> 'Name', 'name' -> 'ID'
mouse_gene_df_tx.rename(columns={'gene_name':'Name','name':'ID'},inplace=True)

In [23]:
# Regex for miRNA gene symbols (Mir<number>..., Mirlet<digit><letter>...);
# the negative lookbehind (?<!hg) rejects symbols that end in "hg".
# Per the original note, this is the same pattern used earlier in the workflow.
# Groups are non-capturing, (?:...), so pandas `str.contains` does not emit its
# "pattern ... has match groups" UserWarning; the set of matched names is unchanged.
mouse_p=r'^Mir\d+(?:-\d+)*$|^Mir\d+[a-z](?:-\d+)*$|^Mir\d+[a-z](?:[a-z]|\d)*$(?<!hg)$|^Mirlet\d[a-z]-*\d*$(?<!hg)$'

In [24]:
# Remove every row whose gene name matches the miRNA pattern `mouse_p`
# (case-insensitive), keeping only the non-miRNA genes. Unlike the exonic
# table, this one carries txStart and txEnd coordinates.
# na=False: a missing gene name counts as "not a miRNA", so the mask stays
# strictly boolean and the `~` negation cannot fail on NaN entries.
mouse_all_genes_no_miR_df_TX_loc = mouse_gene_df_tx.loc[
    ~mouse_gene_df_tx['Name'].str.contains(mouse_p, flags=re.I, regex=True, na=False)
]

  mouse_all_genes_no_miR_df_TX_loc=mouse_gene_df_tx.loc[~mouse_gene_df_tx['Name'].str.contains(


In [25]:
# Save the miRNA-free table of transcription start/end coordinates as a
# tab-separated file, leaving out the DataFrame index
mouse_all_genes_no_miR_df_TX_loc.to_csv(
    f'{current_dir}/Mouse_tsv_file/mouse_all_genes_no_miR_df_TX_loc_NCBI.tsv',
    sep='\t',
    index=False,
)

In [26]:
#### Run bedtools_mouse_nonexonic_miR.sh and then bedtools_mouse_intronic_miR.sh

In [27]:
##### use this file, bedtools_mouse_intronic_miR.sh, to find intronic miRNAs.