## Step#2
### This step is to organize your exonic microRNA files
* We only want to keep the primary miRNA names and their host genes.
(Run this section of code after running bedtools_human_exonic_miR.sh.)

## Organize the output file(exonic miRs)
* Assign gene types (noncoding vs. protein-coding genes).
* Map the regions of exonic microRNAs (5'UTR, 3'UTR, CDS).
* Map all the locations of miRs (exonic, intronic, no host mRNA, spanning an intron-exon boundary).

In [1]:
#bedtools will generate the overlapped exonic miR files--->human_exonic_miR_NCBI.tsv
import pandas as pd
import os
import re
# Names for the 19 fields of the headerless bedtools output, in file order:
# host-gene fields first, then the fields of the overlapping miRNA record.
columns = [
    # --- host gene / transcript ---
    'hostgene_chrom',
    'hostgene_exonStarts',
    'hostgene_exonEnds',
    'hostgene_cdsStart',
    'hostgene_cdsEnd',
    'hostgene_strand',
    'hostgene_exonCounts',
    'hostgene_txStarts',
    'hostgene_txEnds',
    'mature_host_mRNA_Length',
    'hostgene_Name',
    'hostgene_NCBI_ID',
] + [
    # --- overlapping miRNA ---
    'miR_chrom',
    'miR_exonStarts',
    'miR_exonEnds',
    'miR_type',
    'miR_Name',
    'miR_strand',
    'miR_ID',
]

In [2]:
# Base directory for every input/output path built in this notebook
# (Human_tsv_file/, Human_csv_file/, UCSC_data_Human/ are all resolved under it).
# NOTE: this is the kernel's working directory, so the notebook has to be
# started from the folder that contains those sub-directories.
current_dir=os.getcwd() # Get current directory

In [3]:
# Load the bedtools result for exonic microRNAs so it can be organized and finalized.
# The TSV carries no header row, so the column names are supplied explicitly.
human_exonicmiR_df = pd.read_csv(
    f'{current_dir}/Human_tsv_file/human_exonic_miR_NCBI.tsv',
    sep='\t',
    header=None,
    names=columns,
)

In [4]:
human_exonicmiR_df

Unnamed: 0,hostgene_chrom,hostgene_exonStarts,hostgene_exonEnds,hostgene_cdsStart,hostgene_cdsEnd,hostgene_strand,hostgene_exonCounts,hostgene_txStarts,hostgene_txEnds,mature_host_mRNA_Length,hostgene_Name,hostgene_NCBI_ID,miR_chrom,miR_exonStarts,miR_exonEnds,miR_type,miR_Name,miR_strand,miR_ID
0,chr1,175953957,175972080,175971939,176149067,-,13,175953957,176162971,19460,COP1,XM_047427799.1,chr1,175968370,175968479,miRNA_primary_transcript,Name=hsa-mir-1843,-,Alias=MI0032314
1,chr1,175953957,175972080,175971939,176149067,-,13,175953957,176162971,19460,COP1,XM_047427799.1,chr1,175968440,175968459,miRNA,Name=hsa-miR-1843,-,Alias=MIMAT0039764
2,chr1,175953957,175972080,175971939,176149067,-,13,175953957,176175979,19427,COP1,XM_017002080.3,chr1,175968370,175968479,miRNA_primary_transcript,Name=hsa-mir-1843,-,Alias=MI0032314
3,chr1,175953957,175972080,175971939,176149067,-,13,175953957,176175979,19427,COP1,XM_017002080.3,chr1,175968440,175968459,miRNA,Name=hsa-miR-1843,-,Alias=MIMAT0039764
4,chr1,175953957,175972080,175971939,176175951,-,17,175953957,176184692,19706,COP1,XM_017002071.2,chr1,175968370,175968479,miRNA_primary_transcript,Name=hsa-mir-1843,-,Alias=MI0032314
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1493,chr22,46086166,46095238,46097919,46097919,+,3,46086166,46097919,11634,LOC112268288,XR_002958733.2,chr22,46091059,46091080,miRNA,Name=hsa-miR-3619-5p,+,Alias=MIMAT0017999
1494,chr22,46086166,46095238,46097919,46097919,+,3,46086166,46097919,11634,LOC112268288,XR_002958733.2,chr22,46091090,46091111,miRNA,Name=hsa-miR-3619-3p,+,Alias=MIMAT0019219
1495,chr22,46086166,46095563,46097919,46097919,+,3,46086166,46097919,11634,LOC112268288,XR_002958732.2,chr22,46091044,46091126,miRNA_primary_transcript,Name=hsa-mir-3619,+,Alias=MI0016009
1496,chr22,46086166,46095563,46097919,46097919,+,3,46086166,46097919,11634,LOC112268288,XR_002958732.2,chr22,46091059,46091080,miRNA,Name=hsa-miR-3619-5p,+,Alias=MIMAT0017999


In [5]:
### Keep only the rows whose miR_type is "miRNA_primary_transcript"
### (the "miRNA" rows of the bedtools output are discarded).
is_primary_transcript = human_exonicmiR_df['miR_type'].eq("miRNA_primary_transcript")
human_exonicmiR_df = human_exonicmiR_df.loc[is_primary_transcript]

In [6]:
human_exonicmiR_df.shape

(514, 19)

In [7]:
# Tidy the miRNA name column of the exonic miR table.

# Renumber the rows 0..n-1 (the filter above left gaps in the index).
human_exonicmiR_df = human_exonicmiR_df.reset_index(drop=True)

# One-column frame holding the names with the "Name=" tag stripped off.
new_df = human_exonicmiR_df["miR_Name"].str.replace("Name=", "").to_frame("miR_name")

# Place the cleaned column at position 12 (right after the host-gene fields),
# then discard the raw "miR_Name" column it replaces.
human_exonicmiR_df.insert(12, "miR_name", new_df["miR_name"])
human_exonicmiR_df = human_exonicmiR_df.drop(columns=["miR_Name"])

In [8]:
human_exonicmiR_df.head()

Unnamed: 0,hostgene_chrom,hostgene_exonStarts,hostgene_exonEnds,hostgene_cdsStart,hostgene_cdsEnd,hostgene_strand,hostgene_exonCounts,hostgene_txStarts,hostgene_txEnds,mature_host_mRNA_Length,hostgene_Name,hostgene_NCBI_ID,miR_name,miR_chrom,miR_exonStarts,miR_exonEnds,miR_type,miR_strand,miR_ID
0,chr1,175953957,175972080,175971939,176149067,-,13,175953957,176162971,19460,COP1,XM_047427799.1,hsa-mir-1843,chr1,175968370,175968479,miRNA_primary_transcript,-,Alias=MI0032314
1,chr1,175953957,175972080,175971939,176149067,-,13,175953957,176175979,19427,COP1,XM_017002080.3,hsa-mir-1843,chr1,175968370,175968479,miRNA_primary_transcript,-,Alias=MI0032314
2,chr1,175953957,175972080,175971939,176175951,-,17,175953957,176184692,19706,COP1,XM_017002071.2,hsa-mir-1843,chr1,175968370,175968479,miRNA_primary_transcript,-,Alias=MI0032314
3,chr1,175953957,175972080,175971939,176175951,-,16,175953957,176184692,19646,COP1,XM_017002073.2,hsa-mir-1843,chr1,175968370,175968479,miRNA_primary_transcript,-,Alias=MI0032314
4,chr1,175953957,175972080,175971939,176206978,-,20,175953957,176207286,20618,COP1,XM_017002059.3,hsa-mir-1843,chr1,175968370,175968479,miRNA_primary_transcript,-,Alias=MI0032314


In [9]:
human_exonicmiR_df.shape

(514, 19)

In [11]:
# Folder (under the working directory) that receives the CSV export written in the next cell.
csv_file_path=f'{current_dir}/Human_csv_file'

In [12]:
# Save the cleaned exonic miR table (loaded from the bedtools TSV) as a CSV file.
# The output folder is created first: to_csv raises OSError when the target
# directory is missing (e.g. on a fresh clone, since git does not track empty folders).
os.makedirs(csv_file_path, exist_ok=True)
human_exonicmiR_df.to_csv(f'{csv_file_path}/human_exonic_miR_NCBI.csv', index=False)

## Before you run bedtools_intronic_miRs.sh, reshape the dataframe first and map intronic miRNA with the following steps

The reasons for reshaping the dataframe are the following:
* We need to use txStart and txEnd coordinates instead of exon coordinates to find intronic miRNAs.
* The finalized tsv file will be "human_all_genes_no_miR_df_TX_loc_NCBI.tsv".
* We will overlap the miR coordinates with this new tsv file to map all the intronic miRNAs.
* Remember that the new file has txStart and txEnd locations instead of exon locations.
* bedtools_intronic_miRs.sh will also map the rest of the miRNAs that don't have mRNA host genes.

In [13]:
import import_ipynb
from utils.Data import HUMAN

importing Jupyter notebook from /Users/tyronchang/Desktop/exonic-microrna-analysis/github/Human/utils/Data.ipynb


### First do the same steps that you did for exonic miRNAs to reshape the dataframe
* You don't have to use the exon coordinates of the host genes.

In [14]:
# Wrap the human gene coordinate table in the project's HUMAN helper (imported from utils.Data).
# Later cells reach the underlying table through `human.df`.
# NOTE(review): presumably HUMAN reads the CSV into a pandas DataFrame — confirm in utils/Data.ipynb.
human=HUMAN(f'{current_dir}/UCSC_data_Human/human_genes_010324_NCBI.csv')#human gene coordinates tables

In [15]:
human.df.columns

Index(['#"bin"', 'name', 'chrom', 'strand', 'txStart', 'txEnd', 'cdsStart',
       'cdsEnd', 'exonCount', 'exonStarts', 'exonEnds', 'score', 'name2',
       'cdsStartStat', 'cdsEndStat', 'exonFrames'],
      dtype='object')

In [16]:
"""This function reorganize and clean the columns by doing the following:"""
# Clean the raw column names through the HUMAN helper (the header listed above contains e.g. '#"bin"').
human.col_preprocess()
# Per the original author's notes, the call above is meant to be equivalent to:
#   human.df.columns = human.df.columns.str.replace("#", "")   -> get rid of "#" symbols in the column names
#   human.df.columns = human.df.columns.str.replace('"', "")   -> remove '"' characters in the column names
# NOTE(review): the actual implementation lives in utils/Data.ipynb — confirm there.

In [17]:
# Remove the 'bin' column through the HUMAN helper (per the author's note).
# NOTE(review): implementation is in utils/Data.ipynb — confirm it drops nothing else.
human.drop()### drop 'bin' column

In [18]:
# Give the 'name2' column a descriptive label; a later cell selects it as 'gene_name'.
# Kept as an in-place rename on human.df: whether HUMAN.df can be reassigned is not visible from here.
human.df.rename(columns={'name2':'gene_name'},inplace=True)
### Replace the column name

In [19]:
# Function to calculate mature mRNA length
def _split_exon_coords(field):
    """Parse a comma-separated coordinate string into a list of ints.

    A single empty final token (trailing comma, or an entirely empty field) is
    ignored; any other non-integer token raises ValueError.
    """
    tokens = field.split(',')
    if tokens[-1].strip() == '':
        # Trailing comma ("100,300,") or empty field: drop only that empty token,
        # never a real coordinate.
        tokens.pop()
    return [int(tok) for tok in tokens]


def calculate_mrna_length(row):
    """Return the summed exon length (mature mRNA length) for one gene record.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'exonStarts' and 'exonEnds' as comma-separated integer
        strings with one entry per exon, with or without a trailing comma
        (e.g. "100,300," or "100,300").

    Returns
    -------
    int
        Sum of (end - start) over the paired exon coordinates; 0 when both
        fields are empty.

    Raises
    ------
    ValueError
        If a coordinate token is not an integer.
    """
    starts = _split_exon_coords(row['exonStarts'])
    ends = _split_exon_coords(row['exonEnds'])
    # Starts and ends are paired positionally, one pair per exon.
    return sum(end - start for start, end in zip(starts, ends))

# Apply the function to each row and create a new column for mRNA length
# (axis=1: calculate_mrna_length receives one gene record at a time).
# NOTE(review): 'mature_mRNA_Length' is not among the columns selected into
# human_gene_df_tx in the next cell — confirm whether it is still needed here.
human.df['mature_mRNA_Length'] = human.df.apply(calculate_mrna_length, axis=1)

In [20]:
#### Transcript-level view: transcription start/end and CDS coordinates plus the
#### gene name and transcript ID. Exon coordinates are deliberately not selected.
tx_level_columns = [
    'chrom',
    'txStart',
    'txEnd',
    'cdsStart',
    'cdsEnd',
    'strand',
    'gene_name',
    'name',
]
human_gene_df_tx = human.select(tx_level_columns)

In [21]:
# Relabel to the generic 'Name' / 'ID' headers; the miRNA filter further down matches on 'Name'.
# NOTE(review): in-place rename kept as-is — whether human.select() returns a fresh copy
# or a view of human.df is not visible from this notebook.
human_gene_df_tx.rename(columns={'gene_name':'Name','name':'ID'},inplace=True)

In [22]:
#Use the same regex again
# Pattern for miRNA gene symbols (applied case-insensitively: re.I is passed at the call site):
#   MIR<digits>[-<digits>...]             e.g. MIR21, MIR1-1
#   MIR<digits><letter>[letters/digits]   e.g. MIR99A, MIR125B2 -- but not names ending in "HG" (MIR22HG)
#   anything starting with "hsa-mir"
#   MIRLET<digit><letter>[digits]         e.g. MIRLET7A1
# Groups are non-capturing so that Series.str.contains() does not emit its
# "pattern has match groups" UserWarning; the set of matched names is unchanged.
p=r'^MIR\d+(?:-\d+)*$|^MIR\d+[A-Z](?:[A-Z]|\d)*$(?<!HG)$|^hsa-mir|^MIRLET\d[A-Z]\d*$(?<!HG)$'

In [23]:
#### Unlike the previous file, this table carries txStart and txEnd coordinates.
#### Flag every gene whose name matches the miRNA pattern (case-insensitive),
#### then keep only the rows that are NOT flagged.
is_miR_gene = human_gene_df_tx['Name'].str.contains(p, flags=re.I, regex=True)
human_all_genes_no_miR_df_TX_loc = human_gene_df_tx.loc[~is_miR_gene]

  human_all_genes_no_miR_df_TX_loc=human_gene_df_tx.loc[~human_gene_df_tx['Name'].str.contains(


In [24]:
### Write the miRNA-free gene table (transcription start/end coordinates)
### as a tab-separated file, without the row index.
human_all_genes_no_miR_df_TX_loc.to_csv(
    f'{current_dir}/Human_tsv_file/human_all_genes_no_miR_df_TX_loc_NCBI.tsv',
    sep='\t',
    index=False,
)

In [25]:
#### Run bedtools_human_nonexonic_miR.sh and then bedtools_human_intronic_miR.sh

In [26]:
##### use this file, bedtools_human_intronic_miR.sh, to find intronic miRNAs.