In [None]:
import sys
sys.path.append(r'../script/')
from protein_composition import *
import pandas as pd
import re
import os
import numpy as np

In [None]:
# Dataset to process. Other identifiers used with this notebook:
#   'BW25113', 'DH1', 'MG1655', 'Bacillus',
#   'Yeast_single_cell', 'Corynebacterium_RNA_seq'
# The value is substituted into every input/output path in the cells below.
strain = 'W3110'

# Data description

## E.coli
### Literature: Multi-omics integration accurately predicts cellular state in unexplored conditions for Escherichia coli
### Type: Absolute-level quantification

## Yeast_single_cell
### https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE122392
### Literature: Sensitive, high-throughput single-cell RNA-Seq reveals within-clonal transcript-correlations in yeast populations
### Type: Single-cell RNA-Seq

## Bacillus
### Microarray data, quantile-normalized with Bioconductor’s limma R package.
### Literature: Condition-Dependent Transcriptome Reveals High-Level Regulatory Architecture in Bacillus subtilis
### Type: Microarray

## Corynebacterium glutamicum
### RNA-seq data obtained using a Nextflow workflow
### Type: RNA-Seq

In [None]:
# Pick the expression table for the selected strain.
# Strains tagged '_single_cell' or '_RNA_seq' use '<strain>_transcriptome.csv';
# every other strain uses '<strain>_trans_transcriptome.csv'.
# NOTE(review): despite its name, `proteome_file` points at a transcriptome CSV.
if '_single_cell' in strain or '_RNA_seq' in strain:
    proteome_file = '../basic_data/omics_data/%s_transcriptome.csv' % strain
else:
    proteome_file = '../basic_data/omics_data/%s_trans_transcriptome.csv' % strain

In [None]:
# Per-strain output directory; the trailing slash is kept because later cells
# build file paths by plain string concatenation (result_file + '<name>').
result_file = '../analysis_result/initial_data/%s/' % strain

# exist_ok=True creates the directory (and any missing parents) only when
# needed, without the race between a separate exists() check and makedirs().
os.makedirs(result_file, exist_ok=True)

# Get protein MW/sequence for each gene ID using the UniProt API   
https://www.uniprot.org/help/uniprotkb_column_names   
https://www.uniprot.org/help/api_idmapping   

In [None]:
# Load the expression table; the first CSV column becomes the row index.
my_pro_exp = pd.read_csv(proteome_file, index_col=0)
print(my_pro_exp.shape)

# Path handed to GENENAME_2_ACC_from_uniprot together with the table's index.
# NOTE(review): presumably the helper queries UniProt for each index entry and
# writes the mapping to this file -- confirm in protein_composition.
gene_uniprot_outfile = '../download_data/Gene_name2ACC_MW_%s.txt' % strain
GENENAME_2_ACC_from_uniprot(my_pro_exp.index, gene_uniprot_outfile)

In [None]:
# Output path for the filtered mapping table, inside the per-strain result folder.
gene_uniprot_select_outfile = result_file + ('Gene_name2ACC_MW_%s_select.txt' % strain)

# gene_uniprot_select receives the raw mapping file and the output path.
# NOTE(review): its selection criteria are not visible here -- see protein_composition.
P_ACC_SEQ2ACC_MW_select = gene_uniprot_select(
    gene_uniprot_outfile,
    gene_uniprot_select_outfile,
)
P_ACC_SEQ2ACC_MW_select.head(5)

## Get amino acid composition from protein sequence

In [None]:
# Inputs: filtered gene/UniProt table and the amino acid reference table.
gene_uniprot_select_outfile = result_file + ('Gene_name2ACC_MW_%s_select.txt' % strain)
amino_acid_information_file = '../basic_data/amino_acid_information.csv'

# Outputs: two per-strain CSV paths passed to protein_amino_composition.
seq_amino_composition_file = result_file + ('seq_amino_composition_%s.csv' % strain)
seq_amino_composition_MW_file = result_file + ('seq_amino_composition_g_g_%s.csv' % strain)

# The helper returns two objects; the second is previewed below.
seq_amino_composition, seq_amino_composition_MW = protein_amino_composition(
    gene_uniprot_select_outfile,
    amino_acid_information_file,
    seq_amino_composition_file,
    seq_amino_composition_MW_file,
)
seq_amino_composition_MW.head(5)

# Amino acid composition based on protein sequence (normalized to 1 g protein)

In [None]:
# Input: the g/g composition CSV; output: its normalized counterpart.
seq_amino_composition_MW_file = result_file + ('seq_amino_composition_g_g_%s.csv' % strain)
seq_amino_composition_MW_norm_file = result_file + ('seq_amino_composition_g_g_norm_%s.csv' % strain)

# amino_mass_norm receives both paths; its result is previewed below.
seq_amino_composition_g_g_norm = amino_mass_norm(
    seq_amino_composition_MW_file,
    seq_amino_composition_MW_norm_file,
)
seq_amino_composition_g_g_norm.head(5)

# Amino acid composition of each protein (g / g protein), considering expression level under different conditions

In [None]:
# Re-derive the path so this cell does not depend on the earlier selection cell.
gene_uniprot_select_outfile = result_file + ('Gene_name2ACC_MW_%s_select.txt' % strain)

# Expression table, indexed by its first CSV column.
my_pro_exp = pd.read_csv(proteome_file, index_col=0)

# Filtered gene/UniProt table, indexed by its 'Gene ID' column.
P_ACC_SEQ2ACC_MW = pd.read_csv(gene_uniprot_select_outfile, index_col='Gene ID')

my_pro_exp.head(5)

In [None]:
# Input mapping table and the JSON path passed as the helper's third argument.
gene_uniprot_select_outfile = result_file + ('Gene_name2ACC_MW_%s_select.txt' % strain)
protein_expression_mass_norm_outfile = result_file + ('%s_exp_onecell.json' % strain)

# protein_expression_mass_norm receives the expression CSV, the mapping table
# and the output path; its return value is kept for later inspection.
protein_expression_mass_norm_json = protein_expression_mass_norm(
    proteome_file,
    gene_uniprot_select_outfile,
    protein_expression_mass_norm_outfile,
)

# Amino acid composition of each condition (g / g total protein), considering expression level

In [None]:
# Inputs: normalized per-protein composition CSV and the expression JSON.
seq_amino_composition_MW_norm_file = result_file + ('seq_amino_composition_g_g_norm_%s.csv' % strain)
protein_expression_mass_norm_outfile = result_file + ('%s_exp_onecell.json' % strain)

# Output path passed as the helper's third argument.
amino_composition_norm_onecell_outfile = result_file + ('amino_composition_g_g_norm_onecell_%s.json' % strain)

amino_acid_expression_mass_norm_json = amino_acid_expression_mass_norm(
    protein_expression_mass_norm_outfile,
    seq_amino_composition_MW_norm_file,
    amino_composition_norm_onecell_outfile,
)

In [None]:
# Load the nested JSON: {outer_key: {inner_key: {'total': value, ...}}}.
# NOTE(review): judging by the output file name, outer keys are conditions and
# inner keys are amino acids -- confirm against amino_acid_expression_mass_norm.
amino_composition_norm_onecell_outfile = result_file + 'amino_composition_g_g_norm_onecell_%s.json' % strain
amino_composition_norm_onecell = json_load(amino_composition_norm_onecell_outfile)

# Build the table in one shot (one row per outer key, one column per inner key,
# each cell holding that entry's 'total'). Growing a DataFrame cell by cell with
# .loc re-allocates on every new row/column and can produce unstable dtypes.
amino_composition_norm_onecell_df = pd.DataFrame(
    [
        {inner_key: entry['total'] for inner_key, entry in inner.items()}
        for inner in amino_composition_norm_onecell.values()
    ],
    index=list(amino_composition_norm_onecell.keys()),
)

# Persist the table with both row labels and column headers.
amino_composition_norm_onecell_df_outfile = result_file + 'amino_composition_proteome_by_condition_%s.csv' % strain
amino_composition_norm_onecell_df.to_csv(amino_composition_norm_onecell_df_outfile, header=True, index=True)