In [1]:
import pandas as pd
from Bio import GenBank
from Bio import SeqIO
import numpy as np

In [2]:
# Input file locations (relative to the notebook's working directory).
gene_map = 'old_regain_analysis/Gene ID and protein info.csv'
genbank = '../20z_genome_files/20z.gb'
plasmid_genbank = '../20z_genome_files/20z_plasmid.gb'
gene_exp_data = 'deseq_and_network_excel_sheets/raw_counts_with_ca.csv'

# Locus/protein mapping table. The lookup helpers below index it with the
# column labels '.New_Locus', '.Old_locus_tag' and '.Protein_id'.
df_gene_map = pd.read_csv(gene_map)

# Expression table to annotate. An earlier revision read it from an Excel
# sheet ('Sheet1'); it is now read from CSV.
df_data = pd.read_csv(gene_exp_data)

# Placeholder annotation columns, filled in row by row later on.
for annotation_column in ('Old Locus', 'New Locus', 'Kb Loc', 'Protein Length'):
    df_data[annotation_column] = None

# Each GenBank file is expected to hold exactly one record (SeqIO.read).
gb_record, plasmid_gb_record = (
    SeqIO.read(path, "genbank") for path in (genbank, plasmid_genbank)
)

In [3]:
def index_genbank_features(gb_record, feature_type, qualifier):
    """Map qualifier values to the position of the feature that carries them.

    Walks ``gb_record.features`` in order and, for every feature whose
    ``type`` equals ``feature_type`` and whose ``qualifiers`` contain
    ``qualifier``, records each value of that qualifier against the feature's
    index in the feature list.

    Parameters
    ----------
    gb_record : object exposing a ``features`` sequence
    feature_type : value compared against each feature's ``type``
    qualifier : key looked up in each feature's ``qualifiers`` mapping

    Returns
    -------
    dict
        ``{qualifier value: feature index}``. When a value occurs more than
        once, the first index is kept and a warning is printed.
    """
    lookup = {}
    for position, feat in enumerate(gb_record.features):
        # Skip features of another type or without the requested qualifier.
        if feat.type != feature_type or qualifier not in feat.qualifiers:
            continue
        # A qualifier holds a list of values (typically one locus_tag but
        # possibly several db_xref entries), so index every one of them.
        for key in feat.qualifiers[qualifier]:
            if key not in lookup:
                lookup[key] = position
                continue
            print("WARNING - Duplicate key %s for %s features %i and %i" \
               % (key, feature_type, lookup[key], position))
    return lookup

In [4]:
def new_from_old_locus(old_locus):
    """Return the new locus tag recorded for ``old_locus`` in ``df_gene_map``.

    Selects the rows of the module-level ``df_gene_map`` whose
    '.Old_locus_tag' equals ``old_locus`` and returns the '.New_Locus' value
    of the first such row.

    Returns
    -------
    The first matching '.New_Locus' value, or None when no row matches.

    Raises
    ------
    KeyError
        If ``df_gene_map`` lacks the '.Old_locus_tag' or '.New_Locus' column.
        This used to be swallowed by a bare ``except`` and reported as
        "no match", which hid a misnamed-column problem behind silent Nones.
    """
    matches = df_gene_map.loc[df_gene_map['.Old_locus_tag'] == old_locus, '.New_Locus']
    # An empty selection is the only expected miss; anything else should surface.
    return next(iter(matches), None)

def old_locus_from_wp(wp):
    """Return the old locus tag recorded for protein id ``wp`` in ``df_gene_map``.

    Selects the rows of the module-level ``df_gene_map`` whose '.Protein_id'
    equals ``wp`` and returns the '.Old_locus_tag' value of the first such row.

    Returns
    -------
    The first matching '.Old_locus_tag' value (which may itself be NaN if the
    table has a blank there), or None when no row matches.

    Raises
    ------
    KeyError
        If ``df_gene_map`` lacks the '.Protein_id' or '.Old_locus_tag' column.
        This used to be swallowed by a bare ``except`` and reported as
        "no match", which hid a misnamed-column problem behind silent Nones.
    """
    matches = df_gene_map.loc[df_gene_map['.Protein_id'] == wp, '.Old_locus_tag']
    # An empty selection is the only expected miss; anything else should surface.
    return next(iter(matches), None)

In [5]:
# Lookup tables: qualifier value -> index into the record's feature list,
# built separately for the chromosome and the plasmid GenBank records.
pid_cds_index = index_genbank_features(gb_record,"CDS","protein_id")
mealz_cds_index = index_genbank_features(gb_record,"CDS","locus_tag")

plasmid_pid_cds_index = index_genbank_features(plasmid_gb_record,"CDS","protein_id")
plasmid_mealz_cds_index = index_genbank_features(plasmid_gb_record,"CDS","locus_tag")

# Make sure the column exists (with object dtype) before the per-row writes.
if 'Predicted Function' not in df_data.columns:
    df_data['Predicted Function'] = None

# Annotate every row of df_data whose identifier (first column) can be
# resolved to a CDS feature. Rows that cannot be resolved are left untouched.
for idx, row in df_data.iterrows():
    # Positional access to the first column; plain row[0] is deprecated for
    # label-indexed Series.
    gene_id = row.iloc[0]

    record = None          # GenBank record that holds the matching feature
    feature_index = None   # position of that feature in record.features

    if 'CCE' in gene_id:
        # Identifier carries a 10-character 'CCE...' protein id.
        cce_start = gene_id.find('CCE')
        pid = gene_id[cce_start:cce_start+10]
        if pid in pid_cds_index:
            record, feature_index = gb_record, pid_cds_index[pid]
        elif pid in plasmid_pid_cds_index:
            record, feature_index = plasmid_gb_record, plasmid_pid_cds_index[pid]
        # Unknown protein ids are skipped below, like unknown locus tags,
        # instead of aborting the whole loop with a KeyError.
    else:
        if 'WP_' in gene_id:
            # 14-character 'WP_...' accession, translated to an old locus tag
            # through the gene map.
            wp_start = gene_id.find('WP_')
            wp = gene_id[wp_start:wp_start+14]
            chromosome_tag = plasmid_tag = old_locus_from_wp(wp)
            # pd.isna covers None and every NaN object; an identity test
            # against np.nan misses NaNs that are not that exact singleton.
            if pd.isna(chromosome_tag):
                continue
        else:
            # Locus tag embedded in the identifier: the chromosome lookup
            # uses a 10-character slice, the plasmid lookup an 11-character one.
            mealz_start = gene_id.find('MEALZ')
            chromosome_tag = gene_id[mealz_start:mealz_start+10]
            plasmid_tag = gene_id[mealz_start:mealz_start+11]

        if chromosome_tag in mealz_cds_index:
            record, feature_index = gb_record, mealz_cds_index[chromosome_tag]
        elif plasmid_tag in plasmid_mealz_cds_index:
            record, feature_index = plasmid_gb_record, plasmid_mealz_cds_index[plasmid_tag]

    if record is None:
        continue

    # Single annotation path for both records. All qualifiers, including the
    # translation, are read from the record the feature was found in
    # (previously the plasmid branch read the translation from the
    # chromosome record).
    feature = record.features[feature_index]
    qualifiers = feature.qualifiers

    df_data.at[idx, 'Predicted Function'] = qualifiers.get('product', [None])[0]
    old_locus = qualifiers.get('locus_tag', [None])[0]

    # Use the location object's start directly rather than parsing its
    # printed form, which fails for compound or fuzzy locations.
    df_data.at[idx, 'Kb Loc'] = int(feature.location.start) / 1000

    # Features without a translation qualifier get no protein length.
    translation = qualifiers.get('translation')
    df_data.at[idx, 'Protein Length'] = len(translation[0]) if translation else None

    df_data.at[idx, 'Old Locus'] = old_locus
    df_data.at[idx, 'New Locus'] = new_from_old_locus(old_locus)

   

In [6]:
# Write the annotated table to Excel, leaving out the DataFrame index.
output_path = 'deseq_and_network_excel_sheets/update_raw_counts_with_ca.xlsx'
df_data.to_excel(output_path, index=False)