# Import Necessary Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from yome import Session
from yome.models import *
from yome.util import to_df
import pandas as pd
from os.path import dirname, realpath, join
import glob
import logging

# Lift pandas' display truncation so frames render with every row and column.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, None)

ModuleNotFoundError: No module named 'sqlalchemy'

# Import/Format Necessary Tables

## Import EcoFit Ratios for Specific Phenotypes (specphes)

In [3]:
# Load the tab-separated fitness log-ratio table.
genefit = pd.read_csv(
    'fit_logratios_good.tab',
    delimiter='\t',
)

In [4]:
# Load the tab-separated experiment metadata and drop the columns this
# notebook does not use.
meta = pd.read_csv('expsUsed.txt', sep='\t')
# `columns=` replaces the positional axis argument (`drop(labels, 1)`),
# which pandas 2.0 no longer accepts.
meta = meta.drop(columns=['SetName', 'Index', 'Drop', 'num'])

In [5]:
# Load the specific-phenotype table and strip the literal 'specphe.' text
# from its column names.
phen = pd.read_csv('key_phen.txt', sep='\t')
# regex=False: the '.' must be a literal dot, not "any character", and the
# implicit regex default differs between pandas versions.
phen.columns = phen.columns.str.replace('specphe.', '', regex=False)

# Fitness rows whose locusId also occurs in the phenotype table.
match = genefit[genefit.locusId.isin(phen.locusId)]

# Attach each gene's sysName as `b_num` with one vectorised lookup instead of
# a nested iterrows loop (O(n*m)). keep='last' mirrors the old loop, in which
# the last matching fitness row won; rows without a match get NaN.
locus_to_sysname = (
    match.dropna(subset=['locusId'])
    .drop_duplicates(subset='locusId', keep='last')
    .set_index('locusId')['sysName']
)
phen['b_num'] = phen['locusId'].map(locus_to_sysname)

# Keyword form of drop(); the positional axis argument was removed in pandas 2.0.
phen = phen.drop(columns=['Condition_2', 'Concentration_2', 'Units_2', 'locusId'])

In [6]:
#phen.to_csv('ecofit_specphen_edited.csv', sep = '\t', index = False)
#meta_full = meta[['name', 'Description', 'Condition_1', 'Condition_2','Media', 'Growth.Method', 'Temperature', 'pH', 'Timecourse.Sample']]
#meta_full.to_csv('ecofit_meta_full.csv', sep = '\t', index = False)


## Import EcoFit Ratios and t-test like statistic for All Experiments

In [7]:
# Fitness log-ratios for all experiments. sysName is saved before the
# annotation columns are dropped so it can be re-attached as `bnum` later.
allfit = pd.read_csv('fit_logratios_good.tab', sep='\t')
all_bnums = allfit.sysName
# Keyword form of drop(); the positional axis argument was removed in pandas 2.0.
allfit = allfit.drop(columns=['locusId', 'desc', 'comb', 'sysName'])

# Matching table read from fit_t.tab, reduced to its value columns.
# NOTE(review): `all_bnums` (taken from the log-ratio file) is later assigned
# to `all_t` as well, which relies on both files listing genes in the same
# row order -- confirm.
all_t = pd.read_csv('fit_t.tab', sep='\t')
all_t = all_t.drop(columns=['locusId', 'desc', 'sysName'])

FileNotFoundError: File b'fit_t.tab' does not exist

## Import Y-ome and non Y-ome

In [None]:
# Query every gene recorded in the 'Y-ome' knowledgebase together with its
# primary name, synonyms and annotation quality, and load it into a DataFrame.
session = Session()
try:
    genes = to_df(
        session.query(Gene.locus_id, KnowledgebaseGene.primary_name, Synonym.synonym, Knowledgebase.name, KnowledgebaseGene.annotation_quality)
        .join(KnowledgebaseGene)
        .join(Synonym, KnowledgebaseGene.id == Synonym.ref_id)
        .join(Knowledgebase)
        .filter(Knowledgebase.name == 'Y-ome')
        , ['bnum', 'gene_name', 'syn', 'knowledgebase', 'ann_quality']
    )
finally:
    # Release the session even when the query or the conversion raises.
    session.close()

In [None]:
# A synonym identical to the primary gene name adds nothing; blank it out with
# the string 'None' so the duplicate rows this creates can be collapsed.
genes.loc[genes.syn == genes.gene_name, 'syn'] = 'None'
genes = genes.drop_duplicates().reset_index(drop=True)

# Remove rows whose synonym is itself a b-number or a primary gene name in the
# table. A boolean mask replaces `genes.drop(genes.index[[drop.index]])`, whose
# nested-list indexing of an Index is not supported by current pandas, and
# `pd.concat(..., 0)`, whose positional axis argument was removed in pandas 2.0.
syn_collides = genes.syn.isin(genes.bnum) | genes.syn.isin(genes.gene_name)
genes = genes[~syn_collides]

# Split by annotation quality: 'low' is the Y-ome, 'high' is well annotated.
yome = genes[genes.ann_quality == 'low']
well = genes[genes.ann_quality == 'high']

# One row per gene: drop the synonym column, then collapse duplicates.
yome_uniq = yome.drop(columns=['syn']).drop_duplicates()
well_uniq = well.drop(columns=['syn']).drop_duplicates()
genes_uniq = genes.drop(columns=['syn']).drop_duplicates()

# Renumber the per-synonym tables 0..n-1 (the *_uniq tables keep their labels).
yome = yome.reset_index(drop=True)
well = well.reset_index(drop=True)
genes = genes.reset_index(drop=True)

In [None]:
# Row counts of the per-synonym tables: (Y-ome, well annotated, all genes).
tuple(len(table) for table in (yome, well, genes))

In [None]:
# Row counts of the de-duplicated tables: (Y-ome, well annotated, all genes).
tuple(len(table) for table in (yome_uniq, well_uniq, genes_uniq))

# Focus on all Experiments

## Distribution and Stats (Y-ome vs. non Y-ome)

In [None]:
# Re-attach the saved sysName column as `bnum` on both value tables.
allfit = allfit.assign(bnum=all_bnums)
all_t = all_t.assign(bnum=all_bnums)

In [None]:
def _rows_for_gene_set(table, gene_set):
    """Return the rows of `table` whose `bnum` belongs to `gene_set`.

    A row is selected when its `bnum` equals either a `bnum` or a `syn` value
    of `gene_set`. Rows matched by `bnum` come first, followed by rows matched
    by `syn`; rows selected twice are collapsed by `drop_duplicates`.
    """
    by_bnum = table[table.bnum.isin(gene_set.bnum)]
    by_synonym = table[table.bnum.isin(gene_set.syn)]
    # axis is passed by keyword: the positional form `pd.concat(objs, 0)` was
    # removed in pandas 2.0.
    return pd.concat([by_bnum, by_synonym], axis=0).drop_duplicates()


# Y-ome (low annotation quality) subsets of the log-ratio and t tables.
allfit_y = _rows_for_gene_set(allfit, yome)
allt_y = _rows_for_gene_set(all_t, yome)

# Well-annotated (non Y-ome) subsets of the same two tables.
allfit_w = _rows_for_gene_set(allfit, well)
allt_w = _rows_for_gene_set(all_t, well)

In [None]:
#Melt function takes all entries and makes them 1 column so that histograms can be created
allfit_y_stack = pd.melt(allfit_y, id_vars = ['bnum'])
allfit_w_stack = pd.melt(allfit_w, id_vars = ['bnum'])
allt_y_stack = pd.melt(allt_y, id_vars = ['bnum'])
allt_w_stack = pd.melt(allt_w, id_vars = ['bnum'])

In [None]:
# Histogram of every Y-ome log-ratio value.
allfit_y_stack['value'].plot(kind='hist', title='Y-ome Log2Fitness Across All Conditions')

In [None]:
# Histogram of every non Y-ome log-ratio value.
allfit_w_stack['value'].plot(kind='hist', title='non Y-ome Log2Fitness Across All Conditions')

In [None]:
# Histogram of every Y-ome value from the t table.
allt_y_stack['value'].plot(kind='hist', title='Y-ome t-test like statistic Across All Conditions')

In [None]:
# Histogram of every non Y-ome value from the t table.
allt_w_stack['value'].plot(kind='hist', title='non Y-ome t-test like statistic Across All Conditions')

In [None]:
#definitions to make percentage effect calculations easier
numy = len(allfit_y_stack)
numw = len(allfit_w_stack)
posy = len(allfit_y_stack[allfit_y_stack.value > 0])
negy = len(allfit_y_stack[allfit_y_stack.value < 0])
posw = len(allfit_w_stack[allfit_w_stack.value > 0])
negw = len(allfit_w_stack[allfit_w_stack.value < 0])

In [None]:
# (% positive, % negative) log-ratio values among all Y-ome measurements.
tuple(100*count/numy for count in (posy, negy))

In [None]:
# (% positive, % negative) log-ratio values among all non Y-ome measurements.
tuple(100*count/numw for count in (posw, negw))

# Focus on SpecPhes

## Number of Genes w/ Specphes (Y-ome vs. non Y-ome)

In [None]:
# Specific-phenotype rows belonging to Y-ome genes, matched through a synonym
# (first) or the b-number (second); rows matched both ways are collapsed.
pheny1 = phen[phen.b_num.isin(yome.syn)]
pheny2 = phen[phen.b_num.isin(yome.bnum)]
# axis is passed by keyword: `pd.concat(objs, 0)` was removed in pandas 2.0.
pheny = pd.concat([pheny1, pheny2], axis=0).drop_duplicates()

# Same selection for well-annotated genes (b-number matches first here).
phenw1 = phen[phen.b_num.isin(well.bnum)]
phenw2 = phen[phen.b_num.isin(well.syn)]
phenw = pd.concat([phenw1, phenw2], axis=0).drop_duplicates()

# One-column frames of b-numbers: one row per specphe (bnums*) and one row
# per gene (unique_bnums*), each renumbered 0..n-1.
bnumsy = pheny[['b_num']].reset_index(drop=True)
bnumsw = phenw[['b_num']].reset_index(drop=True)
unique_bnumsy = bnumsy.drop_duplicates().reset_index(drop=True)
unique_bnumsw = bnumsw.drop_duplicates().reset_index(drop=True)

In [None]:
# (number of Y-ome genes with specphes, number of well-annotated genes with specphes)
tuple(len(table) for table in (unique_bnumsy, unique_bnumsw))

## Distribution of specphes/Gene (Y-ome vs. non Y-ome)

In [None]:
# Number of specphe rows recorded for each b-number, per gene set.
countsy, countsw = (table['b_num'].value_counts() for table in (bnumsy, bnumsw))

In [None]:
# Distribution of specphes per Y-ome gene.
countsy.plot(kind='hist', title='Specphes/Gene (Y-ome)')

In [None]:
# Distribution of specphes per non Y-ome gene.
countsw.plot(kind='hist', title='Specphes/Gene (Non Y-ome)')

## Specphe Lrn Histograms and Distribution Stats (Y-ome vs. non Y-ome)

In [None]:
# Histogram of the `lrn` column for Y-ome specphes.
pheny['lrn'].plot(kind='hist', title='Log2Fitness of Y-ome in Specphes')

In [None]:
# Histogram of the `lrn` column for non Y-ome specphes.
phenw['lrn'].plot(kind='hist', title='Log2Fitness of non Y-ome in Specphes')

In [None]:
# (positive KO effect %, negative KO effect %) among Y-ome specphes
tuple(100*(len(pheny[sign_mask])/len(pheny)) for sign_mask in (pheny.lrn > 0, pheny.lrn < 0))

In [None]:
# (positive KO effect %, negative KO effect %) among non Y-ome specphes.
# The previous comment was copied from the Y-ome cell and mislabelled this
# result; the values are computed from `phenw`, the well-annotated set.
100*(len(phenw[phenw.lrn > 0])/len(phenw)), 100*(len(phenw[phenw.lrn < 0])/len(phenw))

# Pulling up SpecPhes for Bnums

In [None]:
# Genes of interest: every specphe row whose b-number belongs to a Y-ome gene
# that has at least one specphe.
# To look up specific b-numbers instead, swap in a list, e.g.:
#   query = ['b1451']
#   goi = phen[phen.b_num.isin(query)]
goi = phen.loc[phen['b_num'].isin(unique_bnumsy['b_num'])]
goi

In [None]:
# Conditions of interest: metadata rows for the experiments named in `goi`.
coi = meta.loc[meta['name'].isin(goi['name'])]
# Show only the descriptive columns.
_coi_columns = ['name', 'Description', 'Condition_1', 'Condition_2','Media', 'Growth.Method', 'Temperature', 'pH', 'Timecourse.Sample']
coi[_coi_columns]

In [None]:
# Display every remaining metadata column for the conditions of interest.
coi

## Determining which conditions are SpecPhes for the most Y-ome genes (as well as how many of them have positive vs. negative fitness effects)

In [None]:
# Number of Y-ome specphe rows recorded for each Condition_1 value.
goi.loc[:, 'Condition_1'].value_counts()

In [None]:
# Split the Y-ome specphes by the sign of `lrn`.
goi_pos = goi.loc[goi['lrn'].gt(0)]
goi_neg = goi.loc[goi['lrn'].lt(0)]

In [None]:
# Conditions among Y-ome specphes with a positive `lrn` (positive KO effect on fitness).
goi_pos.loc[:, 'Condition_1'].value_counts()

In [None]:
# Conditions among Y-ome specphes with a negative `lrn` (negative KO effect on fitness).
goi_neg.loc[:, 'Condition_1'].value_counts()

In [None]:
# Negative-effect Y-ome specphes whose Group is 'nitrogen source'.
bac = goi_neg.loc[goi_neg['Group'].eq('nitrogen source')]

In [None]:
# Number of nitrogen-source specphe rows per b-number.
bac['b_num'].value_counts()

In [None]:
# Nitrogen-source specphe rows for gene b4256.
bac.loc[bac['b_num'].eq('b4256')]