In [2]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import networkx as nx
import itertools
from collections import Counter
from networkx.drawing.nx_agraph import graphviz_layout
import statistics
import seaborn as sns
from skbio.stats.composition import multiplicative_replacement
from skbio.stats.composition import clr
from tqdm import tqdm

## Build Strain Dictionarys for functional analysis of species within cohort

In [3]:
#Strain Dataframes
indianStrains = pd.read_csv('./DataFiles/Dataframes/strainDataframes/masterIndian_allStrains.csv').set_index("SampleName")
americanStrains = pd.read_csv('./DataFiles/Dataframes/strainDataframes/masterAmerican_allStrains.csv').set_index("SampleName")
europeanStrains = pd.read_csv('./DataFiles/Dataframes/strainDataframes/masterEuropean_allStrains.csv').set_index("SampleName")
japaneseStrains = pd.read_csv('./DataFiles/Dataframes/strainDataframes/masterJapanese_allStrains.csv').set_index("SampleName")

In [4]:
allStrains = pd.concat([indianStrains,americanStrains,europeanStrains,japaneseStrains]).copy()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  """Entry point for launching an IPython kernel.


In [5]:
accTaxDictDF = pd.read_csv("./DataFiles/Dataframes/Master_Rename.csv").set_index("Bacterial_Strain")
accTaxDictDF['Species'] = accTaxDictDF['Species'].astype(str)

accTaxDictDF2 = accTaxDictDF.copy()
for row in accTaxDictDF2.index:
    accTaxDictDF2.rename(index={row:str(row).replace(",","").replace("(","").replace(")","").replace("'","").replace(" ", '_').replace("[","").replace("]","").replace(".","")}, inplace=True)

## Pull Strain Proportions for each component

In [7]:
def buildCohortSpeciesStrainAbundance(strainDF):
    
    strainDF2 = strainDF.copy()
    strainDF2.replace(0.0, np.nan, inplace=True)
    strainDF2.dropna(axis=1, thresh=1, inplace=True)
    strainDF2.fillna(0.0, inplace=True)
    
    TempStrainDic = {accTaxDictDF2.at[str(k).replace(",","").replace("(","").replace(")","").replace("'","").replace(" ", '_').replace("[","").replace("]","").replace(".",""),'Species']:[] for k in strainDF2.columns}
    
    for column in strainDF2.columns:
        TempStrainDic[accTaxDictDF2.at[str(column).replace(",","").replace("(","").replace(")","").replace("'","").replace(" ", '_').replace("[","").replace("]","").replace(".",""),'Species']].append({column:strainDF2[column].mean()})
        
    newDictionary = {}
    for entry in TempStrainDic:
        
        newDictionary[entry] = []
        
        #Check if the old species only had 1 strain
        if int(len(TempStrainDic[entry])) == 1:
            for strain in TempStrainDic[entry]:
                for x,y in strain.items():
                    newDictionary[entry].append({x:1})
        
        #If the species has more than one strain
        if int(len(TempStrainDic[entry])) > 1:
            
            totalAbundance = 0.0
            for strain in TempStrainDic[entry]:
                for x,y in strain.items():
                    totalAbundance +=y

            for strain in TempStrainDic[entry]:
                for x,y in strain.items():
                    newDictionary[entry].append({x:(y/totalAbundance)})
                    
    return(newDictionary)

In [8]:
americanSpecies90 = pd.read_csv("./DataFiles/Dataframes/speciesDataframes/PrevFiltered/masterAmerican_90%Prev_Species.csv").set_index("SampleName")
indianSpecies90 = pd.read_csv("./DataFiles/Dataframes/speciesDataframes/PrevFiltered/masterIndian_90%Prev_Species.csv").set_index("SampleName")
europeanSpecies90 = pd.read_csv("./DataFiles/Dataframes/speciesDataframes/PrevFiltered/masterEuropean_90%Prev_Species.csv").set_index("SampleName")
japaneseSpecies90 = pd.read_csv("./DataFiles/Dataframes/speciesDataframes/PrevFiltered/masterJapanese_90%Prev_Species.csv").set_index("SampleName")

In [9]:
a90Species = list(americanSpecies90.columns)
i90Species = list(indianSpecies90.columns)
e90Species = list(europeanSpecies90.columns)
j90Species = list(japaneseSpecies90.columns)

In [10]:
americanStrainDic = buildCohortSpeciesStrainAbundance(americanStrains)
indianStrainDic = buildCohortSpeciesStrainAbundance(indianStrains)
europeanStrainDic = buildCohortSpeciesStrainAbundance(europeanStrains)
japaneseStrainDic = buildCohortSpeciesStrainAbundance(japaneseStrains)

## Find the Union of all Strains for Components

In [11]:
def pullStrains(listt):
    tempList = []
    for item in listt:
        if len(item) > 1:
            for key in item:
                for dictionary in key:
                    tempList.append(dictionary)
        else:
            for dictionary in (item):
                #print(dictionary)
                for x,y in dictionary.items():
                    tempList.append(x)
    return(tempList)

In [12]:
a90Strains = pullStrains([americanStrainDic[k] for k in a90Species])
i90Strains = pullStrains([indianStrainDic[k] for k in i90Species])
e90Strains = pullStrains([europeanStrainDic[k] for k in e90Species])
j90Strains = pullStrains([japaneseStrainDic[k] for k in j90Species])

In [14]:
def sendStrainsToFile(listt, filename):
    with open("./DataFiles/Dataframes/functionalAnalysis/cohortStrains/"+filename, "a+") as file:
        for x in listt:
            file.write(str(x).strip()+"\n")
    file.close()

In [15]:
#sendStrainsToFile(a90Strains, "american_Strains.txt")
#sendStrainsToFile(i90Strains, "indian_Strains.txt")
#sendStrainsToFile(e90Strains, "european_Strains.txt")
#sendStrainsToFile(j90Strains, "japanese_Strains.txt")

## Import Functional Potential DataFrame

In [16]:
TigrDF1 = pd.read_csv("./DataFiles/Dataframes/functionalAnalysis/master_TIGRBinary.csv")
TigrDF1.rename(columns={'Unnamed: 0':'Bacterial_Strains'}, inplace=True)
TigrDF1.set_index("Bacterial_Strains", inplace=True)
TigrDF2 = TigrDF1.div(TigrDF1.sum(axis=1), axis=0).copy()

In [17]:
def buildFunctionalSpeciesTigr(strainDict):
    
    testStrains = {}
    for entry in strainDict.values():
        for strain in entry:
            for x, y in strain.items():
                testStrains[str(x).replace(",","").replace("(","").replace(")","").replace("'","").replace(" ", '_').replace("[","").replace("]","").replace(".","")] = y
                
    newdf = TigrDF2.loc[testStrains].copy()
    
    for row in newdf.index:
        newdf.loc[row] = (newdf.loc[row]).mul(float(testStrains[row])).copy()
        
    strainRename = {k:accTaxDictDF2.at[k,'Species'] for k in list(testStrains.keys())}
    newdf.rename(index=strainRename, inplace=True)
    df5 = newdf.groupby(lambda x:x, axis=0).sum().copy()
    dfo2 = df5.div(df5.sum(axis=1), axis=0).copy()            

    return(dfo2)

In [18]:
americanSF_tigr = buildFunctionalSpeciesTigr(americanStrainDic)
indianSF_tigr = buildFunctionalSpeciesTigr(indianStrainDic)
europeanSF_tigr = buildFunctionalSpeciesTigr(europeanStrainDic)
japaneseSF_tigr = buildFunctionalSpeciesTigr(japaneseStrainDic)

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  if __name__ == '__main__':


In [19]:
#Check new species functionality df contains the same number of species as original dictionary
print(len(americanSF_tigr))
print(len(indianSF_tigr))
print(len(europeanSF_tigr))
print(len(japaneseSF_tigr))
print("\n")
print(len(americanStrainDic))
print(len(indianStrainDic))
print(len(europeanStrainDic))
print(len(japaneseStrainDic))

1030
1319
1351
1272


1030
1319
1351
1272


In [21]:
#americanSF_tigr.to_csv("./DataFiles/Dataframes/functionalAnalysis/americanSpecies_TIGRFAMs.csv")
#indianSF_tigr.to_csv("./DataFiles/Dataframes/functionalAnalysis/indianSpecies_TIGRFAMs.csv")
#europeanSF_tigr.to_csv("./DataFiles/Dataframes/functionalAnalysis/europeanSpecies_TIGRFAMs.csv")
#japaneseSF_tigr.to_csv("./DataFiles/Dataframes/functionalAnalysis/japaneseSpecies_TIGRFAMs.csv")