In [143]:
import pandas as pd
import os
import ast
import numpy
from Bio.SeqUtils.ProtParam import ProteinAnalysis
pd.options.display.max_colwidth = 100

In [100]:
# Load every result file found in the working directory.  Each matching file is
# parsed with ast.literal_eval, which only accepts Python literals and never
# executes code from the file.
# os.listdir returns names in arbitrary order, so the listing is sorted to keep
# positions in all_results (e.g. all_results[0]) stable between runs and machines.
all_results = []
for f in sorted(os.listdir('.')):
    if '.data' in f:
        with open(f, 'r') as g:
            all_results.append(ast.literal_eval(g.read()))

In [86]:
def standardize_to_uM(concentration, unit, sequence):
    """Convert a reported concentration to micromolar (uM).

    Args:
        concentration: The reported value, normally text such as u'0.4';
            embedded spaces are removed before parsing.  Plain numbers are
            accepted as well.
        unit: The unit label that accompanied the value.  Surrounding
            whitespace and a trailing ')' are ignored, and the micro sign may
            be written as 'u', U+00B5 or U+03BC.
        sequence: Peptide sequence; only used to compute a molecular weight
            when converting from a mass concentration (ug/ml).

    Returns:
        The concentration in uM as a float, or None when the value cannot be
        parsed as a number, the unit is not recognised, or computing the
        molecular weight of ``sequence`` raises ValueError.
    """
    # Only text needs the embedded-space cleanup; numbers go straight to float().
    if hasattr(concentration, 'replace'):
        concentration = concentration.replace(' ', '')
    try:
        concentration = float(concentration)
    except (TypeError, ValueError):
        # Not a plain number (or missing): report as unavailable rather than
        # hiding unrelated errors behind a bare except.
        return None

    if not hasattr(unit, 'strip'):
        return None
    # Some labels carry a stray closing parenthesis (e.g. u'uM)'); normalise once
    # here instead of listing a ')' variant for every spelling.
    unit = unit.strip().rstrip(')').rstrip()

    if unit in (u'uM', u'\xb5M', u'\u03bcM'):
        return concentration
    elif unit in (u'ug/ml', u'\xb5g/ml', u'\u03bcg/ml'):
        try:
            molWt = ProteinAnalysis(sequence).molecular_weight()
        except ValueError:
            return None
        # ug/ml is mg/L; dividing by g/mol gives mmol/L, and x1000 gives umol/L.
        return concentration * 1000/molWt
    elif unit in (u'nmol/g', u'pmol/mg'):
        # 1 g at a density of 1 g/mL is 1 mL, so nmol/g == nmol/mL == umol/L == uM
        # (pmol/mg is the same ratio).
        return concentration
    else:
        return None

In [4]:
# Column layout for the placeholder frame created in the next cell.
# NOTE(review): the rows built later in this notebook use the keys 'bacterium'
# and 'is_modified' rather than 'bacteria', so this list does not describe the
# final frame.
columns = ['sequence', 'url_source', 'bacteria', 'strain', 'value', 'unit']

In [5]:
# Empty placeholder frame with the column layout defined above.
# NOTE(review): `df` is rebuilt from `rows` further down, so this frame appears
# to be unused -- confirm before removing.
df = pd.DataFrame(columns=columns)

In [10]:
# Example peptide used to look up a single record in the next cell.
sequence = 'GLPRKILCAIAKKKGKCKGPLKLVCKC'

In [11]:
# Inspect the record stored for the example peptide in the first loaded result set.
all_results[0][sequence]

{'bacteria': {(u'B. subtilis', None): {'unit': u'uM)', 'value': u'0.4'}},
 'modifications': ['disulfide'],
 'reference': u'Amino Acids. 2012 Aug;43(2):751-61. PubMed. | http://www.ncbi.nlm.nih.gov/pubmed/22038181',
 'url_sources': ['http://aps.unmc.edu/AP/database/query_output.php?ID=1958']}

In [173]:
def is_modified(result):
    """Report whether a record lists any modification other than a C-Term one.

    Records without a 'modifications' entry, or whose modifications all
    contain the text 'C-Term', are reported as unmodified (False).
    """
    if 'modifications' in result:
        for modification in result['modifications']:
            if 'C-Term' not in modification:
                return True
    return False

In [174]:
def convert_result_to_rows(sequence, result):
    """Flatten one peptide record into one row per (bacterium, strain) measurement.

    Args:
        sequence: Peptide sequence the record belongs to.
        result: Record dict.  ``result['bacteria']`` maps ``(bacterium, strain)``
            tuples to dicts with 'value' and 'unit' entries; 'url_sources' and
            'modifications' are read when present.

    Returns:
        A list of dicts with the keys 'bacterium', 'strain', 'sequence'
        (upper-cased), 'url_source', 'value', 'is_modified' and 'unit'.
        'value' is log10 of the concentration in uM, or None when the
        concentration could not be standardised or is not a positive finite
        number.  The list is empty when the record has no 'bacteria' entry.
    """
    rows = []
    measurements = result.get('bacteria')
    if not measurements:
        return rows

    # Use one canonical spelling for both the stored row and the unit conversion.
    sequence = sequence.upper()
    # Tolerate records without any source URL instead of raising.
    url_sources = result.get('url_sources') or [None]
    url_source = url_sources[0]
    # The modification flag depends only on the record, so compute it once.
    modified = is_modified(result)

    for (bacterium, strain), measurement in measurements.items():
        value = standardize_to_uM(
            measurement['value'],
            measurement['unit'],
            sequence
        )
        # log10 is only defined for positive values.  Anything else becomes
        # missing; a raw 0 left in place would read as log10(1 uM) downstream.
        if value is not None and numpy.isfinite(value) and value > 0:
            value = numpy.log10(value)
        else:
            value = None

        rows.append({
            'bacterium': bacterium,
            'strain': strain,
            'sequence': sequence,
            'url_source': url_source,
            'value': value,
            'is_modified': modified,
            # Unit of the underlying concentration; 'value' itself is log10 of it.
            'unit': 'uM'
        })
    return rows

In [175]:
# Flatten every record of every loaded result set into one list of row dicts,
# printing a progress line after each result set.
rows = []
for result_set in all_results:
    for sequence, result in result_set.items():
        rows.extend(convert_result_to_rows(sequence, result))
    print("Finished result set")

Finished result set
Finished result set
Finished result set
Finished result set
Finished result set
Finished result set
Finished result set


In [176]:
# Build the working frame from the flattened rows (one row per measurement);
# this replaces the empty placeholder `df` created earlier in the notebook.
df = pd.DataFrame(rows)

In [179]:
# Keep only the rows that is_modified() did not flag.
df = df.loc[df['is_modified'].eq(False)]

In [240]:
def _mean_value_by_sequence(frame, species):
    """Return the mean 'value' per sequence for rows whose bacterium mentions ``species``.

    The species name is matched as a literal substring (regex=False), so the '.'
    in names such as 'S. aureus' is not treated as a regex wildcard.  Rows with
    a missing bacterium are excluded (na=False) instead of breaking the boolean
    mask.  Sequences whose mean is missing are dropped.
    """
    mask = frame['bacterium'].str.contains(species, regex=False, na=False)
    return frame.loc[mask].groupby('sequence')['value'].mean().dropna()


# 'value' holds log10 of the uM concentration, so each mean is taken in log space.
staph = _mean_value_by_sequence(df, 'S. aureus')
ecoli = _mean_value_by_sequence(df, 'E. coli')
pseudomonas = _mean_value_by_sequence(df, 'P. aeruginosa')
streptococcus = _mean_value_by_sequence(df, 'S. mutans')
bacillus = _mean_value_by_sequence(df, 'B. subtilis')

In [236]:
# Line up the per-sequence means of both species side by side (aligned on the
# sequence index), then correlate the two columns.
ecoli_staph = pd.concat([ecoli, staph], axis=1)
ecoli_staph = ecoli_staph.reset_index()
ecoli_staph.columns = ['index', 'ecoli_value', 'staph_value']
ecoli_staph.loc[:, 'ecoli_value'].corr(ecoli_staph.loc[:, 'staph_value'])

0.67367977494727116

In [237]:
# Line up the per-sequence means of both species, keeping only sequences that
# were measured against both, then correlate the two columns.
ecoli_pseudomonas = pd.concat([ecoli, pseudomonas], axis=1).reset_index().dropna()
ecoli_pseudomonas.columns = ['index', 'ecoli_value', 'pseudomonas_value']
# Correlate against this frame's own column; the previous reference to `z` was a
# leftover name that is not defined anywhere in the notebook.
ecoli_pseudomonas['ecoli_value'].corr(ecoli_pseudomonas['pseudomonas_value'])

0.77724751649817214

In [238]:
# Line up the per-sequence means of both species, keep only sequences measured
# against both, then correlate the two columns.
pseudomonas_staph = pd.concat([pseudomonas, staph], axis=1)
pseudomonas_staph = pseudomonas_staph.reset_index().dropna()
pseudomonas_staph.columns = ['index', 'pseudomonas_value', 'staph_value']
pseudomonas_staph.loc[:, 'staph_value'].corr(pseudomonas_staph.loc[:, 'pseudomonas_value'])

0.6519294681149479

In [241]:
# Line up the per-sequence means of all five species side by side.
many_bacteria = pd.concat([ecoli, pseudomonas, streptococcus, staph, bacillus], axis=1).reset_index()
many_bacteria.columns = ['index', 'ecoli', 'pseudomonas', 'streptococcus', 'staph', 'bacillus']
# Correlate only the measurement columns.  'index' holds the sequence strings,
# and DataFrame.corr() raises on non-numeric columns in recent pandas releases
# instead of silently skipping them.
many_bacteria[['ecoli', 'pseudomonas', 'streptococcus', 'staph', 'bacillus']].corr()

Unnamed: 0,ecoli,pseudomonas,streptococcus,staph,bacillus
ecoli,1.0,0.777248,0.714585,0.67368,0.682036
pseudomonas,0.777248,1.0,0.520216,0.651929,0.594575
streptococcus,0.714585,0.520216,1.0,0.825801,0.81689
staph,0.67368,0.651929,0.825801,1.0,0.665278
bacillus,0.682036,0.594575,0.81689,0.665278,1.0


In [242]:
# Number of sequences that have a mean value for B. subtilis.
len(bacillus)

1464