In [3]:
import pandas as pd

### Import Headers and Residue Data from Henry's Data Extraction and Formatting

In [38]:
headers = pd.read_csv('headers_1000.csv')
headers = headers.set_index('Protein')
headers.head()

Unnamed: 0_level_0,name,head,structure_method,resolution,has_missing_residues
Protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2y39,ni-bound form of cupriavidus metallidurans ch34 cnrxs,metal binding protein,x-ray diffraction,1.41,True
2o73,structure of ohcu decarboxylase in complex with allantoin,lyase,x-ray diffraction,1.8,True
3d5m,crystal structure of hcv ns5b polymerase with a novel pyridazinone inhibitor,transferase,x-ray diffraction,2.2,True
1gey,crystal structure of histidinol-phosphate aminotransferase complexed with n-(5'-phosphopyridoxyl)-l-glutamate,transferase,x-ray diffraction,2.3,True
4y79,factor xa complex with gtc000406,hydrolase,x-ray diffraction,2.1,True


In [10]:
res_data = pd.read_csv('res_data_1000.csv', dtype='str')
res_data.head()

Unnamed: 0,2y39,2o73,3d5m,1gey,4y79,3gem,2z91,2jfk,3ueo,3qun,...,2qqt,7fdr,2x82,5uez,2hi2,1gzc,4msw,4bal,4wji,1ufo
0,GLY,ASP,SER,THR,ILE,SER,GLN,GLN,GLY,SER,...,SER,ALA,PRO,SER,THR,VAL,GLN,ALA,ALA,ARG
1,ASP,ILE,MET,ILE,VAL,ALA,LEU,SER,LEU,GLY,...,TRP,PHE,VAL,HIS,LEU,GLU,VAL,THR,GLN,VAL
2,LEU,ASN,SER,THR,GLY,PRO,LEU,MET,PHE,LEU,...,GLU,VAL,GLN,MET,ILE,THR,GLN,PHE,GLN,ARG
3,HIS,VAL,TYR,ASP,GLY,ILE,GLU,ARG,SER,VAL,...,VAL,VAL,HIS,GLU,GLU,ILE,LEU,GLU,PHE,THR
4,GLU,VAL,THR,LEU,GLN,LEU,SER,LEU,GLN,PRO,...,GLY,THR,VAL,GLN,LEU,SER,GLN,ILE,GLN,GLU


### Count total sequence length and fraction of electrically charged, unpolar, hydrophobic, and special case residues for each protein 
[See reference here for selection of amino acids groups](https://www.technologynetworks.com/applied-sciences/articles/essential-amino-acids-chart-abbreviations-and-structure-324357)

In [13]:
# keep index, width, and length from residue data frame
index = res_data.columns
width = len(index)
length = len(res_data)

In [16]:
# count sequence length for each protein entry
sequence_length = []

for i in range(width):
    sequence_length.append(res_data.iloc[:,i].count())

In [17]:
# count number of electrically charged residues in each protein entry (ARG, HIS, LYS, ASP, GLU)
electrically_charged = []

for j in range(width):
    sum_e_charged = 0
    for i in range(length):
        if res_data.iloc[i, j] == 'ARG':
            sum_e_charged += 1
        elif res_data.iloc[i, j] == 'HIS':
            sum_e_charged += 1
        elif res_data.iloc[i, j] == 'LYS':
            sum_e_charged += 1
        elif res_data.iloc[i, j] == 'ASP':
            sum_e_charged += 1
        elif res_data.iloc[i, j] == 'GLU':
            sum_e_charged += 1
    electrically_charged.append(sum_e_charged)

In [15]:
# count number of unpolar residues in each protein entry (SER, THR, ASN, GLN)
unpolar = []

for j in range(width):
    sum_unpolar = 0
    for i in range(length):
        if res_data.iloc[i, j] == 'SER':
            sum_unpolar += 1
        elif res_data.iloc[i, j] == 'THR':
            sum_unpolar += 1
        elif res_data.iloc[i, j] == 'ASN':
            sum_unpolar += 1
        elif res_data.iloc[i, j] == 'GLN':
            sum_unpolar += 1
    unpolar.append(sum_unpolar)

In [18]:
# count number of hydrophobic residues in each protein entry (ALA, VAL, ILE, LEU, MET, PHE, TYR, TRP)
hydrophobic = []

for j in range(width):
    sum_hydrophobic = 0
    for i in range(length):
        if res_data.iloc[i, j] == 'ALA':
            sum_hydrophobic += 1
        elif res_data.iloc[i, j] == 'VAL':
            sum_hydrophobic += 1
        elif res_data.iloc[i, j] == 'ILE':
            sum_hydrophobic += 1
        elif res_data.iloc[i, j] == 'LEU':
            sum_hydrophobic += 1
        elif res_data.iloc[i, j] == 'MET':
            sum_hydrophobic += 1
        elif res_data.iloc[i, j] == 'PHE':
            sum_hydrophobic += 1
        elif res_data.iloc[i, j] == 'TYR':
            sum_hydrophobic += 1
        elif res_data.iloc[i, j] == 'TRP':
            sum_hydrophobic += 1
    hydrophobic.append(sum_hydrophobic)

In [19]:
# count number of special case residues (CIS, GLY, PRO)
special = []

for j in range(width):
    sum_special = 0
    for i in range(length):
        if res_data.iloc[i, j] == 'CIS':
            sum_special += 1
        elif res_data.iloc[i, j] == 'GLY':
            sum_special += 1
        elif res_data.iloc[i, j] == 'PRO':
            sum_special += 1
    special.append(sum_special)

In [28]:
# calculate the fraciton of each group per total sequence length
fraction_electrically_charged = []
fraction_unpolar = []
fraction_hydrophobic = []
fraction_special = []

for i in range(width):
    fraction_electrically_charged.append(electrically_charged[i]/sequence_length[i])
    fraction_unpolar.append(unpolar[i]/sequence_length[i])
    fraction_hydrophobic.append(hydrophobic[i]/sequence_length[i])
    fraction_special.append(special[i]/sequence_length[i])

# create dataframe that includes protein properties
properties = pd.DataFrame({'Sequence Length': sequence_length, 'Electrically Charged': fraction_electrically_charged, 
                            'Nonpolar Side Chains': fraction_unpolar, 'Hydrophobic': fraction_hydrophobic, 
                            'Special': fraction_special}, index=index, dtype='int')

  
  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


In [30]:
properties.head()

Unnamed: 0,Sequence Length,Electrically Charged,Nonpolar Side Chains,Hydrophobic,Special
2y39,110,0.381818,0.109091,0.418182,0.0909091
2o73,992,0.302419,0.18246,0.40625,0.078629
3d5m,1116,0.232975,0.218638,0.40681,0.107527
1gey,335,0.21194,0.2,0.453731,0.107463
4y79,287,0.275261,0.205575,0.337979,0.125436


In [33]:
properties.dtypes

Sequence Length         int32 
Electrically Charged    object
Nonpolar Side Chains    object
Hydrophobic             object
Special                 object
dtype: object

### Create a summary dataframe which includes headers and properties

In [39]:
summary = pd.concat([headers, properties], axis=1, sort=False)
summary.head()

Unnamed: 0_level_0,name,head,structure_method,resolution,has_missing_residues,Sequence Length,Electrically Charged,Nonpolar Side Chains,Hydrophobic,Special
Protein,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2y39,ni-bound form of cupriavidus metallidurans ch34 cnrxs,metal binding protein,x-ray diffraction,1.41,True,110,0.381818,0.109091,0.418182,0.0909091
2o73,structure of ohcu decarboxylase in complex with allantoin,lyase,x-ray diffraction,1.8,True,992,0.302419,0.18246,0.40625,0.078629
3d5m,crystal structure of hcv ns5b polymerase with a novel pyridazinone inhibitor,transferase,x-ray diffraction,2.2,True,1116,0.232975,0.218638,0.40681,0.107527
1gey,crystal structure of histidinol-phosphate aminotransferase complexed with n-(5'-phosphopyridoxyl)-l-glutamate,transferase,x-ray diffraction,2.3,True,335,0.21194,0.2,0.453731,0.107463
4y79,factor xa complex with gtc000406,hydrolase,x-ray diffraction,2.1,True,287,0.275261,0.205575,0.337979,0.125436


In [47]:
summary.to_csv('summary_1000.csv', index=True)

In [46]:
check = pd.read_csv('summary_1000.csv')
check.head()

Unnamed: 0,Protein,name,head,structure_method,resolution,has_missing_residues,Sequence Length,Electrically Charged,Nonpolar Side Chains,Hydrophobic,Special
0,2y39,ni-bound form of cupriavidus metallidurans ch34 cnrxs,metal binding protein,x-ray diffraction,1.41,True,110,0.381818,0.109091,0.418182,0.090909
1,2o73,structure of ohcu decarboxylase in complex with allantoin,lyase,x-ray diffraction,1.8,True,992,0.302419,0.18246,0.40625,0.078629
2,3d5m,crystal structure of hcv ns5b polymerase with a novel pyridazinone inhibitor,transferase,x-ray diffraction,2.2,True,1116,0.232975,0.218638,0.40681,0.107527
3,1gey,crystal structure of histidinol-phosphate aminotransferase complexed with n-(5'-phosphopyridoxyl)-l-glutamate,transferase,x-ray diffraction,2.3,True,335,0.21194,0.2,0.453731,0.107463
4,4y79,factor xa complex with gtc000406,hydrolase,x-ray diffraction,2.1,True,287,0.275261,0.205575,0.337979,0.125436
