In [76]:
import pandas as pd
import numpy as np

## AAIndex 1

In [77]:
def conversion_aa1(data):
    """Convert a raw AAIndex1 text file into a usable pandas DataFrame.

    The file is scanned line by line. A line starting with ``H`` supplies
    the identifier of the current entry (its second whitespace-separated
    token), and the two lines that follow a line starting with ``I`` supply
    the entry's 20 values: the first line fills columns ``A`` .. ``I`` and
    the second fills ``L`` .. ``V``, in the column order defined below.

    Parameters
    ----------
    data : str or path-like
        Path to the AAIndex1 text file.

    Returns
    -------
    pandas.DataFrame
        One row per entry, with a ``Description`` column followed by one
        column per amino acid. Values are kept exactly as the strings read
        from the file (no numeric conversion), on a default 0..n-1 index.

    Raises
    ------
    ValueError
        If an ``I`` block appears before any ``H`` line, or if the two
        value lines of an entry do not hold exactly 20 tokens in total.
    """
    # define column names
    col1 = ['Description']
    aa = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
          'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']
    columns = col1 + aa

    # Rows are collected in a plain list and the frame is built once at the
    # end: DataFrame.append no longer exists in pandas >= 2.0, and growing a
    # frame row by row is quadratic anyway.
    records = []
    description = None   # identifier taken from the most recent 'H' line
    values = []          # tokens gathered for the current 'I' block
    lines_left = 0       # value lines still expected after the last 'I' line

    # conversion by parsing text file line by line
    with open(data) as f:
        for line in f:
            if line.startswith('H'):
                description = line.split()[1]
            if line.startswith('I'):
                # the 'I' line itself is only a header; its values follow
                values = []
                lines_left = 2
                continue
            if lines_left:
                values.extend(line.split())
                lines_left -= 1
                if lines_left == 0:
                    if description is None:
                        raise ValueError(
                            "found an 'I' block before any 'H' line")
                    if len(values) != len(aa):
                        raise ValueError(
                            "entry %s: expected %d values, found %d"
                            % (description, len(aa), len(values)))
                    records.append([description] + values)

    return pd.DataFrame(records, columns=columns)

In [78]:
# execute function: parse the raw AAIndex1 file (path is relative to the
# working directory) into the DataFrame df_aa1
df_aa1 = conversion_aa1('data/aaindex1.txt')

In [86]:
# display the parsed table; the rendered output elides the middle rows/columns
# NOTE(review): this cell's execution count is higher than that of the cells
# below it, i.e. it was run out of order - re-run top to bottom to confirm.
df_aa1

Unnamed: 0,Description,A,R,N,D,C,Q,E,G,H,...,L,K,M,F,P,S,T,W,Y,V
0,ANDN920101,4.35,4.38,4.75,4.76,4.65,4.37,4.29,3.97,4.63,...,4.17,4.36,4.52,4.66,4.44,4.50,4.35,4.70,4.60,3.95
1,ARGP820101,0.61,0.60,0.06,0.46,1.07,0.,0.47,0.07,0.61,...,1.53,1.15,1.18,2.02,1.95,0.05,0.05,2.65,1.88,1.32
2,ARGP820102,1.18,0.20,0.23,0.05,1.89,0.72,0.11,0.49,0.31,...,3.23,0.06,2.67,1.96,0.76,0.97,0.84,0.77,0.39,1.08
3,ARGP820103,1.56,0.45,0.27,0.14,1.23,0.51,0.23,0.62,0.29,...,2.93,0.15,2.96,2.03,0.76,0.81,0.91,1.08,0.68,1.14
4,BEGF750101,1.,0.52,0.35,0.44,0.06,0.44,0.73,0.35,0.60,...,1.,0.60,1.,0.60,0.06,0.35,0.44,0.73,0.44,0.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561,KARS160118,6.00,5.00,6.60,6.80,9.33,6.50,6.67,3.50,4.70,...,6.00,6.17,8.00,6.00,6.00,7.33,5.40,5.667,6.22,6.00
562,KARS160119,12.00,23.343,27.708,28.634,28.00,27.831,28.731,7.00,24.243,...,25.021,22.739,31.344,26.993,24.00,20.00,23.819,29.778,28.252,24.00
563,KARS160120,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,-1.734,...,0.00,-0.179,0.00,0.00,0.00,0.00,-4.227,0.211,-0.96,0.00
564,KARS160121,6.00,10.667,10.00,10.40,11.333,10.50,10.667,3.50,10.400,...,9.60,10.167,13.60,12.00,12.00,8.667,9.00,12.75,12.222,9.00


In [79]:
# write to csv in the working directory; index=False leaves the integer
# row index out of the file
df_aa1.to_csv('aaindex1.csv', index=False)

## AAIndex 2

In [80]:
def conversion_aa2(data):
    """Convert a raw AAIndex2 text file into a usable pandas DataFrame.

    The file is scanned line by line. Every line starting with ``H`` opens
    a new entry (its second whitespace-separated token is the identifier)
    backed by a zero-filled ``MAX_ROW`` x ``MAX_COL`` matrix. The lines that
    follow a line starting with ``M`` are read as matrix rows, up to
    ``MAX_ROW`` of them or until a ``//`` terminator line, whichever comes
    first. A row holding fewer than ``MAX_COL`` values fills only the
    leading cells of its matrix row.

    Parameters
    ----------
    data : str or path-like
        Path to the AAIndex2 text file.

    Returns
    -------
    pandas.DataFrame
        All matrices stacked vertically in file order. The index is a
        MultiIndex ``(Description, Amino Acids)`` whose second level is the
        row position as a string (``'0'`` .. ``'21'``); the columns are the
        column positions as strings (``'0'`` .. ``'20'``). A lone ``-``
        token becomes NaN; cells that were never read stay ``0.0``.

    Raises
    ------
    ValueError
        If a numeric matrix row is found before any ``H`` line.
    """
    MAX_ROW = 22
    MAX_COL = 21

    all_desc = []        # entry identifiers, in file order
    matrices = []        # one (MAX_ROW, MAX_COL) array per entry
    matrix_start = None  # line number of the current 'M' line, None outside a matrix

    with open(data) as f:
        for i, line in enumerate(f):
            if line.startswith('H'):
                all_desc.append(line.split()[1])
                # The number of entries is taken from the file itself, so
                # the parser is not tied to one particular release.
                matrices.append(np.zeros((MAX_ROW, MAX_COL)))
            if line.startswith('M'):
                matrix_start = i
                continue
            if line.startswith('//'):
                # record terminator: nothing after it belongs to this matrix
                matrix_start = None
                continue
            if matrix_start is None:
                continue

            row = i - matrix_start - 1
            if row >= MAX_ROW:
                continue

            # replacing dashes with NaN
            tokens = ['NaN' if e == '-' else e for e in line.split()]
            try:
                values = [float(e) for e in tokens]
            except ValueError:
                # best effort: lines that are not purely numeric are skipped
                continue
            if not values or len(values) > MAX_COL:
                # blank lines and rows too wide for the matrix are skipped
                continue
            if not matrices:
                raise ValueError("found a matrix row before any 'H' line")
            matrices[-1][row, :len(values)] = values

    rows = [str(x) for x in range(MAX_ROW)]
    cols = [str(x) for x in range(MAX_COL)]

    # every identifier is repeated once per matrix row
    flat_desc = [desc for desc in all_desc for _ in rows]
    multind = pd.MultiIndex.from_arrays([flat_desc, rows * len(all_desc)],
                                        names=['Description', 'Amino Acids'])

    # stack the per-entry matrices into one 2D table
    if matrices:
        arr2D = np.vstack(matrices)
    else:
        arr2D = np.zeros((0, MAX_COL))

    df = pd.DataFrame(arr2D, index=multind, columns=cols)

    return df

In [81]:
# execute function: parse the raw AAIndex2 file (path is relative to the
# working directory) into the DataFrame df_aa2
df_aa2 = conversion_aa2('data/aaindex2.txt')

In [82]:
# write to csv in the working directory; the index is written too
# (to_csv keeps it unless index=False is passed)
df_aa2.to_csv('aaindex2.csv')

## AAIndex 3

In [83]:
def conversion_aa3(data):
    """Convert a raw AAIndex3 text file into a usable pandas DataFrame.

    The file is scanned line by line. Every line starting with ``H`` opens
    a new entry (its second whitespace-separated token is the identifier)
    backed by a zero-filled 20 x 20 matrix. The lines that follow a line
    starting with ``M`` are read as matrix rows, up to 20 of them or until
    a ``//`` terminator line, whichever comes first. A row holding fewer
    than 20 values fills only the leading cells of its matrix row.

    Parameters
    ----------
    data : str or path-like
        Path to the AAIndex3 text file.

    Returns
    -------
    pandas.DataFrame
        All matrices stacked vertically in file order. The index is a
        MultiIndex ``(Description, Amino Acids)`` whose second level cycles
        through the amino-acid letters below; the columns use the same
        letters. A lone ``-`` token and ``NA`` become NaN; cells that were
        never read stay ``0.0``.

    Raises
    ------
    ValueError
        If a matrix row contains a token that is not a number, holds more
        than 20 values, or appears before any ``H`` line.
    """
    # define column names
    columns = ['A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I',
               'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V']

    MAX_ROW = 20
    MAX_COL = 20

    all_desc = []        # entry identifiers, in file order
    matrices = []        # one (MAX_ROW, MAX_COL) array per entry
    matrix_start = None  # line number of the current 'M' line, None outside a matrix

    # conversion by parsing text file line by line
    with open(data) as f:
        for i, line in enumerate(f):
            if line.startswith('H'):
                all_desc.append(line.split()[1])
                # The number of entries is taken from the file itself, so
                # the parser is not tied to one particular release.
                matrices.append(np.zeros((MAX_ROW, MAX_COL)))
            if line.startswith('M'):
                matrix_start = i
                continue
            if line.startswith('//'):
                # record terminator: a matrix shorter than MAX_ROW rows ends
                # here instead of having the terminator parsed as a number
                matrix_start = None
                continue
            if matrix_start is None:
                continue

            row = i - matrix_start - 1
            if row >= MAX_ROW:
                continue

            # replacing dashes with NaN
            tokens = ['NaN' if e == '-' else e for e in line.split()]
            tokens = [e.replace("NA", "NaN") for e in tokens]
            if len(tokens) > MAX_COL:
                raise ValueError(
                    "matrix row holds %d values, at most %d are supported"
                    % (len(tokens), MAX_COL))
            # float() raises ValueError on a token that is not a number
            values = [float(e) for e in tokens]
            if not values:
                continue
            if not matrices:
                raise ValueError("found a matrix row before any 'H' line")
            matrices[-1][row, :len(values)] = values

    # every identifier is repeated once per matrix row
    flat_desc = [desc for desc in all_desc for _ in range(MAX_ROW)]
    multind = pd.MultiIndex.from_arrays([flat_desc, columns * len(all_desc)],
                                        names=['Description', 'Amino Acids'])

    # stack the per-entry matrices into one 2D table
    if matrices:
        arr2D = np.vstack(matrices)
    else:
        arr2D = np.zeros((0, MAX_COL))

    df = pd.DataFrame(arr2D, index=multind, columns=columns)

    return df

In [84]:
# execute function: parse the raw AAIndex3 file (path is relative to the
# working directory) into the DataFrame df_aa3
df_aa3 = conversion_aa3('data/aaindex3.txt')

In [85]:
# write to csv in the working directory; the index is written too
# (to_csv keeps it unless index=False is passed)
df_aa3.to_csv('aaindex3.csv')