In [1]:
import numpy as np
import pandas as pd

In [2]:
# Opt in to the pandas "future" behaviour named by this option: results are not
# silently downcast to a narrower dtype.
# NOTE(review): presumably set for the DataFrame.replace calls further down;
# the option only exists in newer pandas releases - confirm the installed version.
pd.set_option("future.no_silent_downcasting", True)

In [5]:
# Read the genotype calls; '--' in the file marks a missing call and is parsed
# as NaN.
# Indexing by 'SnpId' before transposing turns every SNP into a column, while the
# file's remaining column headers become the row labels.
geno = (
    pd.read_csv('data/Gatti_2014_geno.csv', na_values=['--'])
    .set_index('SnpId')
    .T
)


In [6]:
# Read the phenotype table and label its rows by 'Sample.ID', so that it can be
# aligned row-for-row with the transposed genotype table.
pheno = pd.read_csv('data/Gatti_2014_pheno.csv').set_index('Sample.ID')

In [7]:
# Cut-offs used to turn the two measurements into binary flags.
# NOTE(review): the origin and units of these values are not recorded here.
wbc_thr = 10.74
neut_thr = 1705

# Flag is 1 when the measurement is strictly above its cut-off and 0 otherwise.
# A missing measurement (NaN) compares as False and therefore also gets 0.
pheno['WBC_thr'] = pheno['WBC'].gt(wbc_thr).astype(int)
pheno['NEUT_thr'] = pheno['NEUT'].gt(neut_thr).astype(int)


In [12]:
# Binary (one-hot) substitution: every SNP column is expanded into one 0/1
# indicator column per genotype observed for it, named '{SnpId}_{genotype}'
# (for example 'JAX00240603_TC').
# With its default settings get_dummies creates no indicator for missing calls
# (NaN), so a sample without a call is 0 in all of that SNP's columns.
geno_binary = pd.get_dummies(geno, dtype=int)

# Save the encoded table as a CSV file.
# NOTE(review): index=False leaves the row labels (sample IDs) out of the file,
# unlike the other tables saved by this notebook - confirm that this is intended.
geno_binary.to_csv('data/geno_binary.csv', index=False)

# The preview comes last: only the final expression of a cell is displayed.
geno_binary.head()

In [15]:
# Ternary substitution: recode every SNP column as the number of copies (0, 1 or
# 2) of one "counted" allele.
#
# The counted allele is the first base, in the order T, G, C, A, whose homozygote
# (e.g. 'TT') occurs in the column. That homozygote becomes 2, homozygotes of the
# bases later in that order become 0, and heterozygotes pairing the counted allele
# with a later base become 1 (in either letter order). Everything else - missing
# calls, or genotypes outside those lists such as 'TC' in a column without 'TT' -
# is left exactly as it was.
#
# Each column is encoded on its own in a single pass. The previous version called
# DataFrame.replace on the whole frame for every genotype of every column, and
# each of those calls builds a new copy of the frame, which is why the run never
# finished once all SNPs were included.
#
# The four SNPs selected below are the ones that were checked by hand against the
# Excel sheet. To encode every SNP, pass `geno` itself to encode_allele_counts.

# Order in which bases are tried as the counted allele.
_ALLELE_PRIORITY = 'TGCA'


def _allele_count_codes(counted: str) -> dict:
    """Build the genotype -> copy-count table for one counted allele.

    Args:
        counted: one of the bases in ``_ALLELE_PRIORITY``.

    Returns:
        A dict mapping the homozygote of ``counted`` to 2, the homozygotes of the
        bases that follow it in ``_ALLELE_PRIORITY`` to 0, and the heterozygotes
        of ``counted`` with those bases (both letter orders) to 1.
    """
    later_bases = _ALLELE_PRIORITY[_ALLELE_PRIORITY.index(counted) + 1:]
    codes = {counted * 2: 2}
    for other in later_bases:
        codes[other * 2] = 0
        codes[counted + other] = 1
        codes[other + counted] = 1
    return codes


def _encode_snp(calls: pd.Series) -> pd.Series:
    """Recode the genotype calls of a single SNP as allele counts.

    Args:
        calls: genotype strings such as 'TT' or 'TC'; may contain NaN.

    Returns:
        A Series on the same index. Calls covered by the table for the column's
        counted allele become 0, 1 or 2; all other values are passed through
        unchanged. A column without any homozygote is returned as is.
    """
    observed = set(calls.dropna().unique())
    counted = next(
        (base for base in _ALLELE_PRIORITY if base * 2 in observed), None
    )
    if counted is None:
        return calls
    codes = _allele_count_codes(counted)
    # Fall back to the value itself so that NaN and unlisted genotypes survive.
    # pandas re-infers the dtype of the result, so a fully recoded column comes
    # back numeric.
    return calls.map(lambda call: codes.get(call, call))


def encode_allele_counts(genotypes: pd.DataFrame) -> pd.DataFrame:
    """Apply the allele-count recoding to each column of `genotypes`.

    Args:
        genotypes: one column per SNP, one row per sample.

    Returns:
        A new DataFrame with the same index and columns; the input is not
        modified.
    """
    return genotypes.apply(_encode_snp)


# geno_count = encode_allele_counts(geno)
geno_count = encode_allele_counts(
    geno[['JAX00240603','UNC010001397','UNC010515443','UNC010001943']]
)

# geno_count.to_csv('data/geno_count.csv')

geno_count.head()


SnpId,JAX00240603,UNC010001397,UNC010515443,UNC010001943
JAXW202.2129,1,2,2,
JAXW202.2130,2,2,2,0.0
JAXW202.2132,0,2,1,2.0
JAXW202.2133,1,2,1,0.0
JAXW202.2134,1,2,2,


In [16]:
# Place the genotype and phenotype columns side by side. concat matches rows by
# index label and keeps the union of labels, so a sample present in only one of
# the tables gets NaN in the other table's columns.
augmented = pd.concat((geno, pheno), axis='columns')

# Save the combined table (row labels included), then preview it.
augmented.to_csv('data/augmented.csv')
augmented.head()

Unnamed: 0,JAX00240603,UNC010001397,UNC010515443,UNC010001943,UNC010515539,UNC010515556,UNC010002207,UNC010003148,UNC010516347,UNC010518203,...,UNC210001613,JAX00725096,JAX00725100,JAX00725105,Sex,Batch,WBC,NEUT,WBC_thr,NEUT_thr
JAXW202.2129,TC,TT,GG,,AG,GG,GG,TT,GG,GG,...,GG,TT,TT,AA,F,13,4.34,598.052,0,0
JAXW202.2130,TT,TT,GG,AA,AG,GG,GG,TT,AG,GG,...,GG,CC,TT,GG,F,13,4.84,1001.88,0,0
JAXW202.2132,CC,TT,AG,CC,AG,AG,GG,TT,GG,GG,...,GG,CC,CC,AA,F,13,6.28,1073.88,0,0
JAXW202.2133,TC,TT,AG,AA,AG,AA,GG,TT,GG,GG,...,GG,TT,TT,AA,F,13,4.72,1093.152,0,0
JAXW202.2134,TC,TT,GG,,GG,AG,GG,TT,AA,GG,...,GG,CC,CC,AA,F,13,3.12,313.872,0,0
