In [1]:
import pandas as pd
import numpy as np

In [27]:
import csv
import math

def readDataFromCSV(filename):
    """Load a CSV file into a list of row dictionaries.

    Rows are produced by ``csv.DictReader``, so each one is keyed by the
    column names found on the first line of the file and every value is a
    string.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as the rows have been read, instead of staying open until the
    garbage collector reclaims it.

    :param filename: path of the CSV file to read.
    :return: list with one dictionary per data row, in file order.
    """
    import sys  # local import: only needed to pick the open() mode

    # The csv module expects a binary handle on Python 2 and a text handle
    # opened with newline='' on Python 3.
    if sys.version_info[0] < 3:
        handle = open(filename, 'rb')
    else:
        handle = open(filename, 'r', newline='')
    with handle:
        return list(csv.DictReader(handle))

# Dump a nested "feature -> category -> statistics" dictionary to a CSV file
def writeEpsilonsDataToCSV(filename, epsdict):
    """Write one ``feature,category,epsilon,nx,nc,ncx`` line per category.

    ``epsdict`` maps each feature to a dictionary of categories, and each
    category to a dictionary holding the keys "epsilon", "nx", "nc" and
    "ncx". No header row is written and the values are not quoted.
    """
    with open(filename, 'w') as out:
        for feature, by_category in epsdict.items():
            for category, stats in by_category.items():
                fields = (feature, category,
                          stats["epsilon"], stats["nx"], stats["nc"], stats["ncx"])
                out.write("%s,%s,%s,%s,%s,%s\n" % fields)

def writeDictDataToCSV(filename, dict):
    """Write a list of row dictionaries to ``filename`` with a header row.

    The columns are the ones returned by ``getAllFeatures`` for the rows,
    i.e. the keys of the first row except the current class column.
    Note that the ``dict`` parameter shadows the builtin of the same name
    inside this function.
    """
    header = getAllFeatures(dict)
    with open(filename, 'w') as out:
        table = csv.DictWriter(out, fieldnames=header)
        table.writeheader()
        for record in dict:
            table.writerow(record)

def writeCompleteDictDataToCSV(filename, dict):
    """Write a list of row dictionaries to ``filename``, class column included.

    The header is the list returned by ``getAllFeatures`` followed by the
    current class column ``c_class`` as the last field.
    Note that the ``dict`` parameter shadows the builtin of the same name
    inside this function.
    """
    header = getAllFeatures(dict) + [c_class]
    with open(filename, 'w') as out:
        table = csv.DictWriter(out, fieldnames=header)
        table.writeheader()
        for record in dict:
            table.writerow(record)

def getAllFeatures(data):
    """Return the keys of the first row, leaving out the class column.

    The class column is the one named by the module-level ``c_class``.
    """
    return [name for name in data[0] if name != c_class]

def getFeatures(data):
    """Return the keys of the first row, minus 'FID' and the class column.

    The class column is the one named by the module-level ``c_class``.
    """
    return [name for name in data[0]
            if not (name == 'FID' or name == c_class)]

def getCategories(feature, data):
    """Return ``(smallest, largest)`` integer value of ``feature`` over all rows.

    Every value is converted with ``int()`` before it is compared.
    """
    lowest = highest = int(data[0][feature])
    for row in data:
        value = int(row[feature])
        highest = max(highest, value)
        lowest = min(lowest, value)
    return lowest, highest

def getCategoriesT(feature, data):
    """Return the distinct values of ``feature``, each converted with ``str()``.

    The result is the key collection of a throwaway dictionary, so every
    distinct value appears exactly once.
    """
    distinct = dict.fromkeys((str(row[feature]) for row in data), "cat")
    return distinct.keys()

def getNX(feature, category, data):
    """Count the rows whose ``feature`` value matches ``category``.

    Both sides are converted with ``str()`` before comparing, so the integer
    5 matches the string "5".
    """
    return sum(1 for row in data if str(row[feature]) == str(category))

def getNCX(feature1, category1, feature2, category2, data):
    """Count the rows that match both feature/category pairs at once.

    A row counts when its ``feature1`` value matches ``category1`` and its
    ``feature2`` value matches ``category2``; all comparisons are made on
    the ``str()`` form of the values.
    """
    return sum(
        1
        for row in data
        if str(category1) == str(row[feature1])
        and str(category2) == str(row[feature2])
    )

def getAllEpsilons(data):
    """Compute the per-category epsilon statistics of every feature.

    The features are those returned by ``getFeatures``; the result maps each
    feature name to the dictionary built by ``getEpsilonsFromFeature``.
    """
    return {
        feature: getEpsilonsFromFeature(feature, data)
        for feature in getFeatures(data)
    }

def getEpsilonsFromFeature(feature, data):
    """Compute the epsilon statistics for each category of one feature.

    The categories are the distinct values returned by ``getCategoriesT``;
    the result maps each category to the dictionary returned by
    ``getEpsilon``.
    """
    return {
        category: getEpsilon(feature, category, data)
        for category in getCategoriesT(feature, data)
    }

def getEpsilon(feature, category, data):
    """Compute the epsilon score of one feature category against the class.

    The class is defined by the module-level ``c_class`` (column name) and
    ``c_category`` (value of interest). With

        n   = number of rows
        nx  = rows where ``feature`` matches ``category``
        nc  = rows where ``c_class`` matches ``c_category``
        ncx = rows where both conditions hold
        pc  = nc / n,  pcx = ncx / nx

    the score is ``nx * (pcx - pc) / sqrt(nx * pc * (1 - pc))``.

    Epsilon is reported as 0 whenever the score is undefined: no rows, no
    row in the category, or a class that covers none or all of the rows.
    In the last case ``pc`` is 0 or 1 and the denominator would be zero.

    :param feature: column name of the feature.
    :param category: value of ``feature`` to score.
    :param data: list of row dictionaries.
    :return: dictionary with the keys "epsilon", "nx", "nc" and "ncx".
    """
    n = len(data)
    nx = getNX(feature, category, data)
    nc = getNX(c_class, c_category, data)
    ncx = getNCX(feature, category, c_class, c_category, data)

    # 0 < nc < n keeps pc strictly between 0 and 1 (and implies n != 0);
    # at either end sqrt(nx * pc * (1 - pc)) is 0 and the division fails.
    if nx != 0 and 0 < nc < n:
        pc = nc / float(n)
        pcx = ncx / float(nx)
        epsilon = nx * (pcx - pc) / math.sqrt(nx * pc * (1 - pc))
    else:
        epsilon = 0

    return {"epsilon": epsilon, "nx": nx, "nc": nc, "ncx": ncx}

#training_data = readDataFromCSV('datos_completos.csv')
#test_data = readDataFromCSV('datos_completos.csv')
#all_data = readDataFromCSV('datos_completos.csv')

In [67]:
# Table read from ant_pers.csv, indexed by its 'an_folio' column so that rows
# can be looked up by label with .loc.
ant_pers = pd.read_csv('ant_pers.csv', index_col = 'an_folio')
# SNP table as a list of per-row dictionaries (see readDataFromCSV).
snps = readDataFromCSV('snps.csv')

In [90]:
completeDictionary = []

# Keys added to every SNP row, listed in the positional order of the ant_pers
# columns they are read from (position 0 -> "Obesidad", ..., 6 -> "uric_com").
ant_pers_keys = ["Obesidad", "Acintura", "Abrazo", "estres5",
                 "Ainsulina", "glu_com", "uric_com"]

# Create a dictionary with the SNPs and all the relevant data
for pid in snps:
    try:
        # Single label lookup per row. A FID that is absent from the ant_pers
        # index raises KeyError here, before the SNP row has been modified.
        person = ant_pers.loc[int(pid["FID"])]
    except KeyError as e:
        # Report the missing label and leave this row out of the result.
        # print(e) gives the same output as the print statement on Python 2.
        print(e)
        continue
    for position, key in enumerate(ant_pers_keys):
        # .iloc makes the positional access explicit instead of relying on
        # integer fallback indexing of a label-indexed Series.
        pid[key] = person.iloc[position]
    completeDictionary.append(pid)

# Column-oriented copy: one list of values per key.
# The key set comes from the first merged row rather than from the loop
# variable ``pid``: after the loop ``pid`` is the last SNP row, which lacks
# the added keys if it was skipped, and is undefined when ``snps`` is empty.
columns = list(completeDictionary[0].keys()) if completeDictionary else []

newDict = {}
for key in columns:
    newDict[key] = []

for entry in completeDictionary:
    for key in columns:
        newDict[key].append(entry[key])

u'the label [16] is not in the [index]'
u'the label [444] is not in the [index]'
u'the label [511] is not in the [index]'


In [94]:
# Build a DataFrame with one column per key of newDict (each value is the
# list of per-row entries for that key).
df = pd.DataFrame.from_dict(newDict)
# Save without the DataFrame index; 'snpswvars.csv' is the file the epsilon
# cells read back through readDataFromCSV.
df.to_csv(path_or_buf='snpswvars.csv', index=False)

In [98]:
# NOTE(review): `eps` is only assigned in cells that appear further down in
# this notebook, and this cell's execution count is higher than theirs, so it
# was run after them. On a top-to-bottom run `eps` is not defined yet here.
writeEpsilonsDataToCSV('epsilons_test.csv', eps)

In [37]:
# Target class: rows whose 'Abrazo' value matches 5 (values are compared as
# strings). getEpsilon reads c_class and c_category as module-level globals.
datafile = 'snpswvars.csv'
c_class, c_category = 'Abrazo', 5

# The same file is read three times, producing three independent row lists.
training_data, test_data, all_data = (readDataFromCSV(datafile) for _ in range(3))

eps = getAllEpsilons(all_data)
writeEpsilonsDataToCSV('epsilons_snps_brazo.csv', eps)

In [38]:
# Target class: rows whose 'Acintura' value matches 5 (values are compared as
# strings). getEpsilon reads c_class and c_category as module-level globals.
datafile = 'snpswvars.csv'
c_class, c_category = 'Acintura', 5

# The same file is read three times, producing three independent row lists.
training_data, test_data, all_data = (readDataFromCSV(datafile) for _ in range(3))

eps = getAllEpsilons(all_data)
writeEpsilonsDataToCSV('epsilons_snps_cintura.csv', eps)

In [39]:
# Target class: rows whose 'estres5' value matches 1 (values are compared as
# strings). getEpsilon reads c_class and c_category as module-level globals.
datafile = 'snpswvars.csv'
c_class, c_category = 'estres5', 1

# The same file is read three times, producing three independent row lists.
training_data, test_data, all_data = (readDataFromCSV(datafile) for _ in range(3))

eps = getAllEpsilons(all_data)
writeEpsilonsDataToCSV('epsilons_snps_estres5.csv', eps)

In [40]:
# Target class: rows whose 'Ainsulina' value matches 5 (values are compared as
# strings). getEpsilon reads c_class and c_category as module-level globals.
datafile = 'snpswvars.csv'
c_class, c_category = 'Ainsulina', 5

# The same file is read three times, producing three independent row lists.
training_data, test_data, all_data = (readDataFromCSV(datafile) for _ in range(3))

eps = getAllEpsilons(all_data)
writeEpsilonsDataToCSV('epsilons_snps_insulina.csv', eps)

In [41]:
# Target class: rows whose 'glu_com' value matches 'ALTO'.
# getEpsilon reads c_class and c_category as module-level globals.
datafile = 'snpswvars.csv'
c_class, c_category = 'glu_com', 'ALTO'

# The same file is read three times, producing three independent row lists.
training_data, test_data, all_data = (readDataFromCSV(datafile) for _ in range(3))

eps = getAllEpsilons(all_data)
writeEpsilonsDataToCSV('epsilons_snps_glucosa.csv', eps)

In [42]:
# Target class: rows whose 'uric_com' value matches 'ALTO'.
# getEpsilon reads c_class and c_category as module-level globals.
datafile = 'snpswvars.csv'
c_class, c_category = 'uric_com', 'ALTO'

# The same file is read three times, producing three independent row lists.
training_data, test_data, all_data = (readDataFromCSV(datafile) for _ in range(3))

eps = getAllEpsilons(all_data)
writeEpsilonsDataToCSV('epsilons_snps_acidourico.csv', eps)

In [66]:
# Collect the first column of Intersection.csv as the list of SNPs whose
# epsilons will be computed
inters = pd.read_csv('Intersection.csv')
vinters = inters.values
fl = []
for snp in range(len(vinters)):
    fl.append(vinters[snp, 0])
    
def getSpecificEpsilons(feature_list, data):
    """Compute the per-category epsilon statistics for the listed features only.

    Returns a dictionary mapping each name in ``feature_list`` to the
    dictionary built by ``getEpsilonsFromFeature`` for that feature.
    """
    return {
        feature: getEpsilonsFromFeature(feature, data)
        for feature in feature_list
    }
    
    
# Target class: rows whose 'Obesidad' value matches 1 (values are compared as
# strings). getEpsilon reads c_class and c_category as module-level globals.
datafile = 'snpsobes.csv'
c_class, c_category = 'Obesidad', 1

# The same file is read three times, producing three independent row lists.
training_data, test_data, all_data = (readDataFromCSV(datafile) for _ in range(3))

# Only the SNPs collected in `fl` are evaluated here.
eps = getSpecificEpsilons(fl, all_data)
writeEpsilonsDataToCSV('epsilons_intersection_snps.csv', eps)