In [25]:
import math
import numpy as np
import pandas as pd

In [32]:
fdata = pd.read_csv('data_histories.csv', index_col="dp_folio")
fdata.head()

Unnamed: 0_level_0,id_sexo,Aedad,AAedad,Apuesto,id_gestud,AIMC,fuma,fuma_act,ejer_act,ejer1,...,locout5,locout10,locout20,locout30,rest_act,rest1,rest5,rest10,rest20,rest30
dp_folio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,F,51,6,Admin,CarTec,4,1,3.0,0.0,2.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,F,38,4,Sec,Bach,3,2,,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0
3,F,34,3,Int,Sec,5,1,1.0,0.0,0.0,...,1.0,,,,0.0,0.0,0.0,,,
4,M,63,8,Jef,CarTec,4,2,,2.0,2.0,...,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0
5,M,42,4,EM,Sec,3,1,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Query the values of an id
# dataframe.loc[id]
fdata.shape[1]

# If we turn this into a dictionary, then a query like dict[feature][folio] would give us a category
#ddata = fdata.to_dict()
#ddata['AAedad']

# iterate over all the rows in the dataframe
#for index in fdata.index:
#    print index

fdata.loc[1]['AIMC']

4

In [34]:
# Functions to count ocurrences for a category of a feature (NX)

# Count the number of instances inside the database whose feature X = category
# Ex: getNX('AIMC', 3, fdata)
def getNX(feature, category, data):
    count = 0
    for index in data.index:
        if data.loc[index][feature] == category:
            count = count + 1
    return count

# Count the number of instances inside the database whose feature X_1 = category_1 and feature X_2 = category_2
# Ex: getNCX('AIMC', 3, 'Obesidad', 1, fdata)
def getNCX(feature_1, category_1, feature_2, category_2, data):
    count = 0
    for index in data.index:
        if data.loc[index][feature_1] == category_1 and data.loc[index][feature_2] == category_2:
            count = count + 1
    return count

# Get a conditional probability P(F_1 = C_1 | F_2 = C_2)
def getCondProb(feature_1, category_1, feature_2, category_2, data):
    ncx = getNCX(feature_1, category_1, feature_2, category_2, data)
    nx = getNX(feature_2, category_2, data)
    if nx != 0:
        p = ncx / float(nx)
    else:
        p = 0
    return p

def getEpsilon(feature, category, classFeature, classCategory, data):
    n = len(data)
    nx = getNX(feature, category, data)
    nc = getNX(classFeature, classCategory, data)
    ncx = getNCX(feature, category, classFeature, classCategory, data)
    if n != 0 and nx != 0:
        pc = nc / float(n)
        pcx = ncx / float(nx)
        epsilon = nx * (pcx - pc) / math.sqrt(nx * pc * (1 - pc))
    else:
        epsilon = 0
    #print 'Epsilon :' + str(epsilon)
    return {'epsilon': epsilon, 'nx': nx, 'ncx': ncx, 'nc': nc}

In [36]:
# Functions to get the history of a certain individual 
# The given feature must be a history-based parameter inside the database (e.g. salud, estres, condi, etc.)
def getRawHistory(index, feature, data):
    history = {}
    #history['index'] = index
    sufix_list = ['_act', '1', '5', '10', '20', '30']
    for sufix in sufix_list:
        history[feature + sufix] = data.loc[index][feature + sufix]
    return history

def binarizeHistory(history, conditionA, conditionB):
    for feature in history:
        if conditionA(history[feature]):
            history[feature] = 'A'
        elif conditionB(history[feature]):
            history[feature] = 'B'
        else:
            history[feature] = 'N'
    return history

# Set of auxiliary high-order functions that will evaluate the conditions to binarize a history
def lessThan(num):
    return lambda n: n < num

def lessQThan(num):
    return lambda n: n <= num

def greaterThan(num):
    return lambda n: n > num

def greaterQThan(num):
    return lambda n: n >= num

def between(num1, num2):
    return lambda n: n <= num2 and n >= num1

# Examples of use
history = getRawHistory(32, 'condi', fdata)
print history
binarizeHistory(history, lessThan(3), greaterQThan(3))

{'condi_act': 4, 'condi20': 2, 'condi5': 2, 'condi10': 2, 'condi1': 3, 'condi30': 7}


{'condi1': 'B',
 'condi10': 'A',
 'condi20': 'A',
 'condi30': 'B',
 'condi5': 'A',
 'condi_act': 'B'}

In [37]:
# Binary History Class definition
class BinaryHistory:
    def __init__(self, index, feature, history):
        self.index = index
        self.feature = feature
        self.history = history
        self.structure = self.getHistoryStructure(history)
        self.complete = self.isHistoryComplete(history)
        self.length = self.getHistoryLength(history)
    
    def isHistoryComplete(self, history):
        return True
    
    def getHistoryLength(self, history):
        return 0
    
    def getHistoryStructure(self, history):
        time_list = ['30', '20', '10', '5','1','_act']
        structure = ''
        for t in time_list:
            structure = structure + self.history[self.feature + t]
        return structure
    
    def historyStructureEqualsTo(self, structure):
        for i in range(len(structure)):
            if self.structure[i] != structure[i] and structure[i] != '*':
                return False
        return True
    
# Test cases
test_index = 1
history = getRawHistory(test_index, 'condi', fdata)
bhistory = binarizeHistory(history, lessThan(3), between(3, 6))

h1 = BinaryHistory(test_index, 'condi', bhistory)
print h1.structure
h1.historyStructureEqualsTo('B*B*BA')

BBBBBA


True

In [38]:
fdata.shape

(1076, 72)

In [39]:
# Create a dictionary of binary histories for every individual in the database
# The feature must be a history-based parameter7
# conditionA and conditionB are the conditions of the binarization
def createDictionaryOfBinaryHistories(feature, conditionA, conditionB, data):
    histories = {}
    for index in data.index:
        current_raw_history = getRawHistory(index, feature, data)
        current_bin_history = binarizeHistory(current_raw_history, conditionA, conditionB)
        current_history = BinaryHistory(index, feature, current_bin_history)
        histories[index] = current_history
    return histories

# Get a list of all the individuals that have a certain type of binary history
def getIndividualsWithStructure(structure, histories):
    list_of_indeces = []
    for index in histories:
        if histories[index].historyStructureEqualsTo(structure):
            list_of_indeces.append(1)
        else:
            list_of_indeces.append(0)
    return list_of_indeces
    
    
# Add a list of structures of binary histories as features in the database
def addBHListOfStructuresAsFeatures(list_of_structures, histories, data):
    new_data = data.copy()
    for structure in list_of_structures:
        new_data = addBHStructureAsFeature(structure, histories, new_data)
    return new_data
    
# Add a structure of a binary history as a feature in the database
def addBHStructureAsFeature(structure, histories, data):
    new_data = data.copy()
    list_of_individuals = getIndividualsWithStructure(structure, histories)
    new_data.insert(new_data.shape[1], structure, list_of_individuals, True)
    return new_data
    
# Test cases
dicthist = createDictionaryOfBinaryHistories('estres', between(3, 6), lessThan(3), fdata)
#for id in dicthist:
#    print dicthist[id].structure

#getIndividualsWithStructure('AAAAAA', dicthist)
#ndata = addBHStructureAsFeature('AAAAAA', dicthist, fdata)
ndata = addBHListOfStructuresAsFeatures(['AAAAAA', '******', 'BBBBBB', '****BB', '****AA'], dicthist, fdata)

In [40]:
ndata.head()

Unnamed: 0_level_0,id_sexo,Aedad,AAedad,Apuesto,id_gestud,AIMC,fuma,fuma_act,ejer_act,ejer1,...,rest1,rest5,rest10,rest20,rest30,AAAAAA,******,BBBBBB,****BB,****AA
dp_folio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,F,51,6,Admin,CarTec,4,1,3.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1
2,F,38,4,Sec,Bach,3,2,,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0,1,0,1,0
3,F,34,3,Int,Sec,5,1,1.0,0.0,0.0,...,0.0,0.0,,,,0,1,0,0,1
4,M,63,8,Jef,CarTec,4,2,,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,0,1
5,M,42,4,EM,Sec,3,1,2.0,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0,1,0,0,0


In [41]:
# Query all those individuals that have a certain history
ndata[ndata.Obesidad == 1]

AttributeError: 'DataFrame' object has no attribute 'Obesidad'

In [12]:
dicthist = createDictionaryOfBinaryHistories('condi', between(3, 6), lessThan(3), fdata)
histories_to_explore = ['AAAAAA', '******', 'BBBBBB', '****BB', '****AA', '***AAA', '***BBB']
ndata = addBHListOfStructuresAsFeatures(histories_to_explore, dicthist, fdata)
getEpsilon('****AA', 1, 'Obesidad', 0, ndata)

{'epsilon': 3.7851567081924604, 'nc': 848, 'ncx': 662, 'nx': 785}

In [26]:
getCondProb("Obesidad", 1, "****AA", 0, ndata)

0.36082474226804123