- For my final project, I will be analyzing how the number of certain cancer-risk SNPs correlate with specific ethnic groups.
- My specific research question is as followed: `Are the genotypic frequencies of cancer-risk SNPs different in certain ethnic groups compared to others?`


My analysis is follows:
- For each ethnic group, I will calculate the frequency of homozygous recessive (0|0),
homozygous dominate (1|1), and heterozygous (0|1, 1|0) for the cancer-risk SNPs.
- Once the frequency and percentages are calculated for each group, I will perform a
statistical test to determine if the differences in SNPs frequency between ethnic groups
statistically significant.
- If there is time, based on the genotypic frequencies, I will calculate if certain ethnic
groups are at a greater risk to developing certain cancers compared to other groups.

In [1]:
import csv

# Get Human IDs to Gene Frequencies

In [2]:
def getGeneData_and_HumanIDs(data_File):
    # Open the CSV file with the open() function
    # This returns a file object that you can use to read the file
    with open(data_File) as csvfile:
        # Use the csv.reader() function to read the file
        # This returns an iterable object that you can loop over
        data = csv.reader(csvfile)

        rawRowDataAsList = []

        # Loop over the rows in the file
        for row in data:
            rawRowDataAsList.append(row)

        # HUMAN IDs
        rawHumanIDs = rawRowDataAsList[19]
        # print("rawHumanIDs\n", rawHumanIDs)

        rawHumanIDsParsed = rawHumanIDs[0].split("\t")
        # print("rawHumanIDsParsed\n",rawHumanIDsParsed)

        humanIDs = rawHumanIDsParsed[9: len(rawHumanIDsParsed)-1]
        # print("humanIDs\n", humanIDs)
        # print(len(humanIDs))

        # GENE DATA
        rawGeneData = rawRowDataAsList[20: len(rawRowDataAsList)-1]
        # print("rawGeneData\n", rawGeneData)
        # print(len(rawGeneData))

        geneData = [] 
        for rawGeneRow in rawGeneData:
            rawGeneRowParsed = rawGeneRow[0].split("\t")
            geneData.append(rawGeneRowParsed[9:len(rawGeneRowParsed)-1])
        # print("geneData\n", geneData)
        # print(len(geneData))
        # print(len(geneData[0]))

    return (humanIDs, geneData)



In [3]:
humanIDs, geneData = getGeneData_and_HumanIDs('testData/rs4713266.csv')

In [23]:
# print(humanIDs)
# print(geneData)

In [24]:
def getListOfGeneDataFrequencies(geneData):
    geneDataFrequencies = []

    print(len(geneData[0]))

    for i in range(0, len(geneData[0])):
        
        homozygousReccessive = 0
        heterozygous = 0
        homozygousDominate = 0
        geneSetFrequency = []
        
        for j in range(0, len(geneData)):

            if geneData[j][i] == "0|0":
                homozygousReccessive += 1
            elif geneData[j][i] == "1|0" or geneData[j][i] == "0|1":
                heterozygous += 1
            elif geneData[j][i] == "1|1":
                homozygousDominate += 1

        geneSetFrequency.append(homozygousReccessive)
        geneSetFrequency.append(heterozygous)
        geneSetFrequency.append(homozygousDominate)

        geneDataFrequencies.append(geneSetFrequency)

    # print(geneDataFrequencies)
    print(len(geneDataFrequencies))

    return geneDataFrequencies


In [25]:
def createDictOf_humanID_to_geneFrequency(humanIDs, geneDataFrequencies):
    humanID_to_geneFrequency = {}

    for i in range(0, len(humanIDs)):
        humanID_to_geneFrequency[humanIDs[i]] = geneDataFrequencies[i]

    # print(humanID_to_geneFrequency)

    return humanID_to_geneFrequency

In [26]:
def getDictof_humanID_to_PopulationCode(data_File):
    # Open the CSV file with the open() function
    # This returns a file object that you can use to read the file
    with open(data_File) as csvfile:
        # Use the csv.reader() function to read the file
        # This returns an iterable object that you can loop over
        data = csv.reader(csvfile)

        rawRowDataAsList = []

        # Loop over the rows in the file
        for row in data:
            rawRowDataAsList.append(row)

        humanIDInfoHeader = rawRowDataAsList[0][0].split("\t")
        print("humanIDInfoHeader\n", humanIDInfoHeader)
        del rawRowDataAsList[0]

        humanInfoParsed = []

        for rawHumanInfo in rawRowDataAsList:
            singleHumanInfoParsed = rawHumanInfo[0].split("\t")
            humanInfoParsed.append(singleHumanInfoParsed)
        # print("humanInfoParsed\n", humanInfoParsed)

        humanID_to_PopulationCode = {}

        for humanInfo in humanInfoParsed:
            humanID = humanInfo[0]
            populationCode = humanInfo[3]
            # humanID_and_PopulationCode.append([humanID, populationCode])
            humanID_to_PopulationCode[humanID] = populationCode

        # print("humanID_and_PopulationCode\n", humanID_to_PopulationCode)

    return humanID_to_PopulationCode



In [27]:
geneDataFrequencies = getListOfGeneDataFrequencies(geneData)

2547
2547


In [28]:
humanID_to_geneFrequency = createDictOf_humanID_to_geneFrequency(humanIDs, geneDataFrequencies)

In [29]:
# print(humanID_to_geneFrequency)

In [30]:
humanID_to_PopulationCode = getDictof_humanID_to_PopulationCode('rawData/igsr_samples.tsv')

humanIDInfoHeader
 ['Sample name', 'Sex', 'Biosample ID', 'Population code', 'Population name', 'Superpopulation code', 'Superpopulation name', 'Population elastic ID', 'Data collections']


In [31]:
# print(humanID_to_PopulationCode)

# Merge Population Codes with Gene Frequencies at Corresponding Human IDs

In [32]:
def get_populationCode_to_geneFrequency(humanID_to_geneFrequency, humanID_to_PopulationCode):

    populationCode_to_geneFrequency = {}

    for key in humanID_to_geneFrequency.keys():
        # print("Ran outerloop...", key)
        if key in humanID_to_PopulationCode.keys():
            # print("Ran innerloop...", key)
            # print(humanID_to_PopulationCode.get(key), humanID_to_geneFrequency.get(key))
            populationCode_to_geneFrequency[key] = [humanID_to_PopulationCode.get(key), humanID_to_geneFrequency.get(key)]

    # print("populationCode_to_geneFrequency\n", populationCode_to_geneFrequency)

    return populationCode_to_geneFrequency

In [33]:
populationCode_to_geneFrequency = get_populationCode_to_geneFrequency(humanID_to_geneFrequency, humanID_to_PopulationCode)

In [34]:
def get_populationCode_totalGeneFrequencies(populationCode_to_geneFrequency):
    populationCode_totalGeneFrequencies = {}

    for value in populationCode_to_geneFrequency.values():
        # print(value)
        population_Code = value[0]
        geneFrequencies = value[1]

        if population_Code not in populationCode_totalGeneFrequencies:
            populationCode_totalGeneFrequencies[population_Code] = geneFrequencies

        elif population_Code in populationCode_totalGeneFrequencies:

            addHomozygousRecessiveTotal = geneFrequencies[0]
            addHeterozygousTotal = geneFrequencies[1]
            addHomozygousDominateTotal = geneFrequencies[2]

            currentHomozygousRecessiveTotal = populationCode_totalGeneFrequencies.get(population_Code)[0]
            currentHeterozygousTotal = populationCode_totalGeneFrequencies.get(population_Code)[1]
            currentHomozygousDominateTotal = populationCode_totalGeneFrequencies.get(population_Code)[2]

            HomozygousRecessiveTotal = addHomozygousRecessiveTotal + currentHomozygousRecessiveTotal
            HeterozygousTotal = addHeterozygousTotal + currentHeterozygousTotal
            HomozygousDominateTotal = addHomozygousDominateTotal + currentHomozygousDominateTotal

            populationCode_totalGeneFrequencies[population_Code] = [HomozygousRecessiveTotal, HeterozygousTotal, HomozygousDominateTotal]

    print("populationCode_totalGeneFrequencies\n", populationCode_totalGeneFrequencies)

    return populationCode_totalGeneFrequencies


In [35]:
populationCode_totalGenoFrequencies = get_populationCode_totalGeneFrequencies(populationCode_to_geneFrequency)

populationCode_totalGeneFrequencies
 {'GBR': [727, 53, 20], 'FIN': [773, 44, 23], 'CHS': [741, 34, 65], 'PUR': [747, 62, 23], 'CDX': [710, 47, 43], 'CLM': [682, 52, 26], 'IBS': [775, 56, 25], 'PEL': [602, 38, 40], 'PJL': [688, 48, 32], 'KHV': [699, 34, 59], 'ACB': [717, 56, 3], 'GWD': [848, 49, 7], 'ESN': [755, 42, 3], 'BEB': [611, 31, 46], 'MSL': [687, 30, 3], 'STU': [724, 46, 46], 'ITU': [722, 55, 39], 'CEU': [713, 62, 17], 'YRI': [793, 62, 1], 'CHB': [748, 30, 70], 'JPT': [740, 36, 64], 'LWK': [790, 32, 2], 'ASW': [452, 32, 4], 'MXL': [461, 27, 24], 'TSI': [796, 55, 37], 'GIH': [752, 53, 35]}


In [36]:
def get_populationCode_to_totalGene(populationCode_totalGenoFrequencies):
    populationCode_to_totalGene = {}

    for key, value in populationCode_totalGenoFrequencies.items():
        populationCode_to_totalGene[key] = value[0] + value[1] + value[2]

    print("populationCode_to_totalGeno\n", populationCode_to_totalGene)

    return populationCode_to_totalGene

In [37]:
populationCode_to_totalGeno = get_populationCode_to_totalGene(populationCode_totalGenoFrequencies)

populationCode_to_totalGeno
 {'GBR': 800, 'FIN': 840, 'CHS': 840, 'PUR': 832, 'CDX': 800, 'CLM': 760, 'IBS': 856, 'PEL': 680, 'PJL': 768, 'KHV': 792, 'ACB': 776, 'GWD': 904, 'ESN': 800, 'BEB': 688, 'MSL': 720, 'STU': 816, 'ITU': 816, 'CEU': 792, 'YRI': 856, 'CHB': 848, 'JPT': 840, 'LWK': 824, 'ASW': 488, 'MXL': 512, 'TSI': 888, 'GIH': 840}


In [38]:
def get_populationCode_to_genePercentages(populationCode_totalGenoFrequencies, populationCode_to_totalGeno):
    populationCode_to_genePercentages = {}

    for key, value in populationCode_totalGenoFrequencies.items():
        homozygousRecessive = value[0]/populationCode_to_totalGeno.get(key)
        heterozygous = value[1]/populationCode_to_totalGeno.get(key)
        homozygousDominate = value[2]/populationCode_to_totalGeno.get(key)

        populationCode_to_genePercentages[key] = [homozygousRecessive, heterozygous, homozygousDominate]

    print("populationCode_to_genePercentages\n", populationCode_to_genePercentages)

    return populationCode_to_genePercentages

In [39]:
populationCode_to_genePercentages = get_populationCode_to_genePercentages(populationCode_totalGenoFrequencies, populationCode_to_totalGeno)

populationCode_to_genePercentages
 {'GBR': [0.90875, 0.06625, 0.025], 'FIN': [0.9202380952380952, 0.05238095238095238, 0.02738095238095238], 'CHS': [0.8821428571428571, 0.04047619047619048, 0.07738095238095238], 'PUR': [0.8978365384615384, 0.07451923076923077, 0.027644230769230768], 'CDX': [0.8875, 0.05875, 0.05375], 'CLM': [0.8973684210526316, 0.06842105263157895, 0.034210526315789476], 'IBS': [0.905373831775701, 0.06542056074766354, 0.029205607476635514], 'PEL': [0.8852941176470588, 0.05588235294117647, 0.058823529411764705], 'PJL': [0.8958333333333334, 0.0625, 0.041666666666666664], 'KHV': [0.8825757575757576, 0.04292929292929293, 0.07449494949494949], 'ACB': [0.9239690721649485, 0.07216494845360824, 0.003865979381443299], 'GWD': [0.9380530973451328, 0.05420353982300885, 0.007743362831858407], 'ESN': [0.94375, 0.0525, 0.00375], 'BEB': [0.8880813953488372, 0.04505813953488372, 0.06686046511627906], 'MSL': [0.9541666666666667, 0.041666666666666664, 0.004166666666666667], 'STU': [0.887

In [40]:
def print_populationCode_to_genePercentages(populationCode_to_genePercentages):
    print("PopCode --  homozygousRecessive, heterozygous, homozygousDominate")

    for key, value in populationCode_to_genePercentages.items():
        print(key, " ----- ", format(value[0], '.3f'), format(value[1], '.3f'), format(value[2], '.3f'))

In [41]:
print_populationCode_to_genePercentages(populationCode_to_genePercentages)

PopCode --  homozygousRecessive, heterozygous, homozygousDominate
GBR  -----  0.909 0.066 0.025
FIN  -----  0.920 0.052 0.027
CHS  -----  0.882 0.040 0.077
PUR  -----  0.898 0.075 0.028
CDX  -----  0.887 0.059 0.054
CLM  -----  0.897 0.068 0.034
IBS  -----  0.905 0.065 0.029
PEL  -----  0.885 0.056 0.059
PJL  -----  0.896 0.062 0.042
KHV  -----  0.883 0.043 0.074
ACB  -----  0.924 0.072 0.004
GWD  -----  0.938 0.054 0.008
ESN  -----  0.944 0.052 0.004
BEB  -----  0.888 0.045 0.067
MSL  -----  0.954 0.042 0.004
STU  -----  0.887 0.056 0.056
ITU  -----  0.885 0.067 0.048
CEU  -----  0.900 0.078 0.021
YRI  -----  0.926 0.072 0.001
CHB  -----  0.882 0.035 0.083
JPT  -----  0.881 0.043 0.076
LWK  -----  0.959 0.039 0.002
ASW  -----  0.926 0.066 0.008
MXL  -----  0.900 0.053 0.047
TSI  -----  0.896 0.062 0.042
GIH  -----  0.895 0.063 0.042


# Conduct statistical tests on population code to GeneFrequencies 

- test with individuals
- you want counts, not frequencies
- test with continental groups 
- use chi square test 