- For my final project, I will be analyzing how the number of certain cancer-risk SNPs correlate with specific ethnic groups.
- My specific research question is as followed: `Are the genotypic frequencies of cancer-risk SNPs different in certain ethnic groups compared to others?`


My analysis is follows:
- For each ethnic group, I will calculate the frequency of homozygous recessive (0|0),
homozygous dominate (1|1), and heterozygous (0|1, 1|0) for the cancer-risk SNPs.
- Once the frequency and percentages are calculated for each group, I will perform a
statistical test to determine if the differences in SNPs frequency between ethnic groups
statistically significant.
- If there is time, based on the genotypic frequencies, I will calculate if certain ethnic
groups are at a greater risk to developing certain cancers compared to other groups.

In [1]:
import csv

# Get Human IDs to Gene Frequencies

In [2]:
def getGeneData_and_HumanIDs(data_File):
    # Open the CSV file with the open() function
    # This returns a file object that you can use to read the file
    with open(data_File) as csvfile:
        # Use the csv.reader() function to read the file
        # This returns an iterable object that you can loop over
        data = csv.reader(csvfile)

        rawRowDataAsList = []

        # Loop over the rows in the file
        for row in data:
            rawRowDataAsList.append(row)

        # GENE DATA
        rawParsedGeneData = rawRowDataAsList[20][0].split("\t")
        geneData = rawParsedGeneData[9: len(rawParsedGeneData)-1]

        # HUMAN IDs 
        rawHumanIDs = rawRowDataAsList[19][0].split("\t")
        # print("rawHumanIDsParsed\n",rawHumanIDsParsed)

        humanIDs = rawHumanIDs[9: len(rawHumanIDs)-1]
        # print("humanIDs\n", humanIDs)
        # print(len(humanIDs))

    return (humanIDs, geneData)



In [3]:
humanIDs, geneData = getGeneData_and_HumanIDs('rawData/rs4713266.csv')

In [4]:
def get_humanID_to_genotype(humanIDs, geneData):
    humanID_to_genotype = {}
    
    for i in range(0, len(geneData)):

        if geneData[i] == "0|0":
            humanID_to_genotype[humanIDs[i]] = 0
        elif geneData[i] == "1|0" or geneData[i] == "0|1":
            humanID_to_genotype[humanIDs[i]] = 1
        elif geneData[i] == "1|1":
            humanID_to_genotype[humanIDs[i]] = 2

    return humanID_to_genotype


In [5]:
humanID_to_genotype = get_humanID_to_genotype(humanIDs, geneData)
# print(humanID_to_genotype)

In [6]:
def get_humanID_to_PopulationCode(data_File):
    # Open the CSV file with the open() function
    # This returns a file object that you can use to read the file
    with open(data_File) as csvfile:
        # Use the csv.reader() function to read the file
        # This returns an iterable object that you can loop over
        data = csv.reader(csvfile)

        rawRowDataAsList = []

        # Loop over the rows in the file
        for row in data:
            rawRowDataAsList.append(row)

        humanIDInfoHeader = rawRowDataAsList[0][0].split("\t")
        # print("humanIDInfoHeader\n", humanIDInfoHeader)
        del rawRowDataAsList[0]

        humanInfoParsed = []

        for rawHumanInfo in rawRowDataAsList:
            singleHumanInfoParsed = rawHumanInfo[0].split("\t")
            humanInfoParsed.append(singleHumanInfoParsed)
        # print("humanInfoParsed\n", humanInfoParsed)

        humanID_to_PopulationCode = {}

        for humanInfo in humanInfoParsed:
            humanID = humanInfo[0]
            populationCode = humanInfo[3]
            # humanID_and_PopulationCode.append([humanID, populationCode])
            humanID_to_PopulationCode[humanID] = populationCode

        # print("humanID_and_PopulationCode\n", humanID_to_PopulationCode)

    return humanID_to_PopulationCode



In [7]:
humanID_to_PopulationCode = get_humanID_to_PopulationCode('rawData/igsr_samples.tsv')
# print(humanID_to_PopulationCode)

# Merge Population Codes with Gene Frequencies at Corresponding Human IDs

In [8]:
def get_humanID_to_PopCode_Genotype(humanID_to_genotype, humanID_to_PopulationCode):

    humanID_to_PopCode_Genotype = {}

    for key in humanID_to_genotype.keys():
        # print("Ran outerloop...", key)
        if key in humanID_to_PopulationCode.keys():
            # print("Ran innerloop...", key)
            # print(humanID_to_PopulationCode.get(key), humanID_to_geneFrequency.get(key))
            humanID_to_PopCode_Genotype[key] = [humanID_to_PopulationCode.get(key), humanID_to_genotype.get(key)]

    # print("populationCode_to_geneFrequency\n", populationCode_to_geneFrequency)

    return humanID_to_PopCode_Genotype

In [9]:
humanID_to_PopCode_Genotype = get_humanID_to_PopCode_Genotype(humanID_to_genotype, humanID_to_PopulationCode)
print(humanID_to_PopCode_Genotype)

{'HG00096': ['GBR', 0], 'HG00097': ['GBR', 1], 'HG00099': ['GBR', 0], 'HG00100': ['GBR', 1], 'HG00101': ['GBR', 2], 'HG00102': ['GBR', 0], 'HG00103': ['GBR', 2], 'HG00104': ['GBR', 1], 'HG00105': ['GBR', 1], 'HG00106': ['GBR', 1], 'HG00107': ['GBR', 1], 'HG00108': ['GBR', 2], 'HG00109': ['GBR', 2], 'HG00110': ['GBR', 2], 'HG00111': ['GBR', 2], 'HG00112': ['GBR', 1], 'HG00113': ['GBR', 1], 'HG00114': ['GBR', 1], 'HG00115': ['GBR', 0], 'HG00116': ['GBR', 1], 'HG00117': ['GBR', 1], 'HG00118': ['GBR', 1], 'HG00119': ['GBR', 1], 'HG00120': ['GBR', 1], 'HG00121': ['GBR', 1], 'HG00122': ['GBR', 1], 'HG00123': ['GBR', 1], 'HG00125': ['GBR', 0], 'HG00126': ['GBR', 0], 'HG00127': ['GBR', 1], 'HG00128': ['GBR', 2], 'HG00129': ['GBR', 1], 'HG00130': ['GBR', 0], 'HG00131': ['GBR', 2], 'HG00132': ['GBR', 1], 'HG00133': ['GBR', 0], 'HG00134': ['GBR', 1], 'HG00135': ['GBR', 1], 'HG00136': ['GBR', 1], 'HG00137': ['GBR', 0], 'HG00138': ['GBR', 2], 'HG00139': ['GBR', 1], 'HG00140': ['GBR', 1], 'HG00141':

In [10]:
def get_populationCode_totalGeneFrequencies(humanID_to_PopCode_Genotype):
    populationCode_to_geneFrequencies = {}

    for value in humanID_to_PopCode_Genotype.values():

        population_Code = value[0]
        genoType = value[1]

        if genoType == 0:
            genoTypeList = [1, 0, 0]
        elif genoType == 1:
            genoTypeList = [0, 1, 0]
        elif genoType == 2:
            genoTypeList = [0, 0, 1]

        if population_Code not in populationCode_to_geneFrequencies:
            populationCode_to_geneFrequencies[population_Code] = genoTypeList

        elif population_Code in populationCode_to_geneFrequencies:

            currentGenoTypeList = populationCode_to_geneFrequencies.get(population_Code)

            # Use a list comprehension to add the elements from the two lists together
            totalGenoTypeList = [x + y for x, y in zip(currentGenoTypeList, genoTypeList)]

            populationCode_to_geneFrequencies[population_Code] = totalGenoTypeList

    print("populationCode_to_geneFrequencies\n", populationCode_to_geneFrequencies)

    return populationCode_to_geneFrequencies


In [11]:
populationCode_to_geneFrequencies = get_populationCode_totalGeneFrequencies(humanID_to_PopCode_Genotype)

populationCode_to_geneFrequencies
 {'GBR': [27, 53, 20], 'FIN': [39, 43, 23], 'CHS': [6, 34, 65], 'PUR': [21, 60, 23], 'CDX': [10, 47, 43], 'CLM': [19, 50, 26], 'IBS': [27, 55, 25], 'PEL': [7, 38, 40], 'PJL': [16, 48, 32], 'KHV': [6, 34, 59], 'ACB': [61, 34, 2], 'GWD': [82, 26, 5], 'ESN': [73, 25, 2], 'BEB': [9, 31, 46], 'MSL': [69, 19, 2], 'STU': [12, 44, 46], 'ITU': [9, 54, 39], 'CEU': [21, 61, 17], 'YRI': [73, 33, 1], 'CHB': [7, 29, 70], 'JPT': [5, 36, 64], 'LWK': [83, 18, 2], 'ASW': [30, 27, 4], 'MXL': [14, 26, 24], 'TSI': [21, 53, 37], 'GIH': [17, 53, 35]}


In [12]:
def total(populationCode_to_geneFrequencies):
    totalList = []
    for value in populationCode_to_geneFrequencies.values():
        total = value[0] + value[1] + value[2]
        totalList.append(total)

    print(totalList)
    return totalList

In [13]:
totalGeneFrequencies = total(populationCode_to_geneFrequencies)

[100, 105, 105, 104, 100, 95, 107, 85, 96, 99, 97, 113, 100, 86, 90, 102, 102, 99, 107, 106, 105, 103, 61, 64, 111, 105]


In [14]:
def get_populationCode_to_genePercentages(populationCode_to_geneFrequencies, totalGeneFrequencies):

    populationCode_to_genePercentages = {}

    cnt = 0
    # print(totalGeneFrequencies[int(cnt)])
    for key, value in populationCode_to_geneFrequencies.items():
        totalDivider = totalGeneFrequencies[int(cnt)]
        cnt += 1
        homozygousRecessive = value[0]/totalDivider
        heterozygous = value[1]/totalDivider
        homozygousDominate = value[2]/totalDivider

        populationCode_to_genePercentages[key] = [homozygousRecessive, heterozygous, homozygousDominate]

    return populationCode_to_genePercentages


In [15]:
populationCode_to_genePercentages = get_populationCode_to_genePercentages(populationCode_to_geneFrequencies, totalGeneFrequencies)

print(populationCode_to_genePercentages)

{'GBR': [0.27, 0.53, 0.2], 'FIN': [0.37142857142857144, 0.4095238095238095, 0.21904761904761905], 'CHS': [0.05714285714285714, 0.3238095238095238, 0.6190476190476191], 'PUR': [0.20192307692307693, 0.5769230769230769, 0.22115384615384615], 'CDX': [0.1, 0.47, 0.43], 'CLM': [0.2, 0.5263157894736842, 0.2736842105263158], 'IBS': [0.2523364485981308, 0.514018691588785, 0.2336448598130841], 'PEL': [0.08235294117647059, 0.4470588235294118, 0.47058823529411764], 'PJL': [0.16666666666666666, 0.5, 0.3333333333333333], 'KHV': [0.06060606060606061, 0.3434343434343434, 0.5959595959595959], 'ACB': [0.6288659793814433, 0.35051546391752575, 0.020618556701030927], 'GWD': [0.7256637168141593, 0.23008849557522124, 0.04424778761061947], 'ESN': [0.73, 0.25, 0.02], 'BEB': [0.10465116279069768, 0.36046511627906974, 0.5348837209302325], 'MSL': [0.7666666666666667, 0.2111111111111111, 0.022222222222222223], 'STU': [0.11764705882352941, 0.43137254901960786, 0.45098039215686275], 'ITU': [0.08823529411764706, 0.52

In [16]:
def print_populationCode_to_genePercentages(populationCode_to_genePercentages):
    print("PopCode --  homozygousRecessive, heterozygous, homozygousDominate")
    for key, value in populationCode_to_genePercentages.items():
        print(key, " ----- ", format(value[0], '.3f'), format(value[1], '.3f'), format(value[2], '.3f'))

In [17]:
print_populationCode_to_genePercentages(populationCode_to_genePercentages)

PopCode --  homozygousRecessive, heterozygous, homozygousDominate
GBR  -----  0.270 0.530 0.200
FIN  -----  0.371 0.410 0.219
CHS  -----  0.057 0.324 0.619
PUR  -----  0.202 0.577 0.221
CDX  -----  0.100 0.470 0.430
CLM  -----  0.200 0.526 0.274
IBS  -----  0.252 0.514 0.234
PEL  -----  0.082 0.447 0.471
PJL  -----  0.167 0.500 0.333
KHV  -----  0.061 0.343 0.596
ACB  -----  0.629 0.351 0.021
GWD  -----  0.726 0.230 0.044
ESN  -----  0.730 0.250 0.020
BEB  -----  0.105 0.360 0.535
MSL  -----  0.767 0.211 0.022
STU  -----  0.118 0.431 0.451
ITU  -----  0.088 0.529 0.382
CEU  -----  0.212 0.616 0.172
YRI  -----  0.682 0.308 0.009
CHB  -----  0.066 0.274 0.660
JPT  -----  0.048 0.343 0.610
LWK  -----  0.806 0.175 0.019
ASW  -----  0.492 0.443 0.066
MXL  -----  0.219 0.406 0.375
TSI  -----  0.189 0.477 0.333
GIH  -----  0.162 0.505 0.333


# Conduct statistical tests on population code to GeneFrequencies 

- test with individuals
- you want counts, not frequencies
- test with continental groups 
- use chi square test 

In [29]:
from scipy.stats import chisquare

# define your dictionary
data = {'apple': 10, 'banana': 20, 'orange': 30, 'pear': 40}

# calculate the chi-square test statistic and p-value
chi_square, p_value = chisquare(list(populationCode_to_geneFrequencies.values()))

# print the results
print(chi_square, p_value)


[585.61780105 103.24054316 396.0106383 ] [1.20654579e-107 1.77563521e-011 2.03603997e-068]
