# Hapmap Generator
## By Samuel Horovatin, s.horovatin@usask.ca

A simplistic hapmap generator. Follows the format outlined here: http://augustogarcia.me/statgen-esalq/Hapmap-and-VCF-formats-and-its-integration-with-onemap/

In [2]:
import os, sys
import pandas as pd
import numpy

# Change to path of unprocessed hapmap. Format should be: Index, Name, Traits.....
RAWHAP = "./hapmaps/wheat_hapmap_new.txt"

# Change to path of 90K summary. Found mine at: :https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.0/, zip file: iwgsc_refseqv1.0_Marker_mapping_summary_2017Mar13/infinium90K.summary.gff)
# Made a slight edit to the raw summary to add headers (chrom	1	2	pos1	pos2	3	strand	4	other)
SUMMARY90K = "./hapmaps/infinium90K.summary.gff"

# Change to output file location/name
OUTPUT = "./hapmaps/wheat_hapmap_gen.txt"

# Column headers used within the fromated hapmap
COLHEADERS = ['rs#','alleles','chrom','pos','strand','assembly#','center', 'protLSID', 'assayLSID', 'panelLSID', 'QCcode']

# Column headers for none SNP columns used in the RAWHAP
SUMMARYHEADERS = ['Index', 'Name']

In [3]:
# Load in the relevant data
raw_hap_df = pd.read_csv(RAWHAP, sep='\t')
summary_90k_df = pd.read_csv(SUMMARY90K, sep='\t')

In [4]:
# Splits other column into distinct columns and does a touch of trimming
summary_90k_df[['ID', 'Name', 'coverage', 'identity']] = summary_90k_df['other'].str.split(';',expand=True) 
summary_90k_df['chrom'] = summary_90k_df['chrom'].map(lambda x: x.replace('chr', ''))
summary_90k_df['ID'] = summary_90k_df['ID'].map(lambda x: x.replace('ID=', ''))
summary_90k_df['Name'] = summary_90k_df['Name'].map(lambda x: x.replace('Name=', ''))
summary_90k_df['coverage'] = summary_90k_df['coverage'].map(lambda x: x.replace('coverage=', ''))
summary_90k_df['identity'] = summary_90k_df['identity'].map(lambda x: x.replace('identity=', ''))

In [10]:
# Generates allele options in format required for hapmap by finding all unique bases in row
# Slaps alleles in dataframe alleles_df
index_col = SUMMARYHEADERS[1]
alleles = []


alleles_df = pd.DataFrame(raw_hap_df[index_col])
rawhap_allele_data_df = raw_hap_df.loc[:, ~raw_hap_df.columns.isin(SUMMARYHEADERS)] # removes all none SNP columns
rawhap_allele_data_list = rawhap_allele_data_df.values.tolist()

for row in rawhap_allele_data_list:
    alleles.append("/".join(set(''.join(row).replace('-', '')))) 
alleles_df['alleles'] = alleles

    
print(alleles_df)

                             Name alleles
0                   BS00011231_51     G/A
1                   BS00030571_51     G/A
2                   BS00033750_51     G/A
3                   Ex_c6145_2193     G/A
4             Excalibur_c3948_235       G
...                           ...     ...
7349  wsnp_RFL_Contig4207_4836784     G/A
7350                     IACX6482     T/A
7351                     IACX3386     T/A
7352                     IAAV5266     T/A
7353                     IACX6176     T/A

[7354 rows x 2 columns]


In [39]:
# Generate new hapmap file


gen_hap_df = pd.DataFrame()
# For rs#
gen_hap_df[COLHEADERS[0]] = raw_hap_df['Name']
# For alleles
gen_hap_df = gen_hap_df.merge(alleles_df, left_on='rs#', right_on='Name')[COLHEADERS[0:2]]

summary_gen_merge_df = gen_hap_df.merge(summary_90k_df, left_on='rs#', right_on='Name') # ASSUMPTION: When creating this merge, I assume that the first common "rs#" <-> "Name" found for each row is used, as "Name" is not unique in summary_90k_df
# For chrom
gen_hap_df[COLHEADERS[2]] = summary_gen_merge_df[COLHEADERS[2]] 
# For pos
gen_hap_df[COLHEADERS[3]] = summary_gen_merge_df['pos1'] # Magic value 'pos1' comes from a email suggestion that this column contained relevant position info
# For strand
gen_hap_df[COLHEADERS[4]] = summary_gen_merge_df[COLHEADERS[4]]  
# For other columns not relevant to goal hapmap
gen_hap_df[COLHEADERS[5:len(COLHEADERS)]] = 'NA'

print(gen_hap_df)

                              rs# alleles chrom        pos strand assembly#  \
0                   BS00011231_51     G/A    3A  685357846      +        NA   
1                   BS00030571_51     G/A    4B  602404417      +        NA   
2                   BS00033750_51     G/A    Un  150797593      +        NA   
3                   Ex_c6145_2193     G/A    1D   12534470      -        NA   
4             Excalibur_c3948_235       G    5D   26791983      -        NA   
...                           ...     ...   ...        ...    ...       ...   
7349  wsnp_RFL_Contig4207_4836784     G/A    6D   62114895      -        NA   
7350                     IACX6482     T/A    6A   79268282      -        NA   
7351                     IACX3386     T/A    6A    5327273      -        NA   
7352                     IAAV5266     T/A    6D    5988099      -        NA   
7353                     IACX6176     T/A    5B  707131602      +        NA   

     center protLSID assayLSID panelLSID QCcode  
0