# Hapmap Generator
## By Samuel Horovatin, s.horovatin@usask.ca

A simple hapmap generator that follows the format outlined here: http://augustogarcia.me/statgen-esalq/Hapmap-and-VCF-formats-and-its-integration-with-onemap/

In [30]:
import os, sys
import pandas as pd
import numpy

# Path to the unprocessed hapmap. Expected column layout: Index, Name, Traits...
RAWHAP = "./hapmaps/wheat_hapmap_new.txt"

# Path to the 90K summary. Source: https://urgi.versailles.inra.fr/download/iwgsc/IWGSC_RefSeq_Annotations/v1.0/
# (zip file: iwgsc_refseqv1.0_Marker_mapping_summary_2017Mar13/infinium90K.summary.gff).
# The raw summary was edited slightly to add a header row: chrom, 1, 2, pos1, pos2, 3, strand, 4, other
SUMMARY90K = "./hapmaps/infinium90K.summary.gff"

# Output file location/name for the generated hapmap
OUTPUT = "./hapmaps/wheat_hapmap_gen.txt"

# Column headers used in the formatted hapmap
COLHEADERS = ['rs#','alleles','chrom','pos','strand','assembly#','center', 'protLSID', 'assayLSID', 'panelLSID', 'QCcode']

# Headers of the non-SNP columns in RAWHAP (every other column is treated as SNP data)
SUMMARYHEADERS = ['Index', 'Name']

In [31]:
# Read both tab-separated inputs into DataFrames: the raw hapmap first,
# then the 90K marker summary (read_table uses '\t' as its separator).
raw_hap_df = pd.read_table(RAWHAP)
summary_90k_df = pd.read_table(SUMMARY90K)

In [32]:
# Break the ';'-separated 'other' field into one column per attribute
attribute_cols = ['ID', 'Name', 'coverage', 'identity']
summary_90k_df[attribute_cols] = summary_90k_df['other'].str.split(';', expand=True)

# Drop the 'chr' text from the chromosome labels
summary_90k_df['chrom'] = summary_90k_df['chrom'].map(lambda label: label.replace('chr', ''))

# Strip the '<column>=' tag from every attribute value
for col in attribute_cols:
    # Bind the tag as a default argument so each lambda keeps its own value
    summary_90k_df[col] = summary_90k_df[col].map(
        lambda value, tag=col + '=': value.replace(tag, '')
    )

In [46]:
def Allele_Gen(df, col_headers, index):
    """Build a table of row identifiers and the alleles observed in each row.

    Every column of ``df`` that is not listed in ``col_headers`` is treated
    as a genotype-call column. For each row the calls are concatenated, the
    '-' placeholder is removed, and the distinct remaining characters are
    joined with '/' in sorted order (for example 'A/G'). A row with no
    remaining characters gets an empty string.

    Args:
        df: DataFrame holding the identifier column plus the call columns,
            whose cells are strings (missing cells are skipped).
        col_headers: Labels of the non-SNP columns to leave out of the calls.
        index: Label of the column in ``df`` that identifies each row.

    Returns:
        A new DataFrame with the ``index`` column and an 'alleles' column.
        ``df`` itself is not modified.
    """
    # Explicit copy so adding 'alleles' can never touch the caller's frame.
    df_alleles = df[[index]].copy()
    df_data = df.loc[:, ~df.columns.isin(col_headers)]  # drops all non-SNP columns

    alleles = []
    for row in df_data.values.tolist():
        # Skip missing cells (NaN/None); joining them would raise TypeError.
        calls = "".join(call for call in row if pd.notna(call))
        # Sort the distinct characters: bare set iteration order for strings
        # changes between interpreter runs, which made the output unstable.
        alleles.append("/".join(sorted(set(calls.replace('-', '')))))

    df_alleles['alleles'] = alleles
    return df_alleles
    
# Preview the marker-name / allele table built from the raw hapmap,
# excluding the non-SNP columns listed in SUMMARYHEADERS
print(Allele_Gen(raw_hap_df, SUMMARYHEADERS, 'Name'))

                             Name alleles
0                   BS00011231_51     A/G
1                   BS00030571_51     A/G
2                   BS00033750_51     A/G
3                   Ex_c6145_2193     A/G
4             Excalibur_c3948_235       G
...                           ...     ...
7349  wsnp_RFL_Contig4207_4836784     A/G
7350                     IACX6482     A/T
7351                     IACX3386     A/T
7352                     IAAV5266     A/T
7353                     IACX6176     A/T

[7354 rows x 2 columns]


In [None]:
# Start the formatted hapmap as an empty frame carrying the hapmap column headers
gen_hap_df = pd.DataFrame(columns=COLHEADERS)
# Marker names come from the raw hapmap. The frame created above only has the
# COLHEADERS columns, so reading 'Name' from gen_hap_df itself raised KeyError.
gen_hap_df['rs#'] = raw_hap_df['Name']

