# Imports and set up

In [1]:
import pandas as pd
import os
import glob

In [2]:
WORKING_DIR = '/home/projects/genomics'

os.chdir(WORKING_DIR)

### Install VEP using docker

You need docker to run this mnotebook! We need it to install VEP

# Get the data

### Copy just 10 files to try out

WARNING: this code uses bash and runs on linux only!

In [3]:
%%bash
rm -rf sample_summary_stats
mkdir sample_summary_stats

In [4]:
os.chdir('/home/projects/genomics/summary_stats')

In [5]:
! cp $(ls | head -10) /home/projects/genomics/sample

ls: write error: Broken pipe


In [6]:
os.chdir(WORKING_DIR)

### We need .h file

In [40]:
! ls sample

34662886-GCST90077560-EFO_0004326-Build38.f.tsv.gz
34662886-GCST90077560-EFO_0004326.h.tsv.gz
34662886-GCST90077569-EFO_0009817-Build38.f.tsv.gz
34662886-GCST90077569-EFO_0009817.h.tsv.gz
34662886-GCST90077570-EFO_0009817-Build38.f.tsv.gz
34662886-GCST90077570-EFO_0009817.h.tsv.gz
34662886-GCST90077571-EFO_0009817-Build38.f.tsv.gz
34662886-GCST90077571-EFO_0009817.h.tsv.gz
34662886-GCST90077572-EFO_0009817-Build38.f.tsv.gz
34662886-GCST90077572-EFO_0009817.h.tsv.gz


In [41]:
TSV_FILE_PATH = 'sample/34662886-GCST90077560-EFO_0004326.h.tsv.gz'

### Import to pandas

In [44]:
df = pd.read_csv(TSV_FILE_PATH, sep='\t', compression='gzip')

In [45]:
df.head()

Unnamed: 0,hm_variant_id,hm_rsid,hm_chrom,hm_pos,hm_other_allele,hm_effect_allele,hm_beta,hm_odds_ratio,hm_ci_lower,hm_ci_upper,...,other_allele,effect_allele,beta,ci_lower,ci_upper,p_value,effect_allele_frequency,standard_error,odds_ratio,variant_id
0,1_930158_C_T,rs1381099827,1,930158,C,T,-0.192142,,-0.905992,0.521708,...,C,T,-0.192142,-0.905992,0.521708,0.597811,9e-06,0.999991,,rs1381099827
1,1_930165_G_A,rs201186828,1,930165,G,A,-0.021754,,-0.33229,0.288782,...,G,A,-0.021754,-0.33229,0.288782,0.890793,4.5e-05,0.999955,,rs201186828
2,1_930204_G_A,rs148711625,1,930204,G,A,0.105264,,-0.210775,0.421303,...,G,A,0.105264,-0.210775,0.421303,0.513879,4.4e-05,0.999956,,rs148711625
3,1_930215_A_G,rs903331232,1,930215,A,G,0.081907,,-0.422869,0.586683,...,A,G,0.081907,-0.422869,0.586683,0.750462,1.7e-05,0.999983,,rs903331232
4,1_930245_G_A,rs146327803,1,930245,G,A,0.252748,,-0.192428,0.697924,...,G,A,0.252748,-0.192428,0.697924,0.265808,2.2e-05,0.999978,,rs146327803


In [46]:
cols_for_annotation = ['chromosome', 'base_pair_location', 'other_allele', 'effect_allele']
df_to_vep = df[cols_for_annotation].copy()
df_to_vep.head()

Unnamed: 0,chromosome,base_pair_location,other_allele,effect_allele
0,1,930158,C,T
1,1,930165,G,A
2,1,930204,G,A
3,1,930215,A,G
4,1,930245,G,A


In [47]:
df_to_vep['start'] = df_to_vep['base_pair_location']
df_to_vep['end'] = df_to_vep['base_pair_location']
del df_to_vep['base_pair_location']

In [48]:
def concat_alleles(row):
    return row['effect_allele'] + '/' + row['other_allele']

In [49]:
df_to_vep['allele'] = df_to_vep.apply(concat_alleles, axis='columns')

In [50]:
del df_to_vep['effect_allele']
del df_to_vep['other_allele']

In [51]:
df_to_vep.head()

Unnamed: 0,chromosome,start,end,allele
0,1,930158,930158,T/C
1,1,930165,930165,A/G
2,1,930204,930204,A/G
3,1,930215,930215,G/A
4,1,930245,930245,A/G
