In [1]:
from helpers.getpaths import *
from helpers.ldlink import *

In [2]:
from helpers.ldlink import *

class Variant():
    """A query variant plus, optionally, its LD proxies.

    Attributes:
        rsid: variant identifier passed to ``ldproxy``.
        chrom: chromosome of the variant.
        pos: position of the variant.
        EA: first allele label (presumably the effect allele — confirm with caller).
        OA: second allele label (presumably the other allele — confirm with caller).
        LDblock: ``None`` until ``set_LDblock`` has been called, afterwards the
            formatted proxy table.
    """

    # Columns kept in the formatted LD table, in output order.
    _LD_COLUMNS = ['RS_Number', 'chrom', 'hg38_pos', 'EA','OA', 'R2', 'MAF']

    def __init__(self, rsid:str, chrom:int, pos:int, EA:str, OA:str):
        self.rsid = rsid
        self.chrom = chrom
        self.pos = pos
        self.EA = EA
        self.OA = OA
        # Defined up front so the attribute exists before set_LDblock() runs.
        self.LDblock = None

    def get_rsid(self):
        """Return the variant identifier."""
        return self.rsid

    def get_fullpos(self):
        """Return ``(chrom, pos)`` as a tuple."""
        return self.chrom, self.pos

    def get_pos(self):
        """Return the position."""
        return self.pos

    def get_chrom(self):
        """Return the chromosome."""
        return self.chrom

    def get_EA(self):
        """Return the EA allele."""
        return self.EA

    def get_OA(self):
        """Return the OA allele."""
        return self.OA

    def map_alleles(self, x, EA, OA):
        """Map the query alleles onto the alleles paired with them in ``x``.

        Args:
            x: string of two comma-separated ``left=right`` pairs, e.g. ``"A=A,C=G"``.
            EA: allele expected on the left-hand side of one pair.
            OA: allele expected on the left-hand side of the other pair.

        Returns:
            ``{EA: <right of its pair>, OA: <right of the other pair>}``.
            The dict is empty when ``x`` is not a string, does not hold two
            well-formed pairs, or its first left-hand allele is neither
            ``EA`` nor ``OA``.
        """
        d = {}
        if not isinstance(x, str):
            return d
        pairs = [pair.split("=") for pair in x.split(",")]
        # Anything that is not two "left=right" pairs cannot be mapped.
        if len(pairs) < 2 or len(pairs[0]) < 2 or len(pairs[1]) < 2:
            return d
        a1, a2 = pairs[0], pairs[1]
        if a1[0]==EA:
            d[EA]=a1[1]
            d[OA]=a2[1]
        elif a1[0]==OA:
            d[OA] = a1[1]
            d[EA] = a2[1]
        return d

    @staticmethod
    def _parse_chrom(coord):
        """Return the chromosome from a ``chrN:pos`` string (int when numeric)."""
        label = coord.split(":")[0]
        if label.lower().startswith("chr"):
            label = label[3:]
        # Use the whole label, not just its last character, so chr10-chr22 parse
        # correctly; non-numeric labels (e.g. X) are returned as strings.
        return int(label) if label.isdigit() else label

    def _format_ldproxy(self, df):
        """Add EA/OA/chrom/hg38_pos columns to a proxy table and keep ``_LD_COLUMNS``.

        ``df`` must provide the columns ``RS_Number``, ``Coord``,
        ``Correlated_Alleles``, ``R2`` and ``MAF``. The input frame is not
        modified. Rows whose alleles cannot be mapped get ``None`` in EA/OA.
        """
        # Map against this instance's alleles rather than notebook-level globals.
        allele_maps = [self.map_alleles(a, self.EA, self.OA)
                       for a in df['Correlated_Alleles']]
        out = df.assign(
            EA=[m.get(self.EA) for m in allele_maps],
            OA=[m.get(self.OA) for m in allele_maps],
            chrom=[self._parse_chrom(c) for c in df['Coord']],
            hg38_pos=[int(c.split(":")[1]) for c in df['Coord']],
        )
        return out[self._LD_COLUMNS]

    def set_LDblock(self):
        """Fetch proxies for ``self.rsid`` via ``ldproxy`` and store them in ``self.LDblock``."""
        self.LDblock = self._format_ldproxy(ldproxy(self.rsid))
        
# Example query variant: identifier, chromosome, position and the two alleles.
# NOTE(review): the genome build of `pos` is not stated here — confirm.
rsid = 'rs10103048'

chrom = 8
pos = 130602281
# Presumably EA = effect allele and OA = other allele — confirm.
EA = 'A'
OA = 'C'

# Build the Variant object from the values above.
snp = Variant(rsid, chrom, pos, EA,OA)



In [3]:
# Build the resource-name -> location mapping under the "/media/" root.
# get_paths is presumably provided by the helpers.getpaths star import.
# NOTE(review): in the saved output below, the 'eqtlgen_cis' and
# 'eqtlgen_trans' values end with a trailing space — confirm whether that is
# intended, since the strings are used directly as file paths.
paths = get_paths("/media/")

# Display the mapping.
paths

{'eqtl_cat': '/media/cbio3/data/eQTL_DB/',
 'dbsnp': '/media/cbio3/data/dbSNP/',
 'eqtl_tokyo': '/media/cbio3/projects/Zain_2021/ImmuNEXT_Japan_Cell2021/eQTL_summarystats/',
 'ge_tokyo': '/media/cbio3/projects/Zain_2021/ImmunexUT_GE/E-GEAD-397.processed/tpm/',
 'eqtlgen_cis': '/media/cbio3/projects/Zain_2021/eQTLgen/data/2019-12-11-cis-eQTLsFDR0.05-ProbeLevel-CohortInfoRemoved-BonferroniAdded_sorted.txt ',
 'eqtlgen_trans': '/media/cbio3/projects/Zain_2021/eQTLgen/data/2018-09-04-trans-eQTLsFDR0.05-CohortInfoRemoved-BonferroniAdded.txt ',
 'tokyo_alleles': '/media/cbio3/projects/Zain_2021/ImmuNEXT_Japan_Cell2021/metadata/full_allele_defs.txt',
 'tokyo_ge': '/media/cbio3/projects/Zain_2021/ImmunexUT_GE/E-GEAD-397.processed/tpm/',
 'ensembl': '/media/cbio3/data/ensembl_biomart/gene_aliases.txt',
 'ucsc': '/media/cbio3/data/ensembl_biomart/hg38_genes_ucsc.txt'}

In [4]:
# Re-assigns the same rsID that was already set when the example Variant was built.
rsid = 'rs10103048'

In [4]:
# now each of these resources needs different methods for access

import os
import pandas as pd 
import numpy as np

# TODO: this is waiting on some kind of tabix indexing of the file

def get_eqtlgen_cis(snp_list, paths=paths):
    """Return the eQTLGen cis-eQTL rows whose SNP identifier is in ``snp_list``.

    Args:
        snp_list: iterable of rsIDs to keep.
        paths: mapping that holds the file location under ``'eqtlgen_cis'``
            (call ``get_paths`` first to build it).

    Returns:
        DataFrame with the matching rows; empty when nothing matches.

    Raises:
        KeyError: if the file has no ``SNP`` column.
    """
    path = paths['eqtlgen_cis']
    # The configured path may carry stray surrounding whitespace; fall back to
    # the stripped form only when the literal path does not exist.
    if not os.path.exists(path):
        path = path.strip()

    wanted = set(snp_list)
    # Stream the file in chunks so the full table never has to sit in memory.
    # NOTE(review): assumes the rsID column is named 'SNP' — confirm against
    # the file header.
    hits = [
        chunk[chunk['SNP'].isin(wanted)]
        for chunk in pd.read_csv(path, sep='\t', chunksize=1000000)
    ]
    if not hits:
        return pd.DataFrame()
    return pd.concat(hits)


In [5]:
def get_tokyo_cis(snp_list,paths=paths):
    """Collect rows matching ``snp_list`` from every summary-statistics file.

    Every file in ``paths['eqtl_tokyo']`` is read as a tab-separated table and
    filtered on its ``Variant_ID`` column. A ``cell`` column is added holding
    the file name up to the first ``"_cond"``.

    Args:
        snp_list: variant identifiers to keep.
        paths: mapping that holds the directory under ``'eqtl_tokyo'``.

    Returns:
        One DataFrame with the matches from all files (files visited in sorted
        name order); an empty DataFrame when the directory has no files.
    """
    sumstat_dir = paths['eqtl_tokyo']
    # Sort so the row order of the result does not depend on os.listdir order.
    sumstat_files = [sumstat_dir + name for name in sorted(os.listdir(sumstat_dir))]

    df_list = []

    for file in sumstat_files:
        df = pd.read_csv(file, sep='\t')

        cell = file.split("/")[-1].split("_cond")[0]

        # assign() returns a new frame, so the label is never written onto a
        # filtered view of the original table.
        df_list.append(df[df['Variant_ID'].isin(snp_list)].assign(cell=cell))

    # pd.concat raises on an empty list; return an empty frame instead.
    if not df_list:
        return pd.DataFrame()
    return pd.concat(df_list)

In [6]:
# eQTL Catalogue - need to remap all my credible set summary files to proper gene and variant names using the ref files provided 
# downloading them now 

def get_eqtlcat_credset(snp_list, paths=paths):
    """Placeholder for the credible-set lookup: not implemented, returns None."""
    return None

# Every file in the credible_sets sub-directory.
all_files = os.listdir(paths['eqtl_cat'] + 'credible_sets')

# The first .csv file in the resource root is read as the metadata table;
# its Study column gives the study names to look for in file names.
meta_csvs = [i for i in os.listdir(paths['eqtl_cat']) if i.split(".")[-1]=="csv"]
meta = pd.read_csv(paths['eqtl_cat'] + meta_csvs[0]).Study.tolist()

# Keep each credible-set file once if its name contains any study name.
# any() avoids adding the same file again for every metadata row that matches.
files = [
    paths['eqtl_cat']+'credible_sets/' + i
    for i in all_files
    if any(m in i for m in meta)
]

# Preview the first matching file.
x = pd.read_csv(files[0], sep='\t')
x.head()

Unnamed: 0,molecular_trait_id,variant,chromosome,position,ref,alt,cs_id,cs_index,finemapped_region,pip,z,cs_min_r2,cs_avg_r2,cs_size,posterior_mean,posterior_sd,cs_log10bf
0,ENSG00000188290.10_1_1000112_1000172,chr1_1000079_A_G,1,1000079,A,G,ENSG00000188290.10_1_1000112_1000172_L1,L1,chr1:142-2000142,0.037619,-5.673441,0.546196,0.790143,19,-0.028499,0.146938,4.039558
1,ENSG00000188290.10_1_1000112_1000172,chr1_1000112_G_T,1,1000112,G,T,ENSG00000188290.10_1_1000112_1000172_L1,L1,chr1:142-2000142,0.037339,-5.671394,0.546196,0.790143,19,-0.0283,0.146485,4.039558
2,ENSG00000188290.10_1_1000112_1000172,chr1_1000291_C_G,1,1000291,C,G,ENSG00000188290.10_1_1000112_1000172_L1,L1,chr1:142-2000142,0.084505,5.849436,0.546196,0.790143,19,0.07584,0.254031,4.039558
3,ENSG00000188290.10_1_1000112_1000172,chr1_1000453_C_G,1,1000453,C,G,ENSG00000188290.10_1_1000112_1000172_L1,L1,chr1:142-2000142,0.037265,-5.670987,0.546196,0.790143,19,-0.028206,0.146154,4.039558
4,ENSG00000188290.10_1_1000112_1000172,chr1_1000814_A_G,1,1000814,A,G,ENSG00000188290.10_1_1000112_1000172_L1,L1,chr1:142-2000142,0.039019,-5.683077,0.546196,0.790143,19,-0.029735,0.150412,4.039558


In [8]:
# gene expression lookups in ImmunexUT data

# multiple donors for every cell type

# extract list of numbers for each cell type. Can use this to construct boxplots, means, etc.


# Full path of every file in the expression directory.
ge_dir = paths['ge_tokyo']
g = [ge_dir + fname for fname in os.listdir(ge_dir)]

# Display the first table.
pd.read_csv(g[0], sep='\t')

Unnamed: 0,Gene_name,NW100766,NW101334,NW101409,NW102537,NW103721,NW103984,NW107509,NW108551,NW109068,...,NW189242,NW191139,NW191977,NW192678,NW192956,NW194176,NW197021,NW199501,NW199834,NW199907
0,DPM1,71.026893,85.991016,109.480961,69.982619,79.777627,66.310846,73.788173,67.735969,66.084273,...,100.272661,67.969278,66.091513,80.695705,68.617867,104.447285,74.442871,71.514384,60.559690,64.104525
1,SCYL3,10.497253,8.561123,9.890112,13.879714,12.345047,9.779591,14.080985,11.952519,14.041186,...,10.968706,12.112270,14.370151,8.736413,13.935659,8.617643,11.142311,15.275693,8.200183,11.187171
2,C1orf112,9.231251,6.386137,6.472582,9.386585,8.818284,6.084325,9.681862,9.659316,10.725835,...,9.563534,7.300959,10.581185,6.783008,10.602472,8.989581,9.064241,8.132379,6.853539,10.224853
3,FGR,1169.062819,1517.581161,1303.774565,1093.915984,1291.692031,1328.939290,1402.950905,1040.949908,1354.458503,...,1294.194750,1110.638627,1281.460222,1436.458546,1324.341811,1182.974232,1446.580401,1423.035437,1131.913962,1281.657631
4,CFH,0.000000,0.000000,0.000000,0.137726,0.417020,0.178547,0.000000,0.140630,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.138282,0.053769,0.000000,0.293720,0.000000,0.145480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53339,AL139254.3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
53340,AL591163.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
53341,AL589702.1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
53342,AL034417.4,0.413797,0.130119,0.074015,0.821797,1.066418,0.426147,0.940324,0.755207,0.500826,...,0.443912,0.584411,0.790514,0.633443,0.742598,0.705829,0.238073,0.525778,0.924488,0.347224


In [47]:
import gget

# NOTE(review): gget.info is documented as taking Ensembl IDs, and its second
# positional parameter does not appear to be a species name — confirm this call
# against the installed gget version.
gget.info(["AL513220.1"], "homo_sapiens")



In [9]:
# remapping gene names to ensembl IDs for Gex lookups

def gene2ensembl(gene, paths=paths):
    """Return the Ensembl ID for a gene name, falling back to its synonyms.

    The mapping table at ``paths['ensembl']`` is read as tab-separated text and
    its columns are renamed to ensembl / version / name / synonym / HGNC.

    Args:
        gene: gene name to look up.
        paths: mapping that holds the table location under ``'ensembl'``.

    Returns:
        The Ensembl ID, or ``None`` when ``gene`` matches neither the name nor
        the synonym column.

    Raises:
        AssertionError: if the matching rows point to more than one Ensembl ID.
    """
    mapping = pd.read_csv(paths['ensembl'], sep='\t')
    mapping.columns = ['ensembl', 'version', 'name', 'synonym', 'HGNC']

    # Try the name column first, then the synonym column.
    for column in ('name', 'synonym'):
        matches = mapping.loc[mapping[column] == gene, 'ensembl']
        if len(matches) > 0:
            # Several rows may describe the same gene (e.g. one per synonym),
            # so compare distinct IDs rather than row counts.
            ids = list(set(matches.tolist()))
            assert len(ids)==1, "%r maps to %d Ensembl IDs via %s" % (gene, len(ids), column)
            # Always return the Ensembl ID, including for synonym matches.
            return ids[0]

    return None
    
def ensembl2gene(ensembl, paths=paths):
    """Return the gene name recorded for an Ensembl ID.

    Reads the tab-separated table at ``paths['ensembl']``, renames its columns
    to ensembl / version / name / synonym / HGNC and looks up ``ensembl``.
    An AssertionError is raised unless exactly one distinct name is found.
    """
    mapping = pd.read_csv(paths['ensembl'], sep='\t')
    mapping.columns = ['ensembl', 'version', 'name', 'synonym', 'HGNC']

    # Names on the rows for this ID, collapsed to distinct values.
    matched_names = mapping.loc[mapping['ensembl'] == ensembl, 'name']
    distinct = list(set(matched_names.tolist()))
    assert len(distinct)==1

    return distinct[0]



In [42]:
# Load the tab-separated table at paths['ucsc'] and display it.
# NOTE(review): the saved output below shows HGNC-style columns (HGNC ID,
# Approved symbol, ...), while the saved `y.columns` output further down lists
# different fields ('#chrom', 'chromStart', ...). The two outputs were probably
# produced from different file contents — re-run to confirm.
y = pd.read_csv(paths['ucsc'], sep ='\t')
y

Unnamed: 0,HGNC ID,Approved symbol,Approved name,Previous symbol,Alias symbol,Ensembl gene ID,NCBI gene ID
0,HGNC:5,A1BG,alpha-1-B glycoprotein,,,ENSG00000121410,1.0
1,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,NCRNA00181,FLJ23569,ENSG00000268895,503538.0
2,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,A1BGAS,FLJ23569,ENSG00000268895,503538.0
3,HGNC:37133,A1BG-AS1,A1BG antisense RNA 1,A1BG-AS,FLJ23569,ENSG00000268895,503538.0
4,HGNC:24086,A1CF,APOBEC1 complementation factor,,ACF,ENSG00000148584,29974.0
...,...,...,...,...,...,...,...
72674,HGNC:29027,ZZEF1,zinc finger ZZ-type and EF-hand domain contain...,,KIAA0399,ENSG00000074755,23140.0
72675,HGNC:29027,ZZEF1,zinc finger ZZ-type and EF-hand domain contain...,,ZZZ4,ENSG00000074755,23140.0
72676,HGNC:29027,ZZEF1,zinc finger ZZ-type and EF-hand domain contain...,,FLJ10821,ENSG00000074755,23140.0
72677,HGNC:24523,ZZZ3,zinc finger ZZ-type containing 3,,DKFZP564I052,ENSG00000036549,26009.0


In [43]:
# Rows for one Ensembl gene ID; the saved output has no matching rows.
y[y['Ensembl gene ID']=='ENSG00000284747']

Unnamed: 0,HGNC ID,Approved symbol,Approved name,Previous symbol,Alias symbol,Ensembl gene ID,NCBI gene ID


In [44]:
# Rows for an ID that is present; the saved output has one row per alias symbol.
y[y['Ensembl gene ID']=='ENSG00000148584']

Unnamed: 0,HGNC ID,Approved symbol,Approved name,Previous symbol,Alias symbol,Ensembl gene ID,NCBI gene ID
4,HGNC:24086,A1CF,APOBEC1 complementation factor,,ACF,ENSG00000148584,29974.0
5,HGNC:24086,A1CF,APOBEC1 complementation factor,,ASP,ENSG00000148584,29974.0
6,HGNC:24086,A1CF,APOBEC1 complementation factor,,ACF64,ENSG00000148584,29974.0
7,HGNC:24086,A1CF,APOBEC1 complementation factor,,ACF65,ENSG00000148584,29974.0
8,HGNC:24086,A1CF,APOBEC1 complementation factor,,APOBEC1CF,ENSG00000148584,29974.0


In [25]:
# List the column names of y.
# NOTE(review): this cell's execution count (25) is lower than that of the
# cells above (42-44) and its saved output shows different columns, so the
# output is likely stale — re-run to confirm.
y.columns

Index(['#chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
       'thickStart', 'thickEnd', 'reserved', 'blockCount', 'blockSizes',
       'chromStarts', 'name2', 'cdsStartStat', 'cdsEndStat', 'exonFrames',
       'type', 'geneName', 'geneName2', 'geneType', 'transcriptClass',
       'source', 'transcriptType', 'tag', 'level', 'tier'],
      dtype='object')

In [46]:
# NOTE(review): gget.info is documented as taking Ensembl IDs; "AL513220.1"
# does not look like one — confirm the intended lookup.
gget.info("AL513220.1")

