In [23]:
import pandas as pd 
import numpy as np
import os
from glob import glob

In [24]:
db_dir = "/cellar/users/zkoch/methylation_and_mutation/data/meQTL"

In [None]:
"""
Put each DB into a parquet file with columns: cpg: strings of cpg ids, beta: float beta values, snp: strings of snp ids locations chr:start
"""

In [19]:
goDMC = pd.read_parquet(os.path.join(db_dir, "goDMC_meQTL/goDMC_meQTLs_for_mutClock.parquet"))

In [20]:
goDMC.columns = ["cpg", "beta", "snp"]

In [22]:
goDMC.to_parquet(os.path.join(db_dir, "goDMC_meQTL/goDMC_meQTLs_for_mutClock.parquet"))

### Huan

In [24]:
all_haun_dfs = []
for fn in glob(os.path.join(db_dir, "huan_meqtl", "*.csv.gz")):
    one_huan_df = pd.read_csv(fn, sep = ",")
    keep_cols = ["CpG", "beta", "Marker"]
    # rename columns
    one_huan_df = one_huan_df[keep_cols]
    one_huan_df.columns = ["cpg", "beta", "snp"]
    all_haun_dfs.append(one_huan_df)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [25]:
huan = pd.concat(all_haun_dfs)
huan.reset_index(drop = True, inplace = True)

In [26]:
huan.to_parquet(os.path.join(db_dir, "huan_meqtl/huan_meQTLs_for_mutClock.parquet"))

### Villicana

In [27]:
villicana_df = pd.read_csv(os.path.join(db_dir, "villicana_meqtls", "top_meQTL.txt"), sep = "\t")
villicana_df = villicana_df[['CpG', 'Beta', 'Top SNP']]

In [None]:
# split top SNP of _ and take first part
villicana_df['snp'] = villicana_df['Top SNP'].apply(lambda x: x.split("_")[0])
# drop top SNP
villicana_df.drop('Top SNP', axis = 1, inplace = True)
# rename columns
villicana_df.columns = ['cpg', 'beta', 'snp']

In [35]:
villicana_df.to_parquet(os.path.join(db_dir, "villicana_meqtls/villicana_meQTLs_for_mutClock.parquet"))

### Hannon

In [36]:
hannon_df = pd.read_csv(os.path.join(db_dir, "hannon_meqtls", "All_Imputed_BonfSignificant_mQTLs.csv.gz"), sep = ",")

In [38]:
hannon_df = hannon_df[['ProbeID', 'beta', 'SNP_Chr', 'SNP_BP']]

In [40]:
hannon_df['snp'] = hannon_df['SNP_Chr'].astype(str) + ":" + hannon_df['SNP_BP'].astype(str)
hannon_df.drop(['SNP_Chr', 'SNP_BP'], axis = 1, inplace = True)
hannon_df.columns = ['cpg', 'beta', 'snp']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hannon_df['snp'] = hannon_df['SNP_Chr'].astype(str) + ":" + hannon_df['SNP_BP'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [42]:
hannon_df.to_parquet(os.path.join(db_dir, "hannon_meqtls/hannon_meQTLs_for_mutClock.parquet"))

### Arcelus

In [43]:
fns = glob(os.path.join(db_dir, "arcelus_meqtls/GENCORD2_ASSOCIATIONS/*MQTL*"))

In [45]:
all_arcelus_dfs = []
for fn in fns:
    df = pd.read_csv(fn, sep = "\t")
    df = df[['METHYL_ID', 'rvalue', 'SNP_chromosome', 'SNP_location']]
    df['snp'] = df['SNP_chromosome'].astype(str) + ":" + df['SNP_location'].astype(str)
    df.drop(['SNP_chromosome', 'SNP_location'], axis = 1, inplace = True)
    df.columns = ['cpg', 'beta', 'snp']
    all_arcelus_dfs.append(df)

In [46]:
all_arcelus_df = pd.concat(all_arcelus_dfs)
all_arcelus_df.reset_index(drop = True, inplace = True)

In [52]:
all_arcelus_df.to_parquet(os.path.join(db_dir, "arcelus_meqtls/arcelus_meQTLs_for_mutClock.parquet"))

### mqtl db: Need to get SNP mapping file

In [4]:
snp_map = pd.read_csv(os.path.join(db_dir, "mqtl_db","ariesmqtlsnps.bim"), sep = "\t", header = None)
snp_map.columns = ['chr', 'snp', '?', 'pos', 'a1', 'a2']

In [5]:
fns = glob(os.path.join(db_dir, "mqtl_db", "*.tab"))
all_mqtl_dfs = []
for fn in fns:
    one_mqtl_df = pd.read_csv(fn, sep = "\t")
    one_mqtl_df = one_mqtl_df[['gene', 'beta', 'SNP']]
    all_mqtl_dfs.append(one_mqtl_df)

In [6]:
all_mqtl_df = pd.concat(all_mqtl_dfs)
all_mqtl_df.reset_index(drop = True, inplace = True)

In [7]:
# map SNP to chr and pos 
all_mqtl_df = all_mqtl_df.merge(snp_map, left_on = 'SNP', right_on = 'snp', how = 'left')

In [8]:
all_mqtl_df.drop(['snp', 'SNP', 'a1', 'a2', '?'], axis = 1, inplace = True)

In [13]:
all_mqtl_df.dropna(inplace = True)

In [14]:
# convert chr and pos columns from float to int
all_mqtl_df['chr'] = all_mqtl_df['chr'].astype(int)
all_mqtl_df['pos'] = all_mqtl_df['pos'].astype(int)


In [15]:
all_mqtl_df['snp'] = all_mqtl_df['chr'].astype(str) + ":" + all_mqtl_df['pos'].astype(str)

In [16]:
all_mqtl_df

Unnamed: 0,gene,beta,chr,pos,snp
0,cg12500956,0.874594,2,135428054,2:135428054
1,cg12500956,0.874594,2,135429002,2:135429002
2,cg12500956,0.874594,2,135430621,2:135430621
3,cg12500956,0.874594,2,135433023,2:135433023
4,cg12500956,0.874594,2,135434849,2:135434849
...,...,...,...,...,...
43956472,cg05127548,-0.261207,1,33789968,1:33789968
43956473,cg25368083,-0.421091,1,33137199,1:33137199
43956474,cg14098468,-0.218124,1,31318604,1:31318604
43956475,cg01815720,0.251016,1,31308177,1:31308177


In [17]:
all_mqtl_df.drop(['chr', 'pos'], axis = 1, inplace = True)

In [27]:
all_mqtl_df.columns = ['cpg', 'beta', 'snp']

In [28]:
all_mqtl_df.to_parquet(os.path.join(db_dir, "mqtl_db/mqtl_db_for_mutClock.parquet"))

In [30]:
all_mqtl_df = pd.read_parquet(os.path.join(db_dir, "mqtl_db/mqtl_db_for_mutClock.parquet"))

In [31]:
all_mqtl_df

Unnamed: 0,cpg,beta,snp
0,cg12500956,0.874594,2:135428054
1,cg12500956,0.874594,2:135429002
2,cg12500956,0.874594,2:135430621
3,cg12500956,0.874594,2:135433023
4,cg12500956,0.874594,2:135434849
...,...,...,...
43956472,cg05127548,-0.261207,1:33789968
43956473,cg25368083,-0.421091,1:33137199
43956474,cg14098468,-0.218124,1:31318604
43956475,cg01815720,0.251016,1:31308177
