In [16]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pyreadr
import os

In [36]:
def get_pos_df(num):
    weights_dir = '../data/weights'
    pos_df = pd.read_csv(f"{weights_dir}/chr{num}/chr{num}_weights.pos", sep="\t")
    return pos_df


In [51]:

def get_weights_df(num):
    weights_dir = '../data/weights'

    rdata_folder = f"{weights_dir}/chr{num}"

    rdata_files = [f for f in os.listdir(rdata_folder) if f.endswith(".RDat")]

    dataframes = []

    for file in rdata_files:
        file_path = os.path.join(rdata_folder, file)

        # print(file_path)
        
        # Load the .RData file
        result = pyreadr.read_r(file_path)
        
        # Extract the SNP weight matrix (assuming it's stored under "wgt.matrix")
        if 'wgt.matrix' in result:
            wgt_matrix = result['wgt.matrix']
            
            # Reset index so that rsID is a column
            wgt_matrix.reset_index(inplace=True)
            wgt_matrix.rename(columns={'index': 'rsID'}, inplace=True)
            wgt_matrix['WGT'] = file
            cols = ['WGT'] + [col for col in wgt_matrix.columns if col != 'WGT']
            wgt_matrix = wgt_matrix[cols]
            
            # Append to the list
            dataframes.append(wgt_matrix)
    weights_df = pd.concat(dataframes, ignore_index=True)

    return weights_df


In [52]:
get_weights_df(1)

Unnamed: 0,WGT,rsID,enet,lasso,top1
0,ENSG00000132185.12_chr1.wgt.RDat,rs4656994,0.02952,0.0,1.735461
1,ENSG00000132185.12_chr1.wgt.RDat,rs10797094,0.00000,0.0,-0.168719
2,ENSG00000132185.12_chr1.wgt.RDat,rs12094497,0.00000,0.0,0.911726
3,ENSG00000132185.12_chr1.wgt.RDat,rs2307420,0.00000,0.0,-0.471178
4,ENSG00000132185.12_chr1.wgt.RDat,rs2501873,0.00000,0.0,0.363678
...,...,...,...,...,...
12316,ENSG00000131238.10_chr1.wgt.RDat,rs7512061,0.00000,0.0,0.486667
12317,ENSG00000131238.10_chr1.wgt.RDat,rs12068587,0.00000,0.0,-0.138601
12318,ENSG00000131238.10_chr1.wgt.RDat,rs10489165,0.00000,0.0,0.066173
12319,ENSG00000131238.10_chr1.wgt.RDat,rs4660438,0.00000,0.0,-1.169569


In [54]:
pos_df = get_pos_df(1)
weights_df = get_weights_df(1)
# merged_df = pos_df.merge(weights_df, on="rsID", how="inner")
pos_df

Unnamed: 0,WGT,ID,CHR,P0,P1
0,ENSG00000182472.3_chr19.wgt.RDat,CAPN12,19,38733702,38769904
1,ENSG00000182628.7_chr17.wgt.RDat,SKA2,17,59109950,59155269
2,ENSG00000137501.11_chr11.wgt.RDat,SYTL2,11,85694223,85811141
3,ENSG00000167747.8_chr19.wgt.RDat,C19orf48,19,50797703,50804929
4,ENSG00000242779.1_chr19.wgt.RDat,ZNF702P,19,52974041,53037898
...,...,...,...,...,...
889,ENSG00000175866.10_chr17.wgt.RDat,BAIAP2,17,81035149,81116225
890,ENSG00000102554.7_chr13.wgt.RDat,KLF5,13,73054975,73077538
891,ENSG00000131238.10_chr1.wgt.RDat,PPT1,1,40072706,40097703
892,ENSG00000077420.9_chr10.wgt.RDat,APBB1IP,10,26438202,26567803


In [55]:
weights_df

Unnamed: 0,WGT,rsID,enet,lasso,top1
0,ENSG00000132185.12_chr1.wgt.RDat,rs4656994,0.02952,0.0,1.735461
1,ENSG00000132185.12_chr1.wgt.RDat,rs10797094,0.00000,0.0,-0.168719
2,ENSG00000132185.12_chr1.wgt.RDat,rs12094497,0.00000,0.0,0.911726
3,ENSG00000132185.12_chr1.wgt.RDat,rs2307420,0.00000,0.0,-0.471178
4,ENSG00000132185.12_chr1.wgt.RDat,rs2501873,0.00000,0.0,0.363678
...,...,...,...,...,...
12316,ENSG00000131238.10_chr1.wgt.RDat,rs7512061,0.00000,0.0,0.486667
12317,ENSG00000131238.10_chr1.wgt.RDat,rs12068587,0.00000,0.0,-0.138601
12318,ENSG00000131238.10_chr1.wgt.RDat,rs10489165,0.00000,0.0,0.066173
12319,ENSG00000131238.10_chr1.wgt.RDat,rs4660438,0.00000,0.0,-1.169569


In [57]:
merged_df = pos_df.merge(weights_df, on="WGT", how="inner")
# merged_df = merged_df[(merged_df["P0"] <= merged_df["position"]) & (merged_df["P1"] >= merged_df["position"])]
merged_df

Unnamed: 0,WGT,ID,CHR,P0,P1,rsID,enet,lasso,top1
0,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs4656994,0.02952,0.0,1.735461
1,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs10797094,0.00000,0.0,-0.168719
2,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs12094497,0.00000,0.0,0.911726
3,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs2307420,0.00000,0.0,-0.471178
4,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs2501873,0.00000,0.0,0.363678
...,...,...,...,...,...,...,...,...,...
9061,ENSG00000131238.10_chr1.wgt.RDat,PPT1,1,40072706,40097703,rs7512061,0.00000,0.0,0.486667
9062,ENSG00000131238.10_chr1.wgt.RDat,PPT1,1,40072706,40097703,rs12068587,0.00000,0.0,-0.138601
9063,ENSG00000131238.10_chr1.wgt.RDat,PPT1,1,40072706,40097703,rs10489165,0.00000,0.0,0.066173
9064,ENSG00000131238.10_chr1.wgt.RDat,PPT1,1,40072706,40097703,rs4660438,0.00000,0.0,-1.169569


In [58]:
def pos_weights_matrix(num):
    pos_df = get_pos_df(num)
    weights_df = get_weights_df(num)
    merged_df = pos_df.merge(weights_df, on="WGT", how="inner")
    # merged_df = merged_df[(merged_df["P0"] <= merged_df["position"]) & (merged_df["P1"] >= merged_df["position"])]
    return merged_df

In [59]:
pd.concat([pos_weights_matrix(num) for num in range(1, 23)])

Unnamed: 0,WGT,ID,CHR,P0,P1,rsID,enet,lasso,top1
0,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs4656994,0.029520,0.0,1.735461
1,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs10797094,0.000000,0.0,-0.168719
2,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs12094497,0.000000,0.0,0.911726
3,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs2307420,0.000000,0.0,-0.471178
4,ENSG00000132185.12_chr1.wgt.RDat,FCRLA,1,161706971,161714352,rs2501873,0.000000,0.0,0.363678
...,...,...,...,...,...,...,...,...,...
2768,ENSG00000100311.12_chr22.wgt.RDat,PDGFB,22,39223358,39244751,rs4820389,0.000000,0.0,-1.206085
2769,ENSG00000100311.12_chr22.wgt.RDat,PDGFB,22,39223358,39244751,rs12484428,0.000000,0.0,-0.219725
2770,ENSG00000100311.12_chr22.wgt.RDat,PDGFB,22,39223358,39244751,rs9611212,0.000000,0.0,-0.688401
2771,ENSG00000100311.12_chr22.wgt.RDat,PDGFB,22,39223358,39244751,rs5757783,-0.031469,0.0,-1.360252
