# Imputed gene expression 
## Introduction

In this notebook, there are several sections that describe the refactored function.
- GenotypeDataset class
    - Revised full function
    - Decomposing code for testing
      + get_all_dosages
      + get_reference


In [None]:
# Import libraries

import numpy as np
import pandas as pd
import sqlite3
import sys
import os
import re
from pathlib import Path
from tqdm import tqdm

#### <span style="color:orange">REFACTORED: the `GenotypeDataset` class has been refactored into the `imputed_predixcan.py` module.</span>
+ <span style="color:orange">REFACTORED</span> Revised full function

In [None]:
class WeightsDB():
    """Minimal wrapper around a SQLite connection used to read a weights file."""

    def __init__(self, beta_file):
        """Open a SQLite connection to ``beta_file`` and keep it on ``self.conn``."""
        self.conn = sqlite3.connect(beta_file)

    def query(self, sql, args=None):
        """Lazily yield the rows returned by ``sql``.

        Args:
            sql: SQL statement to execute.
            args: Optional bind parameters. When falsy (``None`` or empty)
                the statement is executed without parameters.

        Yields:
            One row tuple at a time, as produced by the SQLite cursor.
        """
        cursor = self.conn.cursor()
        rows = cursor.execute(sql, args) if args else cursor.execute(sql)
        yield from rows


class GenotypeDataset():
    """Static helpers for reading dosage files, model rsids and the SNP reference."""

    @staticmethod
    def get_all_dosages(genoytpe_dir,
                        dosage_prefix,
                        dosage_end_prefix,
                        unique_rsids,
                        reference_file):
        """Yield dosage rows whose position maps to one of ``unique_rsids``.

        Every file in ``genoytpe_dir`` whose name starts with
        ``dosage_prefix`` and ends with ``dosage_end_prefix`` is read in
        sorted order. The first line of each file is skipped as a header;
        the remaining lines are split on whitespace, with field 0 compared
        against ``"<CHR>:<BP>"`` built from the reference, field 2 yielded as
        the allele, and fields 3 onwards parsed as float dosages.

        Args:
            genoytpe_dir: ``pathlib.Path`` of the directory holding the
                dosage files (``iterdir()`` is called on it).
            dosage_prefix: Required start of a dosage file name.
            dosage_end_prefix: Required end of a dosage file name.
            unique_rsids: List-like of rsids to keep.
            reference_file: DataFrame with ``CHR``, ``BP`` and ``SNP`` columns.

        Yields:
            ``(rsid, refallele, dosage_row)`` where ``dosage_row`` is a
            ``numpy.float64`` array.

        Raises:
            IndexError: If a selected file name contains no digits before its
                first ``"."``, or a non-blank data line has fewer than three
                fields.
        """
        # The rsid filter does not depend on the chromosome, so apply it once
        # instead of once per dosage file.
        wanted = reference_file.loc[reference_file['SNP'].isin(unique_rsids)]

        # chromosome (as text) -> {"CHR:BP" -> rsid}. CHR is keyed by its
        # string form because the number parsed from the file name is a
        # string, while the reference column may hold integers. setdefault
        # keeps the first rsid seen for a position, matching the previous
        # drop_duplicates(["chr_bp"]) behaviour.
        rsid_by_chr = {}
        for chrom, bp, snp in zip(wanted['CHR'], wanted['BP'], wanted['SNP']):
            positions = rsid_by_chr.setdefault(str(chrom), {})
            positions.setdefault(str(chrom) + ":" + str(bp), snp)

        chrfiles = [
            x for x in sorted(genoytpe_dir.iterdir())
            if x.name.startswith(str(dosage_prefix))
            and x.name.endswith(str(dosage_end_prefix))
        ]
        for chrfile in chrfiles:
            # get chr number: first run of digits in the name before the first "."
            chr_name = os.path.basename(chrfile).split(".")[0]
            chr_number = re.findall(r'\d+', chr_name)
            rsid_by_chr_bp = rsid_by_chr.get(chr_number[0], {})

            with open(str(chrfile), 'rt') as file:
                next(file, None)  # skip the header line
                for line in file:
                    arr = line.strip().split()
                    if not arr:
                        # blank line: nothing to parse
                        continue

                    chr_bp = arr[0]
                    refallele = arr[2]
                    rsid = rsid_by_chr_bp.get(str(chr_bp))
                    if rsid is None:
                        continue

                    # Parse dosages only for rows that are actually yielded.
                    dosage_row = np.array(arr[3:], dtype=np.float64)
                    yield rsid, refallele, dosage_row

    @staticmethod
    def UniqueRsid(beta_file):
        """Return the distinct rsids stored in the ``weights`` table of ``beta_file``."""
        db = WeightsDB(beta_file)
        try:
            return [
                x[0] for x in db.query("SELECT distinct rsid FROM weights")
            ]
        finally:
            # The connection is only needed for this one query.
            db.conn.close()

    @staticmethod
    def get_reference(file,
                      chunkSize=1000000,
                      parition=661):
        """Load a tab-separated reference table in chunks, with a progress bar.

        Args:
            file: Path (or buffer) passed to ``pandas.read_csv``.
            chunkSize: Number of rows requested per chunk.
            parition: Maximum number of chunks to read; also the length of
                the progress bar. Rows beyond ``parition * chunkSize`` are
                not loaded.

        Returns:
            A single DataFrame with all chunks concatenated and a fresh index.
        """
        reader = pd.read_csv(file, sep="\t", iterator=True)
        chunks = []
        try:
            with tqdm(range(parition)) as pbar:
                for _ in pbar:
                    try:
                        chunks.append(reader.get_chunk(chunkSize))
                    except StopIteration:
                        # reader exhausted before reaching ``parition`` chunks
                        break
        finally:
            # Release the underlying file handle even if reading fails.
            reader.close()
        return pd.concat(chunks, ignore_index=True)


# Locations of the dosage files and the naming pattern that selects them.
genotype_dir = Path("/mnt/data/share/yuping/dosage")
dosage_prefix = "chr"
dosage_end_prefix = ".dosage"

# rsids used by the prediction model, and the SNP reference table.
beta_file = "/exeh_4/yuping/Epistasis_Interaction/data/models_all/gtex_v7_Cells_Transformed_fibroblasts_imputed_europeans_tw_0.5_signif.db"
unique_rsids = GenotypeDataset.UniqueRsid(beta_file)
reference_file = GenotypeDataset.get_reference("/mnt/data/share/yuping/snp151_GRCh37.txt")

# Stream every matching dosage row and print the rsid it maps to.
for rsid, allele, dosage_row in GenotypeDataset.get_all_dosages(
        genoytpe_dir=genotype_dir,
        dosage_prefix=dosage_prefix,
        dosage_end_prefix=dosage_end_prefix,
        unique_rsids=unique_rsids,
        reference_file=reference_file):
    print(rsid)

+ <span style="color:orange">REFACTORED</span> Decomposing code for testing - `get_all_dosages`

In [None]:
# Step-by-step version of GenotypeDataset.get_all_dosages for interactive testing.
# NOTE: relies on `reference_file` having been loaded by an earlier cell.
beta_file = "/mnt/data/share/yuping/data/models_all/gtex_v7_Cells_Transformed_fibroblasts_imputed_europeans_tw_0.5_signif.db"
unqiue_rsids = GenotypeDataset.UniqueRsid(beta_file)
genotype_dir = Path("/mnt/data/share/yuping/dosage")
dosage_prefix = "chr"
dosage_end_prefix = ".dosage"

# The rsid filter does not depend on the chromosome, so apply it once
# instead of once per dosage file.
ref_subset = reference_file.loc[reference_file['SNP'].isin(unqiue_rsids)]

for chrfile in [
        x for x in sorted(genotype_dir.iterdir())
        if x.name.startswith(str(dosage_prefix))
        and x.name.endswith(str(dosage_end_prefix))
]:

    # first run of digits in the file name before the first "."
    chr_name = os.path.basename(chrfile).split(".")[0]
    chr_number = re.findall(r'\d+', chr_name)

    # Compare CHR as text: chr_number[0] is a string, while the column may
    # hold integers. Copy so the column assignment below acts on an
    # independent frame rather than on a slice of the reference table.
    ref_file = ref_subset.loc[
        ref_subset['CHR'].astype(str) == chr_number[0]].copy()

    # Built with zip rather than a row-wise apply, which does not return a
    # column when the frame is empty.
    ref_file['chr_bp'] = [
        str(chrom) + ":" + str(bp)
        for chrom, bp in zip(ref_file['CHR'], ref_file['BP'])
    ]
    ref_file.drop_duplicates(["chr_bp"], inplace=True)

    # One dict lookup per dosage line instead of scanning the frame each time.
    rsid_by_chr_bp = dict(zip(ref_file['chr_bp'], ref_file['SNP']))

    print(chr_number)

    with open(str(chrfile), "rt") as file:
        next(file, None)  # skip the header line
        for line in file:
            arr = line.strip().split()
            if not arr:
                # blank line: nothing to parse
                continue

            chr_bp = arr[0]
            refallele = arr[2]

            rsid = rsid_by_chr_bp.get(str(chr_bp))
            if rsid is None:
                continue

            # Parse dosages only for rows that map to a wanted rsid.
            dosage_row = np.array(arr[3:], dtype=np.float64)

+ <span style="color:orange">REFACTORED</span> Decomposing code for testing - `get_reference`

In [None]:
def get_reference(file, chunkSize=1000000, parition=661):
    """Load a tab-separated reference table in chunks, with a progress bar.

    Args:
        file: Path (or buffer) passed to ``pandas.read_csv``.
        chunkSize: Number of rows requested per chunk.
        parition: Maximum number of chunks to read; also the length of the
            progress bar. Rows beyond ``parition * chunkSize`` are not loaded.

    Returns:
        A single DataFrame with all chunks concatenated and a fresh index.
    """
    reader = pd.read_csv(file, sep="\t", iterator=True)
    chunks = []
    try:
        with tqdm(range(parition)) as pbar:
            for _ in pbar:
                try:
                    chunks.append(reader.get_chunk(chunkSize))
                except StopIteration:
                    # reader exhausted before reaching ``parition`` chunks
                    break
    finally:
        # Release the underlying file handle even if reading fails.
        reader.close()

    return pd.concat(chunks, ignore_index=True)

# Load the reference table with the default chunkSize and parition settings.
file = get_reference("/mnt/data/share/yuping/snp151_GRCh37.txt")