# Curated ALS dataset for LD SNP prediction

## Load Libraries

In [1]:
# Hugging Face Datasets library
from datasets import load_dataset, Dataset

# Scikit-learn for evaluation and data splitting
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Biopython for sequence processing
from Bio import SeqIO

# PyBedTools for genomic interval manipulation
from pybedtools import BedTool

# Data analysis and scientific computation
import pandas as pd
import numpy as np
import scipy.stats as stats

# PyTorch for deep learning
import torch
from torch import nn

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Utilities and external libraries
import os
import pickle
import genopyc
import utility_functions as uf # Custom utility functions for loading models/tokenizers, datasets from the experiments

from concurrent.futures import ThreadPoolExecutor
import time

2025-02-21 16:27:30.468881: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-21 16:27:30.468944: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-21 16:27:30.652544: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-21 16:27:31.088758: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load raw dataset

In [4]:
## list every LD table (.tsv) in the dataset directory
path_datasets = "/data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/"
# os.listdir returns entries in arbitrary order; sort them so that files[0]
# is the same file on every run and on every machine.
files = sorted(file for file in os.listdir(path_datasets) if file.endswith('.tsv'))
if not files:
    # Fail with an explicit message instead of an IndexError on files[0].
    raise FileNotFoundError(f"No .tsv files found in {path_datasets}")

## read the first tsv file as a worked example
path_dataset = os.path.join(path_datasets, files[0])
df = pd.read_csv(path_dataset, sep='\t')
df

Unnamed: 0,Variation ID,Location,Distance from rs10122902,Consequence Type,r2,D',Phenotypes,GWAS_Trait_Category,Source
0,rs10122902,9:27556782-27556782,0,synonymous_variant,undefined,undefined,Amyotrophic lateral sclerosis;Amyotrophic late...,Other trait,Ensembl
1,rs72727506,9:27541834-27541834,-14948,intergenic_variant,0.916667,1.000000,,No trait reported,Ensembl
2,rs4879564,9:27557835-27557835,1053,intron_variant,0.956790,1.000000,,No trait reported,Ensembl
3,rs4878487,9:27552489-27552489,-4293,intron_variant,0.956790,1.000000,,No trait reported,Ensembl
4,rs36038669,9:27536735-27536735,-20047,intergenic_variant,0.844444,1.000000,,No trait reported,Ensembl
5,rs67483205,9:27541914-27541914,-14868,intergenic_variant,0.916667,1.000000,,No trait reported,Ensembl
6,rs3739526,9:27547315-27547315,-9467,3_prime_UTR_variant,1.000000,1.000000,FRONTOTEMPORAL DEMENTIA AND/OR AMYOTROPHIC LAT...,Neurological disorder,Ensembl


## Get SNP info: chromosome, position, reference allele, alternative allele, SNP ID

In [5]:
def _parse_variant_record(snp_id, snp_info):
    """Convert one variant record into a ``[chromosome, position, reference, alternative, snp_id]`` row.

    Only the first entry of ``snp_info['mappings']`` is used.

    Returns:
        The row as a list, or ``None`` when the record must be skipped:
        no mappings, empty allele string or location, no alternative allele
        different from the reference, or a ``'-'`` placeholder allele.

    Raises:
        AttributeError, IndexError, TypeError, ValueError: when the record
        does not follow the ``'REF/ALT'`` / ``'chrom:start-end'`` layout the
        parsing below relies on.
    """
    mappings = snp_info.get('mappings', [])
    if not mappings:
        return None

    first_mapping = mappings[0]
    allele_string = first_mapping.get('allele_string', '')
    location = first_mapping.get('location', '')
    if not allele_string or not location:
        return None

    # Parse alleles ("REF/ALT[/ALT...]") and position ("chrom:start-end").
    alleles = allele_string.split('/')
    pos = location.split(':')

    chromosome = f"chr{pos[0]}"
    position = int(pos[1].split('-')[0])
    reference = alleles[0]

    # Prefer the reported minor allele. If it is missing, or if it is the
    # reference allele itself (which would give a row with reference ==
    # alternative), fall back to the first listed allele that differs from
    # the reference.
    alternative = snp_info.get('minor_allele', None)
    if not alternative or alternative == reference:
        alternative = next((allele for allele in alleles[1:] if allele != reference), None)
        if alternative is None:
            return None

    # '-' marks an insertion/deletion placeholder; such variants are dropped.
    # NOTE: multi-base alleles are NOT filtered out here.
    if reference == '-' or alternative == '-':
        return None

    return [chromosome, position, reference, alternative, snp_id]


def process_snp_candidate(list_snps):
    """Fetch variant information for ``list_snps`` and return one row per usable variant.

    The variants are looked up with ``genopyc.get_variants_info``; the result is
    iterated as a mapping ``snp_id -> record``.

    Args:
        list_snps: list of variant identifiers passed to ``genopyc.get_variants_info``.

    Returns:
        A list of ``[chromosome, position, reference, alternative, snp_id]`` rows.
        Records that cannot be used are skipped. If the lookup itself fails,
        the error is printed and the rows collected so far (possibly none)
        are returned; the function does not raise.
    """
    local_data = []
    try:
        # Fetch variant information
        info_ld = genopyc.get_variants_info(list_snps, chunked=False, chunksize=200)

        for snp_id, snp_info in info_ld.items():
            try:
                row = _parse_variant_record(snp_id, snp_info)
            except (AttributeError, IndexError, TypeError, ValueError) as e:
                # A single malformed record must not discard the rest of the batch.
                print(f"Skipping {snp_id}: malformed variant record ({e})")
                continue
            if row is not None:
                local_data.append(row)
    except Exception as e:
        # Best-effort: report the failure and return whatever was collected.
        print(f"Error processing {list_snps}: {e}")
    return local_data


## build the curated SNP table for the example dataset loaded above
df_snps = pd.DataFrame(
    data=process_snp_candidate(df['Variation ID'].tolist()),
    columns=[
        'Chromosome',
        'Position',
        'Reference',
        'Alternative',
        'Variation ID',
    ],
)

df_snps

Unnamed: 0,Chromosome,Position,Reference,Alternative,Variation ID
0,chr9,27541914,C,A,rs67483205
1,chr9,27557835,C,T,rs4879564
2,chr9,27536735,A,G,rs36038669
3,chr9,27541834,C,T,rs72727506
4,chr9,27552489,T,C,rs4878487
5,chr9,27547315,G,C,rs3739526
6,chr9,27556782,G,A,rs10122902


## Main: curate every dataset in the folder

In [7]:
## list every LD table (.tsv) in the dataset directory
path_datasets = "/data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/"
# os.listdir returns entries in arbitrary order; sort for a reproducible run order.
files = sorted(file for file in os.listdir(path_datasets) if file.endswith('.tsv'))

## create the output folder "curated" (no error if it already exists)
path_to_save = path_datasets+'curated/'
os.makedirs(path_to_save, exist_ok=True)

## curate each LD table and save it under the same file name
for file in files:
    path_dataset = os.path.join(path_datasets, file)
    df = pd.read_csv(path_dataset, sep='\t')
    df_snps = pd.DataFrame(process_snp_candidate(df['Variation ID'].tolist()), columns=['Chromosome', 'Position', 'Reference', 'Alternative', 'Variation ID'])
    if df_snps.empty:
        # process_snp_candidate reports lookup errors by printing and returning
        # no rows, so make an empty result visible instead of saving it silently.
        print(f"Warning: no SNPs retrieved for {file}; writing a header-only file")
    df_snps.to_csv(os.path.join(path_to_save, file), sep='\t', index=False)
    print(f"{file} saved in {path_to_save}")

rs10122902.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/
rs1031153.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/
rs117761967.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/
rs2453555.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/
rs2814707.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/
rs3849942.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/
rs3849943.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/
total number of chunks: 2
chunk 0 processed
chunk 1 processed
rs73440933.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/
rs774359.tsv saved in /data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/curated/


In [28]:
## load the saved model predictions for one LD table
# NOTE(review): pickle.load can execute arbitrary code; only open files
# produced by this project's own pipeline.
with open(
    '/data/Dcode/gaetano/repos/AI4Genomic/data/ALS/gene_c9/LDs/predictions/trednet_LD_C9_ALS_rs1031153.tsv.pkl',
    mode='rb',
) as f:
    df = pickle.load(f, encoding='latin1')
df

{'predictions_alt': array([[3.9635066e-02],
        [3.5207569e-05],
        [2.1966679e-03],
        [1.4339499e-03],
        [6.6309660e-03],
        [6.4007536e-04],
        [3.6650585e-05]], dtype=float32),
 'predictions_ref': array([[2.0876354e-02],
        [3.4290199e-05],
        [2.3022976e-03],
        [9.1243640e-04],
        [2.2561645e-02],
        [6.6661910e-04],
        [4.7790218e-05]], dtype=float32)}