In [5]:
!pip install pysam

import pysam
import requests
import gzip
import os

# --- Task 1: Download the ClinVar VCF Dataset ---

clinvar_vcf_url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
clinvar_vcf_local_path = "clinvar.vcf.gz"

# Skip the download when a previous run already produced the file.
if not os.path.exists(clinvar_vcf_local_path):
    print(f"Downloading ClinVar VCF from: {clinvar_vcf_url}")
    # Stream into a temporary ".part" file and rename it only after the last
    # byte has been written. Writing straight to the final path would leave a
    # truncated file behind on an interrupted transfer, and the existence
    # check above would then accept that broken file on every later run.
    clinvar_vcf_partial_path = clinvar_vcf_local_path + ".part"
    try:
        # The timeout keeps a stalled connection from hanging indefinitely;
        # the context manager releases the streamed connection when done.
        with requests.get(clinvar_vcf_url, stream=True, timeout=60) as response:
            response.raise_for_status()  # Raise an exception for bad status codes

            with open(clinvar_vcf_partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Publish the completed download under its final name.
        os.replace(clinvar_vcf_partial_path, clinvar_vcf_local_path)
        print(f"ClinVar VCF downloaded to: {clinvar_vcf_local_path}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading ClinVar VCF: {e}")
    finally:
        # After a successful rename the partial file no longer exists; on any
        # failure this removes the incomplete leftover.
        if os.path.exists(clinvar_vcf_partial_path):
            os.remove(clinvar_vcf_partial_path)
else:
    print(f"Using existing ClinVar VCF file: {clinvar_vcf_local_path}")


# --- Task 2: Download the GRCh38 Reference Genome (Chromosome 22 FASTA) ---

# Ensembl release-109 copy of chromosome 22 (gzip-compressed FASTA)
grch38_chr22_url = "https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.22.fa.gz"
grch38_chr22_local_path = "chr22.fa.gz"
grch38_chr22_uncompressed_path = "chr22.fa"

# Only download when the uncompressed FASTA is not already present
if not os.path.exists(grch38_chr22_uncompressed_path):
    print(f"Downloading GRCh38 Chromosome 22 FASTA from: {grch38_chr22_url}")
    # Both the download and the decompression write to ".part" files that are
    # renamed on completion. An interrupted run therefore never leaves a
    # truncated chr22.fa that the existence check above would accept later.
    grch38_chr22_partial_gz_path = grch38_chr22_local_path + ".part"
    grch38_chr22_partial_fa_path = grch38_chr22_uncompressed_path + ".part"
    try:
        # The timeout keeps a stalled connection from hanging indefinitely;
        # the context manager releases the streamed connection when done.
        with requests.get(grch38_chr22_url, stream=True, timeout=60) as response:
            response.raise_for_status()

            with open(grch38_chr22_partial_gz_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(grch38_chr22_partial_gz_path, grch38_chr22_local_path)
        print(f"GRCh38 Chromosome 22 FASTA downloaded to: {grch38_chr22_local_path}")

        # Decompress the FASTA file in fixed-size chunks
        with gzip.open(grch38_chr22_local_path, 'rb') as f_in:
            with open(grch38_chr22_partial_fa_path, 'wb') as f_out:
                while True:
                    block = f_in.read(1024 * 1024)
                    if not block:
                        break
                    f_out.write(block)
        os.replace(grch38_chr22_partial_fa_path, grch38_chr22_uncompressed_path)
        print(f"GRCh38 Chromosome 22 FASTA decompressed to: {grch38_chr22_uncompressed_path}")

        # Create the FASTA index file (.fai), which is required by pysam.FastaFile
        print(f"Indexing FASTA file: {grch38_chr22_uncompressed_path}")
        pysam.faidx(grch38_chr22_uncompressed_path)
        print("FASTA index created.")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading reference FASTA: {e}")
    except Exception as e:
        # Decompression and indexing failures are reported, not raised, so the
        # rest of the notebook cell still runs.
        print(f"An error occurred during FASTA processing: {e}")
    finally:
        # Remove whichever partial file a failed step left behind; after a
        # successful run neither of them exists any more.
        for leftover_path in (grch38_chr22_partial_gz_path, grch38_chr22_partial_fa_path):
            if os.path.exists(leftover_path):
                os.remove(leftover_path)
else:
    # Ensure the index exists if the file already exists
    if not os.path.exists(grch38_chr22_uncompressed_path + '.fai'):
        print(f"FASTA file found, but index is missing. Indexing file: {grch38_chr22_uncompressed_path}")
        pysam.faidx(grch38_chr22_uncompressed_path)
        print("FASTA index created.")
    else:
        print(f"Using existing reference FASTA file and index: {grch38_chr22_uncompressed_path}")


# --- Task 3: Define and Implement a VCF Exploration Function ---

def explore_vcf_summary(vcf_path: str, num_records: int = 10) -> None:
    """
    Print the header of a VCF file and a summary of its first few variants.

    The summary lists CHROM, POS, ID, REF, ALT and the CLNSIG INFO field for
    each displayed record. Missing ID, ALT or CLNSIG values are shown as
    'N/A'. Nothing is returned; all results go to standard output.

    Problems are reported by printing a message rather than by raising: a
    missing file prints an error and returns early, and any exception raised
    while reading the file is caught and printed.

    Args:
      vcf_path (str): Path to the VCF file (can be gzipped).
      num_records (int): The number of records to display from the beginning of the VCF.
    """
    if not os.path.exists(vcf_path):
        print(f"Error: VCF file not found at '{vcf_path}'")
        return

    try:
        # Open the VCF file using pysam.VariantFile
        # pysam can handle gzipped files directly
        with pysam.VariantFile(vcf_path, 'r') as vcf_file:
            print("--- VCF Header (Metadata) ---")
            # The header object contains all metadata
            for record in vcf_file.header.records:
                print(str(record).strip())

            print("\n--- Column Names ---")
            # The last line of the full header string is the column definition
            header_str = str(vcf_file.header).strip()
            column_line = header_str.split('\n')[-1]
            print(column_line)

            print(f"\n--- First {num_records} Variant Records ---")
            # Iterate over the first `num_records` variants
            for i, variant_record in enumerate(vcf_file):
                if i >= num_records:
                    break

                # CLNSIG is normally a tuple of strings. Wrap a scalar value so
                # that joining does not split a plain string into characters.
                clnsig_values = variant_record.info.get('CLNSIG', ('N/A',))
                if not isinstance(clnsig_values, (tuple, list)):
                    clnsig_values = (clnsig_values,)
                clnsig_str = ','.join(str(s) for s in clnsig_values)

                # `alts` is None when the record has no ALT allele ('.').
                # Iterating None would raise and abort the whole listing.
                alts = variant_record.alts
                alts_str = ','.join(str(a) for a in alts) if alts else 'N/A'

                print(
                    f"\nRecord {i+1}:\n"
                    f"  CHROM : {variant_record.chrom}\n"
                    f"  POS   : {variant_record.pos}\n"
                    f"  ID    : {variant_record.id if variant_record.id else 'N/A'}\n"
                    f"  REF   : {variant_record.ref}\n"
                    f"  ALT   : {alts_str}\n"
                    f"  CLNSIG: {clnsig_str}"
                )

    except Exception as e:
        # Keep the notebook cell running: report the problem instead of raising.
        print(f"An error occurred while processing the VCF file: {e}")


# --- Show the VCF header and the first ten ClinVar records ---
print("\n--- Exploring VCF Summary ---")
explore_vcf_summary(clinvar_vcf_local_path, num_records=10)


# --- Read a short stretch of the reference to confirm the FASTA is usable ---
print("\n--- Accessing Reference Genome ---")
try:
    fasta_file_path = grch38_chr22_uncompressed_path
    # pysam.FastaFile needs the .fai index next to the FASTA, so bail out
    # early when it is absent.
    if not os.path.exists(fasta_file_path + ".fai"):
        print("FASTA index file not found. Skipping reference access.")
    else:
        # The region uses the contig name '22' rather than 'chr22'.
        region = "22:16050000-16050050"
        with pysam.FastaFile(fasta_file_path) as fasta_file:
            sequence = fasta_file.fetch(region=region)
            print("Successfully read reference sequence.")
            print(f"Sequence from {region}: {sequence}")
except Exception as e:
    # Any failure opening or reading the FASTA is reported, not raised.
    print(f"An error occurred while reading the FASTA file: {e}")

Using existing ClinVar VCF file: clinvar.vcf.gz
Downloading GRCh38 Chromosome 22 FASTA from: https://ftp.ensembl.org/pub/release-109/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.chromosome.22.fa.gz
GRCh38 Chromosome 22 FASTA downloaded to: chr22.fa.gz
GRCh38 Chromosome 22 FASTA decompressed to: chr22.fa
Indexing FASTA file: chr22.fa
FASTA index created.

--- Exploring VCF Summary ---
--- VCF Header (Metadata) ---
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##fileDate=2025-09-28
##source=ClinVar
##reference=GRCh38
##ID=<Description="ClinVar Variation ID">
##INFO=<ID=AF_ESP,Number=1,Type=Float,Description="allele frequencies from GO-ESP">
##INFO=<ID=AF_EXAC,Number=1,Type=Float,Description="allele frequencies from ExAC">
##INFO=<ID=AF_TGP,Number=1,Type=Float,Description="allele frequencies from TGP">
##INFO=<ID=ALLELEID,Number=1,Type=Integer,Description="the ClinVar Allele ID">
##INFO=<ID=CLNDN,Number=.,Type=String,Description="ClinVar's preferred disease na