In [4]:
!pip install -q cyvcf2 pandas google-cloud-storage

import pandas as pd
from cyvcf2 import VCF
from pathlib import Path
from google.cloud import storage

# --- Paths ---
# Source VCF, and the CSV generated from it (written to the same directory).
vcf_path = Path("clinvar_data/vcf_GRCh38/clinvar_20250928.vcf.gz")
local_csv = vcf_path.with_name("clinvar_20250928.csv")

# --- GCS settings ---
# The upload target is gs://<BUCKET_NAME>/<GCS_BLOB>, accessed under PROJECT.
PROJECT = "instr-cs795-fall25-hqin-1"
BUCKET_NAME = "instr-cs795-fall25-hqin-1-tstilwel"
GCS_BLOB = "clinvar/clinvar_20250928.csv"  # object path inside the bucket

# --- INFO fields to keep ---
# Each key is read from a record's INFO and stored under its lowercase name.
INFO_FIELDS = [
    "CLNSIG",
    "CLNREVSTAT",
    "CLNDN",
    "GENEINFO",
    "MC",
]

def explode_variant(rec, info_fields=None):
    """Flatten one VCF record into a list of row dicts, one per ALT allele.

    Args:
        rec: A variant record exposing ``CHROM``, ``POS``, ``ID``, ``REF``,
            ``QUAL``, ``FILTER``, ``ALT`` and a mapping-like ``INFO`` with a
            ``get(key, default)`` method (e.g. a cyvcf2 ``Variant``).
        info_fields: INFO keys to copy into each row. Defaults to the
            module-level ``INFO_FIELDS`` when omitted.

    Returns:
        A list of dicts sharing the same site-level columns (``chrom``,
        ``pos``, ``id``, ``ref``, ``qual``, ``filter`` and one lowercase
        column per INFO key, ``None`` when the key is absent), each with its
        own ``alt`` value. A record without ALT alleles yields a single row
        whose ``alt`` is the empty string.
    """
    fields = INFO_FIELDS if info_fields is None else info_fields

    # cyvcf2 documents FILTER as one already-joined string (None for
    # PASS / "."). Joining a string with ";" would interleave semicolons
    # between its characters ("q10" -> "q;1;0"), so only join real sequences.
    raw_filter = rec.FILTER
    if isinstance(raw_filter, bytes):
        raw_filter = raw_filter.decode("utf-8")
    if not raw_filter:
        filter_text = ""
    elif isinstance(raw_filter, str):
        filter_text = raw_filter
    else:
        filter_text = ";".join(raw_filter)

    base = {
        "chrom": rec.CHROM,
        "pos": rec.POS,
        "id": rec.ID,
        "ref": rec.REF,
        "qual": rec.QUAL,
        "filter": filter_text,
    }
    for name in fields:
        base[name.lower()] = rec.INFO.get(name, None)

    # "alt" is added last so the column order of the resulting rows is
    # the site-level columns, then the INFO columns, then alt.
    return [{**base, "alt": alt} for alt in (rec.ALT or [""])]

# --- Convert VCF → CSV ---
# Stream the VCF and flush rows to disk in chunks of `chunk_size`, so the
# full set of rows is never held in memory at once.
vcf = VCF(str(vcf_path))
buffer, header_written = [], False
chunk_size = 200_000

try:
    # newline="" hands line-ending control to the CSV writer; pandas asks for
    # this when given an open text handle (avoids doubled line endings on
    # platforms that translate "\n").
    with local_csv.open("w", encoding="utf-8", newline="") as f:
        for rec in vcf:
            buffer.extend(explode_variant(rec))
            if len(buffer) >= chunk_size:
                df = pd.DataFrame(buffer); buffer.clear()
                # Only the first chunk carries the column header.
                df.to_csv(f, index=False, header=not header_written)
                header_written = True
        # Flush whatever is left after the last full chunk.
        if buffer:
            df = pd.DataFrame(buffer)
            df.to_csv(f, index=False, header=not header_written)
finally:
    # Release the underlying VCF file handle even if conversion fails.
    vcf.close()

print(f"✅ Finished! CSV written to {local_csv}")

# --- Upload to GCS ---
# Push the finished CSV to gs://<BUCKET_NAME>/<GCS_BLOB> under PROJECT.
client = storage.Client(project=PROJECT)
bucket = client.bucket(BUCKET_NAME)
blob = bucket.blob(GCS_BLOB)
# upload_from_filename documents its argument as a str path, so convert the
# Path explicitly (same convention as the VCF(str(vcf_path)) call).
blob.upload_from_filename(str(local_csv))

print(f"✅ Uploaded to gs://{BUCKET_NAME}/{GCS_BLOB}")

✅ Finished! CSV written to clinvar_data/vcf_GRCh38/clinvar_20250928.csv
✅ Uploaded to gs://instr-cs795-fall25-hqin-1-tstilwel/clinvar/clinvar_20250928.csv
