In [9]:
import os
import pandas as pd
import numpy as np
import subprocess
import urllib.request
import glob
import gzip

In [6]:
# Define the directory containing your files
data_dir = "../data/LDREF/"

# Find all bed files in the directory
bed_files = sorted(glob.glob(os.path.join(data_dir, "1000G.EUR.*.bed")))

# Initialize counters
total_individuals = 0
total_snps = 0
individual_count_set = set()  # To ensure consistency across chromosomes

for bed_file in bed_files:
    base_name = bed_file.replace(".bed", "")
    
    # Define corresponding .fam and .bim file paths
    fam_file = base_name + ".fam"
    bim_file = base_name + ".bim"

    # Count individuals from the .fam file (each row represents an individual)
    with open(fam_file, "r") as f:
        num_individuals = sum(1 for _ in f)
    
    # Count SNPs from the .bim file (each row represents a SNP)
    with open(bim_file, "r") as f:
        num_snps = sum(1 for _ in f)
    
    # Store individual counts to verify consistency across chromosomes
    individual_count_set.add(num_individuals)
    
    # Accumulate SNP counts
    total_snps += num_snps

    print(f"{os.path.basename(bed_file)}: {num_individuals} individuals, {num_snps} SNPs")

# Check if the number of individuals is consistent across chromosomes
if len(individual_count_set) > 1:
    print("\nWarning: The number of individuals varies across chromosomes:", individual_count_set)
else:
    total_individuals = individual_count_set.pop()
    print("\nTotal individuals (consistent across chromosomes):", total_individuals)

print("Total SNPs across all chromosomes:", total_snps)

1000G.EUR.1.bed: 489 individuals, 98642 SNPs
1000G.EUR.10.bed: 489 individuals, 64067 SNPs
1000G.EUR.11.bed: 489 individuals, 60977 SNPs
1000G.EUR.12.bed: 489 individuals, 58543 SNPs
1000G.EUR.13.bed: 489 individuals, 45546 SNPs
1000G.EUR.14.bed: 489 individuals, 39484 SNPs
1000G.EUR.15.bed: 489 individuals, 35839 SNPs
1000G.EUR.16.bed: 489 individuals, 36526 SNPs
1000G.EUR.17.bed: 489 individuals, 32218 SNPs
1000G.EUR.18.bed: 489 individuals, 35513 SNPs
1000G.EUR.19.bed: 489 individuals, 22509 SNPs
1000G.EUR.2.bed: 489 individuals, 99735 SNPs
1000G.EUR.20.bed: 489 individuals, 31101 SNPs
1000G.EUR.21.bed: 489 individuals, 17040 SNPs
1000G.EUR.22.bed: 489 individuals, 17489 SNPs
1000G.EUR.3.bed: 489 individuals, 83036 SNPs
1000G.EUR.4.bed: 489 individuals, 74924 SNPs
1000G.EUR.5.bed: 489 individuals, 75164 SNPs
1000G.EUR.6.bed: 489 individuals, 75358 SNPs
1000G.EUR.7.bed: 489 individuals, 66171 SNPs
1000G.EUR.8.bed: 489 individuals, 64975 SNPs
1000G.EUR.9.bed: 489 individuals, 55464 SN

In [7]:
# Define the directory containing your files
data_dir = "../data/LDREF_filtered/"

# Find all bed files in the directory
bed_files = sorted(glob.glob(os.path.join(data_dir, "1000G.EUR.*.bed")))

# Initialize counters
total_individuals = 0
total_snps = 0
individual_count_set = set()  # To ensure consistency across chromosomes

for bed_file in bed_files:
    base_name = bed_file.replace(".bed", "")
    
    # Define corresponding .fam and .bim file paths
    fam_file = base_name + ".fam"
    bim_file = base_name + ".bim"

    # Count individuals from the .fam file (each row represents an individual)
    with open(fam_file, "r") as f:
        num_individuals = sum(1 for _ in f)
    
    # Count SNPs from the .bim file (each row represents a SNP)
    with open(bim_file, "r") as f:
        num_snps = sum(1 for _ in f)
    
    # Store individual counts to verify consistency across chromosomes
    individual_count_set.add(num_individuals)
    
    # Accumulate SNP counts
    total_snps += num_snps

    print(f"{os.path.basename(bed_file)}: {num_individuals} individuals, {num_snps} SNPs")

# Check if the number of individuals is consistent across chromosomes
if len(individual_count_set) > 1:
    print("\nWarning: The number of individuals varies across chromosomes:", individual_count_set)
else:
    total_individuals = individual_count_set.pop()
    print("\nTotal individuals (consistent across chromosomes):", total_individuals)

print("Total SNPs across all chromosomes:", total_snps)

1000G.EUR.1.bed: 343 individuals, 98642 SNPs
1000G.EUR.10.bed: 343 individuals, 64067 SNPs
1000G.EUR.11.bed: 343 individuals, 60977 SNPs
1000G.EUR.12.bed: 343 individuals, 58543 SNPs
1000G.EUR.13.bed: 343 individuals, 45546 SNPs
1000G.EUR.14.bed: 343 individuals, 39484 SNPs
1000G.EUR.15.bed: 343 individuals, 35839 SNPs
1000G.EUR.16.bed: 343 individuals, 36526 SNPs
1000G.EUR.17.bed: 343 individuals, 32218 SNPs
1000G.EUR.18.bed: 343 individuals, 35513 SNPs
1000G.EUR.19.bed: 343 individuals, 22509 SNPs
1000G.EUR.2.bed: 343 individuals, 99735 SNPs
1000G.EUR.20.bed: 343 individuals, 31101 SNPs
1000G.EUR.21.bed: 343 individuals, 17040 SNPs
1000G.EUR.22.bed: 343 individuals, 17489 SNPs
1000G.EUR.3.bed: 343 individuals, 83036 SNPs
1000G.EUR.4.bed: 343 individuals, 74924 SNPs
1000G.EUR.5.bed: 343 individuals, 75164 SNPs
1000G.EUR.6.bed: 343 individuals, 75358 SNPs
1000G.EUR.7.bed: 343 individuals, 66171 SNPs
1000G.EUR.8.bed: 343 individuals, 64975 SNPs
1000G.EUR.9.bed: 343 individuals, 55464 SN

In [8]:
# Define the directory containing your files
data_dir = "../data/LDREF_pruned/"

# Find all bed files in the directory
bed_files = sorted(glob.glob(os.path.join(data_dir, "1000G.EUR.*.bed")))

# Initialize counters
total_individuals = 0
total_snps = 0
individual_count_set = set()  # To ensure consistency across chromosomes

for bed_file in bed_files:
    base_name = bed_file.replace(".bed", "")
    
    # Define corresponding .fam and .bim file paths
    fam_file = base_name + ".fam"
    bim_file = base_name + ".bim"

    # Count individuals from the .fam file (each row represents an individual)
    with open(fam_file, "r") as f:
        num_individuals = sum(1 for _ in f)
    
    # Count SNPs from the .bim file (each row represents a SNP)
    with open(bim_file, "r") as f:
        num_snps = sum(1 for _ in f)
    
    # Store individual counts to verify consistency across chromosomes
    individual_count_set.add(num_individuals)
    
    # Accumulate SNP counts
    total_snps += num_snps

    print(f"{os.path.basename(bed_file)}: {num_individuals} individuals, {num_snps} SNPs")

# Check if the number of individuals is consistent across chromosomes
if len(individual_count_set) > 1:
    print("\nWarning: The number of individuals varies across chromosomes:", individual_count_set)
else:
    total_individuals = individual_count_set.pop()
    print("\nTotal individuals (consistent across chromosomes):", total_individuals)

print("Total SNPs across all chromosomes:", total_snps)

1000G.EUR.1.bed: 343 individuals, 11941 SNPs
1000G.EUR.10.bed: 343 individuals, 7609 SNPs
1000G.EUR.11.bed: 343 individuals, 6965 SNPs
1000G.EUR.12.bed: 343 individuals, 7193 SNPs
1000G.EUR.13.bed: 343 individuals, 5501 SNPs
1000G.EUR.14.bed: 343 individuals, 4904 SNPs
1000G.EUR.15.bed: 343 individuals, 4697 SNPs
1000G.EUR.16.bed: 343 individuals, 5095 SNPs
1000G.EUR.17.bed: 343 individuals, 4756 SNPs
1000G.EUR.18.bed: 343 individuals, 4654 SNPs
1000G.EUR.19.bed: 343 individuals, 3763 SNPs
1000G.EUR.2.bed: 343 individuals, 11154 SNPs
1000G.EUR.20.bed: 343 individuals, 4210 SNPs
1000G.EUR.21.bed: 343 individuals, 2295 SNPs
1000G.EUR.22.bed: 343 individuals, 2647 SNPs
1000G.EUR.3.bed: 343 individuals, 9594 SNPs
1000G.EUR.4.bed: 343 individuals, 8707 SNPs
1000G.EUR.5.bed: 343 individuals, 8954 SNPs
1000G.EUR.6.bed: 343 individuals, 8823 SNPs
1000G.EUR.7.bed: 343 individuals, 7802 SNPs
1000G.EUR.8.bed: 343 individuals, 7212 SNPs
1000G.EUR.9.bed: 343 individuals, 6859 SNPs

Total individual

In [10]:
# Paths to the data
ldref_dir = "../data/LDREF_pruned/"
expression_file = "../data/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt.gz"

# Step 1: Extract individual IDs from the .fam files
fam_files = sorted(glob.glob(os.path.join(ldref_dir, "1000G.EUR.*.fam")))

genotype_individuals = set()
for fam_file in fam_files:
    with open(fam_file, "r") as f:
        for line in f:
            parts = line.strip().split()
            genotype_individuals.add(parts[1])  # Second column is the individual ID

print(f"Total unique individuals in genotype data: {len(genotype_individuals)}")

# Step 2: Extract individual IDs from the gene expression file
with gzip.open(expression_file, "rt") as f:
    header = f.readline().strip().split("\t")  # Read first line (column headers)

expression_individuals = set(header[1:])  # Skip first column, which is "TargetID"

print(f"Total individuals in gene expression data: {len(expression_individuals)}")

# Step 3: Find the intersection of individuals in both datasets
matching_individuals = genotype_individuals.intersection(expression_individuals)

print(f"Number of individuals found in both datasets: {len(matching_individuals)}")

# Step 4: Count number of genes in the gene expression file
with gzip.open(expression_file, "rt") as f:
    num_genes = sum(1 for _ in f) - 1  # Subtract 1 for the header

print(f"Total number of genes with expression data: {num_genes}")

Total unique individuals in genotype data: 343
Total individuals in gene expression data: 465
Number of individuals found in both datasets: 343
Total number of genes with expression data: 23722
