In [1]:
import pandas as pd
import gzip

def read_tsv(tsv_file):
    """Load a gzip-compressed, tab-separated file into a DataFrame.

    Parameters
    ----------
    tsv_file : str or path-like
        Location of the gzip-compressed TSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed table; the first line of the file supplies the column names.
    """
    parse_options = {"sep": "\t", "compression": "gzip"}
    return pd.read_csv(tsv_file, **parse_options)

def read_vcf(vcf_file):
    """Read the data records of a gzip-compressed VCF file into a DataFrame.

    Lines starting with '#' (meta-information and the column header) are
    skipped, so the resulting columns are labelled with integers 0..N-1 in
    file order. Every field is kept as a string.

    Parameters
    ----------
    vcf_file : str or path-like
        Location of the gzip-compressed VCF file.

    Returns
    -------
    pandas.DataFrame
        One row per data record, one column per tab-separated field.
    """
    with gzip.open(vcf_file, 'rt') as f:  # 'rt' for reading text from a gzip file
        rows = [
            # Drop the line terminator before splitting; otherwise the last
            # field of every record carries a trailing newline.
            line.rstrip('\r\n').split('\t')
            for line in f
            # Skip header/meta lines and blank lines (e.g. a trailing empty line).
            if not line.startswith('#') and line.strip()
        ]
    return pd.DataFrame(rows)

def compare_snps(tsv_df, vcf_df):
    """Return the VCF rows whose variant also appears in the TSV 'ID' column.

    The TSV 'ID' values have the form 'chrom:pos:ref:alt' (e.g. '10:49230:C:T'),
    whereas the VCF frame keeps those parts in separate, integer-labelled
    columns (0 = chromosome, 1 = position, 3 = reference allele,
    4 = alternate allele) and prefixes the chromosome with 'chr'. Comparing
    VCF column 0 directly against 'ID' can therefore never match, so an
    equivalent key is built from the VCF columns first.

    NOTE(review): a record whose ALT field lists several alleles ('A,G') yields
    a key like '1:100:C:A,G' and will not match a single-allele ID — confirm
    whether such records need to be split.

    Parameters
    ----------
    tsv_df : pandas.DataFrame
        Table with an 'ID' column holding 'chrom:pos:ref:alt' strings.
    vcf_df : pandas.DataFrame
        VCF records with integer column labels, as produced by ``read_vcf``.

    Returns
    -------
    pandas.DataFrame
        The subset of ``vcf_df`` rows (original columns and index) whose
        variant key is present in ``tsv_df['ID']``.
    """
    # A VCF without data records yields a frame with no columns; nothing to match.
    if vcf_df.empty:
        return vcf_df.copy()

    # Normalise the chromosome name to the TSV convention (no 'chr' prefix).
    chrom = vcf_df[0].astype(str).str.replace(r'^chr', '', regex=True)
    # astype(str) keeps the concatenation valid even if positions were parsed as ints.
    pos = vcf_df[1].astype(str)
    ref = vcf_df[3].astype(str)
    alt = vcf_df[4].astype(str)
    snp_keys = chrom + ':' + pos + ':' + ref + ':' + alt

    matched_snps = vcf_df[snp_keys.isin(tsv_df['ID'])]
    return matched_snps

# Load the TSV table and the VCF records from disk
tsv_df = read_tsv(
    '/scratch/tweber/DATA/MC_DATA/DEMULTIPLEXING_POOLS/BCFTOOLS_CONCAT_TAB/LanexHGSVCpool2NEW/merge.txt.gz'
)
vcf_df = read_vcf(
    '/scratch/tweber/DATA/MC_DATA/DEMULTIPLEXING_POOLS/GENOTYPING_OTF/LanexHGSVCpool2NEW/LanexHGSVCpool2NEWiTRU1A68.vcf.gz'
)

# Show both inputs, one after the other, for a quick visual check
print(tsv_df, vcf_df, sep="\n")

# Select the VCF rows that compare_snps pairs with the TSV table, then show them
matched_snps = compare_snps(tsv_df, vcf_df)
print(matched_snps)


                       ID   AC        AF   SAMPLE
0            10:49230:C:T   21  0.003279  HG00238
1            10:49248:G:A   17  0.002655  HG00238
2            10:49257:A:G   18  0.002811  HG00238
3            10:49264:G:C   17  0.002655  HG00238
4            10:49269:A:G   17  0.002655  HG00238
...                   ...  ...       ...      ...
16837406  X:156021673:C:T    7  0.001093  NA21126
16837407  X:156021873:C:T  296  0.046221  NA21126
16837408  X:156023995:G:A    6  0.000937  NA21126
16837409  X:156028529:C:T   80  0.012492  NA21126
16837410  X:156029799:A:G   16  0.002498  NA21126

[16837411 rows x 4 columns]
                     0        1  2  3  4            5  6  \
0                 chr1   270684  .  C  G      52.2575  .   
1                 chr1   791101  .  T  G      41.5715  .   
2                 chr1   803830  .  A  G     0.906968  .   
3                 chr1  1037956  .  A  G      56.3287  .   
4                 chr1  1277415  .  C  A      52.5386  .   
...        