In [1]:
import pandas as pd
import gzip

In [2]:
from collections import defaultdict
mane_gff = '/common/zhangz2lab/shared/genomes/gff/MANE.GRCh38.v1.1.ensembl_genomic.gff.gz'

def read_mane(fp):
    cds_regions_loaded = defaultdict(list)
    with gzip.open(fp, 'rt') as f:  # Open the GFF3 file
        for line in f:
            if not line.startswith('#'):  # Skip header lines
                parts = line.strip().split('\t')
                if not parts[0] in ['chr%i'%i for i in range(1,23)] + ['chrX', 'chrY']:
                    continue
                if parts[2] == 'CDS':  # Focus on CDS entries
                    attributes = parts[8]
                    gene_id = [attr for attr in attributes.split(';') if 'gene_name' in attr][0].split('=')[1]
                    strand = parts[6]
                    start = parts[3]
                    end = parts[4]
                    reading_frame = parts[7]
                    chrom = parts[0]
                    cds_regions_loaded[gene_id].append((int(start), int(end), int(reading_frame), strand, chrom))
                    
    return cds_regions_loaded

cds_regions_loaded = read_mane(mane_gff)

In [3]:
print(cds_regions_loaded.get("PRDM16"))  # Example usage


[(3069260, 3069296, 0, '+', 'chr1'), (3186125, 3186474, 2, '+', 'chr1'), (3244087, 3244137, 0, '+', 'chr1'), (3385152, 3385286, 0, '+', 'chr1'), (3396491, 3396593, 0, '+', 'chr1'), (3402791, 3402998, 2, '+', 'chr1'), (3404739, 3404886, 1, '+', 'chr1'), (3405495, 3405648, 0, '+', 'chr1'), (3411384, 3412800, 2, '+', 'chr1'), (3414560, 3414647, 1, '+', 'chr1'), (3417828, 3417997, 0, '+', 'chr1'), (3418667, 3418744, 1, '+', 'chr1'), (3425581, 3425750, 1, '+', 'chr1'), (3426051, 3426225, 2, '+', 'chr1'), (3430872, 3431108, 1, '+', 'chr1'), (3431966, 3432140, 1, '+', 'chr1'), (3433677, 3433811, 0, '+', 'chr1')]


In [4]:
def parse_string_to_dict(info_string):
    """Convert a VCF INFO field string to a dictionary."""
    info_dict = {}
    for item in info_string.split(';'):
        if '=' in item:
            key, value = item.split('=', 1)
            info_dict[key] = value
    return info_dict

def parse_clinvar_to_dataframe(filepath):
    with gzip.open(filepath, 'rt') as file:
        data = []  # List to store combined information for each variant
        for line in file:
            if line.startswith('#'):
                continue  # Skip header lines
            fields = line.strip().split('\t')
            info_field = fields[7]  # INFO field is the 8th field in VCF format
            info_dict = parse_string_to_dict(info_field)
            
            # Combine first 7 fields with extracted INFO data
            variant_info = fields[:7] + [
                info_dict.get('CLNDN'),
                info_dict.get('CLNDISDB'),
                info_dict.get('CLNSIG'),
                info_dict.get('GENEINFO'),
                info_dict.get('CLNREVSTAT'),
                info_dict.get('CLNHGVS'),
                info_dict.get('CLNDISDBINCL'),
                info_dict.get('CLNVI'),
                info_dict.get('AF_EXAC'),
                info_dict.get('AF_ESP'),
                info_dict.get('AF_TGP'),
                info_dict.get('MC'),
            ]
            
            data.append(variant_info)
        
        # Convert the accumulated data into a pandas DataFrame
        columns = ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
                   'cln_disease', 'cln_disease_id', 'cln_significance', 'cln_gene',
                   'cln_review', 'cln_hgvs', 'cln_disease_dbs', 'cln_clnvi',
                   'cln_af_exac', 'cln_af_esp', 'cln_af_tgp', 'cln_molecular_consequence']
        df = pd.DataFrame(data, columns=columns)
        
        return df

# Example usage
filepath = '/common/zhangz2lab/fzzhang/knowledge_graphs/wen-chyu/data/variant/clinvar/clinvar.vcf.gz' 
df = parse_clinvar_to_dataframe(filepath)

df = df.dropna(subset=['cln_gene'])
df['gene_name'] = df.apply(lambda row: row['cln_gene'].split(':')[0], axis=1)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,cln_disease,cln_disease_id,cln_significance,cln_gene,cln_review,cln_hgvs,cln_disease_dbs,cln_clnvi,cln_af_exac,cln_af_esp,cln_af_tgp,cln_molecular_consequence,gene_name
0,1,925952,1019397,G,A,.,.,not_provided,MedGen:CN517202,Uncertain_significance,SAMD11:148398,"criteria_provided,_single_submitter",NC_000001.11:g.925952G>A,,,,,,SO:0001583|missense_variant,SAMD11
1,1,925956,1543320,C,T,.,.,not_provided,MedGen:CN517202,Likely_benign,SAMD11:148398,"criteria_provided,_single_submitter",NC_000001.11:g.925956C>T,,,,,,SO:0001819|synonymous_variant,SAMD11
2,1,925969,1648427,C,T,.,.,not_provided,MedGen:CN517202,Likely_benign,SAMD11:148398,"criteria_provided,_single_submitter",NC_000001.11:g.925969C>T,,,,,,SO:0001583|missense_variant,SAMD11
3,1,925976,1362713,T,C,.,.,not_provided,MedGen:CN517202,Uncertain_significance,SAMD11:148398,"criteria_provided,_single_submitter",NC_000001.11:g.925976T>C,,,,,,SO:0001583|missense_variant,SAMD11
4,1,925986,1568423,C,T,.,.,not_provided,MedGen:CN517202,Likely_benign,SAMD11:148398,"criteria_provided,_single_submitter",NC_000001.11:g.925986C>T,,,,,,SO:0001819|synonymous_variant,SAMD11


In [5]:
filtered_df = df[
    (df['cln_disease'].str.contains('cardiomyopathy', case=False, na=False)) & 
     (df['cln_molecular_consequence'].str.contains('missense', case=False, na=False))
]


# Display the filtered DataFrame
filtered_df.head()


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,cln_disease,cln_disease_id,cln_significance,cln_gene,cln_review,cln_hgvs,cln_disease_dbs,cln_clnvi,cln_af_exac,cln_af_esp,cln_af_tgp,cln_molecular_consequence,gene_name
5177,1,3186163,474439,C,T,.,.,Primary_dilated_cardiomyopathy|Left_ventricula...,"EFO:EFO_0000407,Human_Phenotype_Ontology:HP:00...",Uncertain_significance,PRDM16:63976,"criteria_provided,_multiple_submitters,_no_con...",NC_000001.11:g.3186163C>T,,,4e-05,,,SO:0001583|missense_variant,PRDM16
5371,1,3402986,60728,C,T,.,.,Primary_dilated_cardiomyopathy|Left_ventricula...,"EFO:EFO_0000407,Human_Phenotype_Ontology:HP:00...",Uncertain_significance,PRDM16:63976,"criteria_provided,_multiple_submitters,_no_con...",NC_000001.11:g.3402986C>T,,UniProtKB_(protein):Q9HAZ2#VAR_070213|OMIM_All...,3e-05,,,SO:0001583|missense_variant,PRDM16
5451,1,3405572,691817,C,A,.,.,Left_ventricular_noncompaction_cardiomyopathy,"Human_Phenotype_Ontology:HP:0011664,MedGen:C40...",Uncertain_significance,PRDM16:63976,"criteria_provided,_single_submitter",NC_000001.11:g.3405572C>A,,"Klaassen_Lab,Charite_University_Medicine_Berli...",1e-05,,,SO:0001583|missense_variant,PRDM16
5483,1,3411483,977171,G,A,.,.,Primary_dilated_cardiomyopathy|Left_ventricula...,"EFO:EFO_0000407,Human_Phenotype_Ontology:HP:00...",Uncertain_significance,PRDM16:63976,"criteria_provided,_single_submitter",NC_000001.11:g.3411483G>A,,,1e-05,,,SO:0001583|missense_variant,PRDM16
5573,1,3411985,1526272,C,G,.,.,Primary_dilated_cardiomyopathy,"EFO:EFO_0000407,Human_Phenotype_Ontology:HP:00...",Uncertain_significance,PRDM16:63976,no_assertion_criteria_provided,NC_000001.11:g.3411985C>G,,,,,,SO:0001583|missense_variant,PRDM16


In [6]:
new_bed_df = pd.read_csv('output_bed_cm.csv', index_col=0)
# Remove 'chr' prefix from 'chrom' column in new_bed_df
new_bed_df['chrom'] = new_bed_df['chrom'].str.replace('chr', '')
print(new_bed_df)
# Convert 'chrom' columns to strings in both DataFrames if they are not already
filtered_df['CHROM'] = filtered_df['CHROM'].astype(str)
new_bed_df['chrom'] = new_bed_df['chrom'].astype(str)

# Ensure 'POS' in filtered_df and 'end' in new_bed_df are integers
filtered_df['POS'] = filtered_df['POS'].astype(int)
new_bed_df['end'] = new_bed_df['end'].astype(int)

# Attempt the merge again
matches = filtered_df.merge(new_bed_df, left_on=['CHROM', 'POS'], right_on=['chrom', 'end'])
print(matches)
print(matches['REF'])

# Perform the merge to identify rows to keep ("left_only") and use suffixes to handle overlapping column names
# We'll use suffixes that are unlikely to cause conflicts and are easy to filter out
merged_df = pd.merge(filtered_df, matches, on=['CHROM', 'POS'], how='left', indicator=True, suffixes=('', '_drop'))

# Filter to keep only rows from filtered_df that don't have matches in new_bed_df
filtered_df_no_matches = merged_df[merged_df['_merge'] == 'left_only']

# Now, remove columns that were added from 'matches' (those with '_drop' suffix) and the '_merge' column
# This is done by filtering out columns that end with '_drop' and dropping the '_merge' column
filtered_df_no_matches = filtered_df_no_matches.loc[:, ~filtered_df_no_matches.columns.str.endswith('_drop')].drop(columns=['_merge'])


filtered_df = filtered_df_no_matches 
# Now, filtered_df_no_matches should have the original structure without "_x" or "_y" suffixes
print(filtered_df_no_matches.head())


    chrom      start        end         original_position  score
0       X  120441764  120441765  chrX:119575620-119575620      1
1       X  120442598  120442599  chrX:119576454-119576454      1
2       X  101398032  101398033  chrX:100653021-100653021      1
3       X  101398941  101398942  chrX:100653930-100653930      1
4       X  101398894  101398895  chrX:100653883-100653883      1
..    ...        ...        ...                       ...    ...
435     3   38603947   38603948    chr3:38645439-38645439      1
436     3   38581234   38581235    chr3:38622726-38622726      1
437    11   47343050   47343051   chr11:47364602-47364602      1
438     3   38580975   38580976    chr3:38622467-38622467      1
439     3   38613771   38613772    chr3:38655263-38655263      1

[440 rows x 5 columns]
    CHROM        POS      ID REF ALT QUAL FILTER  \
0       1  156130708   48067   A   C    .      .   
1       1  156130708  222692   A   G    .      .   
2       1  156130741   14504   G   A    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['CHROM'] = filtered_df['CHROM'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['POS'] = filtered_df['POS'].astype(int)


In [7]:
# Filter the DataFrame to include only rows with 'cln_significance' containing "benign" or "pathogenic"
filtered_df = filtered_df[filtered_df['cln_significance'].str.contains('benign|pathogenic', case=False)]

# Filter out conflicting annotations
filtered_df = filtered_df[~filtered_df['cln_significance'].str.contains('conflicting', case=False)]


# Count rows where 'cln_significance' contains "benign"
benign_count = filtered_df[filtered_df['cln_significance'].str.contains('benign', case=False)].shape[0]

# Assuming you want to count rows where 'cln_significance' contains "pathogenic" (regardless of 'cln_molecular_consequence')
pathogenic_count = filtered_df[filtered_df['cln_significance'].str.contains('pathogenic', case=False)].shape[0]

print(f"Number of 'benign' rows: {benign_count}")
print(f"Number of 'pathogenic' rows: {pathogenic_count}")


Number of 'benign' rows: 1150
Number of 'pathogenic' rows: 530


In [8]:
# Get unique CHROM values from the DataFrame
unique_chrom_values = filtered_df['CHROM'].unique()

# Print all unique CHROM values
print("Unique CHROM values in the DataFrame:")
for chrom in unique_chrom_values:
    print(chrom)

# Count the number of unique CHROM values
number_of_unique_chrom_values = len(unique_chrom_values)
print(f"Total number of unique CHROM values: {number_of_unique_chrom_values}")
filtered_df = filtered_df[~filtered_df['CHROM'].isin(['MT', 'NW_009646201.1'])]

Unique CHROM values in the DataFrame:
1
2
3
4
5
6
7
9
10
11
12
14
15
16
17
18
19
20
21
22
X
Total number of unique CHROM values: 21


In [9]:
from Bio.Seq import Seq

# Function to translate DNA sequence to protein sequence
def translate_dna_to_protein(dna_sequence):
    dna_seq = Seq(dna_sequence)
    protein_seq = dna_seq.translate(to_stop=True)  # to_stop=True will stop at the first stop codon
    return str(protein_seq)

In [10]:
# Get proteins for each gene
from pyfaidx import Fasta
from Bio.Seq import Seq
from tqdm import tqdm

reference_genome = Fasta('/common/zhangz2lab/zhanh/Clinvar/hg38.fa')

def reverse_complement(seq):
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return ''.join(complement.get(base, base) for base in reversed(seq))


codon_table = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}

def translate_dna(sequence):
    protein = ""
    for i in range(0, len(sequence), 3):
        codon = sequence[i:i+3]
        if len(codon) == 3:
            protein += codon_table.get(codon, 'X')  # 'X' for unknown codons
    return protein


In [11]:
gene = 'CASQ2'
dna = ''

for cds_start, cds_end, _, strand, chromosome in cds_regions_loaded[gene]:
    if strand == '+':
        dna += reference_genome[chromosome][cds_start-1:cds_end].seq
    else:
        dna += reference_genome[chromosome][cds_start-1:cds_end].reverse.complement.seq

# why different? Biopython's translation matches UCSC genome browser
strand, translate_dna(dna), translate_dna_to_protein(dna)

('-',
 'MKRTHLFIVGIYFLSSCRAEEGLNFPTYDGKDRVVSLSEKNFKQVLKKYDLLCLYYHEPVSSDKVTQKQFQLKEIVLELVAQVLEHKAIGFVMVDAKKEAKLAKKLGFDEEGSLYILKGDRTIEFDGEFAADVLVEFLLDLIEDPVEIISSKLEVQAFERIEDYIKLIGFFKSEDSEYYKAFEEAAEHFQPYIKFFATFDKGVAKKLSLKMNEVDFYEPFMDEPIAIPNKPYTEEELVEFVKEHQRPTLRRLRPEEMFETWEDDLNGIHIVAFAEKSDPDGYEFLEILKQVARDNTDNPDLSILWIDPDDFPLLVAYWEKTFKIDLFRPQIGVVNVTDADSVWMEIPDDDDLPTAEELEDWIEDVLSGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX_',
 'MKRTHLFIVGIYFLSSCRAEEGLNFPTYDGKDRVVSLSEKNFKQVLKKYDLLCLYYHEPVSSDKVTQKQFQLKEIVLELVAQVLEHKAIGFVMVDAKKEAKLAKKLGFDEEGSLYILKGDRTIEFDGEFAADVLVEFLLDLIEDPVEIISSKLEVQAFERIEDYIKLIGFFKSEDSEYYKAFEEAAEHFQPYIKFFATFDKGVAKKLSLKMNEVDFYEPFMDEPIAIPNKPYTEEELVEFVKEHQRPTLRRLRPEEMFETWEDDLNGIHIVAFAEKSDPDGYEFLEILKQVARDNTDNPDLSILWIDPDDFPLLVAYWEKTFKIDLFRPQIGVVNVTDADSVWMEIPDDDDLPTAEELEDWIEDVLSGKINTEDDDEDDDDDDNSDEEDNDDSDDDDDE')

In [12]:
flank_amount = 500

sequences = []

for index, variant in filtered_df.iterrows():
    gene_name = variant['gene_name']
    chromosome = "chr" + str(variant['CHROM'])
    position = int(variant['POS'])
    ref_allele = variant['REF']
    alt_allele = variant['ALT']
    label = variant['cln_significance']

    cds_list = cds_regions_loaded[gene_name]  # Get the list of CDS regions for the gene
    cds_list_sorted = sorted(cds_list, key=lambda x: x[0])  # Ensure the CDS regions are sorted


    dna_pos = 0
    is_within_cds = False
    # Find the index of the CDS region that contains the variant
    # cds_regions_loaded is sorted ascending for + strand, and descending for - strand; no need to re-sort
    for i, cds_interval in enumerate(cds_regions_loaded[gene_name]):
        cds_start = cds_interval[0]; cds_end = cds_interval[1]; strand = cds_interval[-2]
        #print(gene_name, dna_pos, position, cds_interval)
        if cds_start <= position <= cds_end:
            if strand == '+':
                dna_pos += position - cds_start
            else:
                dna_pos += cds_end - position
            is_within_cds = True
            break
        dna_pos += cds_end - cds_start + 1
    
    prot_pos = dna_pos//3

    if not is_within_cds:
        continue

    dna = ''
    for cds_start, cds_end, _, strand, chromosome in cds_regions_loaded[gene_name]:
        if strand == '+':
            dna += reference_genome[chromosome][cds_start-1:cds_end].seq
        else:
            dna += reference_genome[chromosome][cds_start-1:cds_end].reverse.complement.seq

    ref_allele_len = len(ref_allele)
    if strand == "+":
        is_ref_match = int(dna[dna_pos:(dna_pos+ref_allele_len)] == ref_allele)
        ref_dna = dna[:dna_pos] + ref_allele + dna[dna_pos+ref_allele_len:]
        alt_dna = dna[:dna_pos] + alt_allele + dna[dna_pos+ref_allele_len:]
    else:
        ref_allele = reverse_complement(ref_allele)
        alt_allele = reverse_complement(alt_allele)
        is_ref_match = int(dna[dna_pos:(dna_pos+ref_allele_len)] == ref_allele)
        ref_dna = dna[:dna_pos] + ref_allele + dna[dna_pos+ref_allele_len:]
        alt_dna = dna[:dna_pos] + alt_allele + dna[dna_pos+ref_allele_len:]

    ref_prot = translate_dna_to_protein(ref_dna)
    alt_prot = translate_dna_to_protein(alt_dna)
    ref_prot_ = ref_prot[max(0, prot_pos-flank_amount):min(len(ref_prot), prot_pos+flank_amount)]
    alt_prot_ = alt_prot[max(0, prot_pos-flank_amount):min(len(alt_prot), prot_pos+flank_amount)]
    ref_dna_ = ref_dna[max(0, dna_pos-flank_amount*3):min(len(ref_dna), dna_pos+flank_amount*3)]
    alt_dna_ = alt_dna[max(0, dna_pos-flank_amount*3):min(len(alt_dna), dna_pos+flank_amount*3)]

    sequences.append({
        'ref_prot': ref_prot_,
        'alt_prot': alt_prot_,
        'ref_dna': ref_dna_,
        'alt_dna': alt_dna_,
        'strand': strand,
        'label': label,
        'gene': gene_name,
        'ref': ref_allele,
        'alt': alt_allele,
        'POS': position,
        'is_ref_match': is_ref_match,
        'molecular_consequences': variant['cln_molecular_consequence']
    })

sequences_df = pd.DataFrame(sequences)



In [38]:
sequences_df.strand.value_counts()
sequences_df['gene']
# Assuming sequences_df is your DataFrame and 'gene' is the column you want to save
sequences_df['gene'].to_csv('clinvar_cm.csv', index=False)
df = pd.read_csv("/common/zhangz2lab/zhanh/Propath-2/PLLR_CM.csv")
# For the DataFrame loaded from CSV

# For the sequences_df DataFrame

#df['gene'] = sequences_df['gene']
#df['label'] = sequences_df['label']
#df.to_csv('/common/zhangz2lab/zhanh/Propath-2/PLLR_CM.csv', index=False)

Unnamed: 0,ref_prot,alt_prot,ref_dna,alt_dna,strand,label,gene,ref,alt,POS,is_ref_match,molecular_consequences,labels
0,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,TCCACAGCACGGTGAAGCCTTTCATATGTGAGGTCTGCCACAAGTC...,TCCACAGCACGGTGAAGCCTTTCATATGTGAGGTCTGCCACAAGTC...,+,Pathogenic,PRDM16,T,C,3414616,1,SO:0001583|missense_variant,1
1,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GGGAACCCAGCCCTGCCCCTGGTCTCCGCCGTCAGCAACAGCAGCC...,GGGAACCCAGCCCTGCCCCTGGTCTCCGCCGTCAGCAACAGCAGCC...,+,Likely_benign,PRDM16,G,A,3426095,1,SO:0001583|missense_variant,0
2,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GGCAGTGACTTTGAGGACGTCAACACCACCACGGGGACCGACCTGG...,GGCAGTGACTTTGAGGACGTCAACACCACCACGGGGACCGACCTGG...,+,Benign,PRDM16,G,A,3430888,1,SO:0001583|missense_variant,0
3,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDETPAIRNHDKVLQRLC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,+,Benign,PLEKHM2,T,C,15716271,1,SO:0001583|missense_variant,0
4,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,+,Benign,PLEKHM2,G,A,15718548,1,SO:0001583|missense_variant,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Pathogenic/Likely_pathogenic,TAFAZZIN,G,A,154413544,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1
1641,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Likely_pathogenic,TAFAZZIN,G,A,154413552,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1
1642,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Benign/Likely_benign,TAFAZZIN,T,C,154414113,0,"SO:0001583|missense_variant,SO:0001627|intron_...",0
1643,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Likely_pathogenic,TAFAZZIN,G,T,154420212,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1


In [14]:
# no ref_allele mismatch
sequences_df[sequences_df.is_ref_match==0]

Unnamed: 0,ref_prot,alt_prot,ref_dna,alt_dna,strand,label,gene,ref,alt,POS,is_ref_match,molecular_consequences
28,MNDISQKAEILLSSSKPVPKTYVPKLGKGDVKDKFEAMQRAREERN...,MNDISQKAEILLSSSKPVPKTYVPKLGKGDVKDKFEAMQRAREERN...,ATGAATGATATTTCCCAAAAGGCTGAGATTCTGCTTTCTTCATCTA...,ATGAATGATATTTCCCAAAAGGCTGAGATTCTGCTTTCTTCATCTA...,+,Likely_benign,NEXN,G,A,77926854,0,SO:0001583|missense_variant
30,MNDISQKAEILLSSSKPVPKTYVPKLGKGDVKDKFEAMQRAREERN...,MNDISQKAEILLSSSKPVPKTYVPKLGKGDVKDKFEAMQRAREERN...,ATGAATGATATTTCCCAAAAGGCTGAGATTCTGCTTTCTTCATCTA...,ATGAATGATATTTCCCAAAAGGCTGAGATTCTGCTTTCTTCATCTA...,+,Benign,NEXN,A,C,77929446,0,SO:0001583|missense_variant
32,KGDVKDKFEAMQRAREERNQRRSRDEKQRRKEQYIREREWNRRKQE...,KGDVKDKFEAMQRAREERNQRRSRDEKQRRKEQYIREREWNRRKQE...,AAGGGTGATGTAAAGGATAAGTTTGAAGCCATGCAGAGAGCCAGGG...,AAGGGTGATGTAAAGGATAAGTTTGAAGCCATGCAGAGAGCCAGGG...,+,Likely_benign,NEXN,G,C,77942131,0,SO:0001583|missense_variant
101,MSDIEEVVEEYEEEEQEEAAVEEEEDWREDEDEQEEAAEEDAEAEA...,MSDIEEVVEEYEEEEQEEAAVEEEEDWREDEDEQEEAAEEDAEAEA...,ATGTCTGACATAGAAGAGGTGGTGGAAGAGTACGAGGAGGAGGAGC...,ATGTCTGACATAGAAGAGGTGGTGGAAGAGTACGAGGAGGAGGAGC...,-,Likely_pathogenic,TNNT2,TC,GT,201365247,0,"SO:0001583|missense_variant,SO:0001627|intron_..."
168,TVSSDSVAKFAVKATGEPRPTAIWTKDGKAITQGGKYKLSEDKGGF...,TVSSDSVAKFAVKATGEPRPTAIWTKDGKAITQGGKYKLSEDKGGF...,TGTTTCTTCAGACAGTGTTGCTAAATTTGCAGTTAAGGCTACTGGA...,TGTTTCTTCAGACAGTGTTGCTAAATTTGCAGTTAAGGCTACTGGA...,-,Pathogenic,TTN,AAGTAACATGG,TGAAAGAAAAA,178527198,0,SO:0001583|missense_variant
174,RIMAEREDEELLRPVTTTQHLSEYKSELDFMSKEEKSRKKSRRQRE...,RIMAEREDEELLRPVTTTQHLSEYKSELDFMSKEEKSRKKSRRQRE...,GAATCATGGCTGAGAGGGAGGATGAAGAGTTGCTTCGCCCAGTTAC...,GAATCATGGCTGAGAGGGAGGATGAAGAGTTGCTTCGCCCAGTTAC...,-,Benign/Likely_benign,TTN,GC,TT,178530827,0,SO:0001583|missense_variant
186,SDVSRDSVNLTWTEPASDGGSKITNYIVEKCATTAERWLRVGQARE...,SDVSRDSVNLTWTEPASDGGSKITNYIVEKCATTAERWLRVGQARE...,AGTGATGTCTCACGAGATTCTGTCAACTTAACATGGACTGAGCCAG...,AGTGATGTCTCACGAGATTCTGTCAACTTAACATGGACTGAGCCAG...,-,Likely_benign,TTN,CA,TG,178534020,0,SO:0001583|missense_variant
254,DHRYEFRVIARNAAGVFSEPSESTGAITARDEVDPPRISMDPKYKD...,DHRYEFRVIARNAAGVFSEPSESTGAITARDEVDPPRISMDPKYKD...,GATCACAGATATGAGTTCCGGGTTATAGCCCGAAATGCCGCAGGAG...,GATCACAGATATGAGTTCCGGGTTATAGCCCGAAATGCCGCAGGAG...,-,Benign/Likely_benign,TTN,T,C,178569412,0,SO:0001583|missense_variant
557,MEHIQGAWKTISNGFGFKDAVFDGSSCISPTIVQQFGYQRRGSDDG...,MEHIQGAWKTISNGFGFKDAVFDGSSCISPTIVQQFGYQRRDSDDG...,ATGGAGCACATACAGGGAGCTTGGAAGACGATCAGCAATGGTTTTG...,ATGGAGCACATACAGGGAGCTTGGAAGACGATCAGCAATGGTTTTG...,-,Likely_benign,RAF1,GC,AT,12618597,0,"SO:0001583|missense_variant,SO:0001619|non-cod..."
599,MAAAAAAAAEQQSSNGPVKKSMREKAVERRSVNKEHNSNFKAGYIP...,VAAAAAAAAEQQSSNGPVKKSMREKAVERRSVNKEHNSNFKAGYIP...,AtggcggcagcggcggcggcggctgcAGAACAGCAAAGTTCCAATG...,GtggcggcagcggcggcggcggctgcAGAACAGCAAAGTTCCAATG...,-,Pathogenic,SGCB,A,G,52038259,0,"SO:0001582|initiatior_codon_variant,SO:0001583..."


In [15]:
# ref_prot != alt_prot unless other molecular_consequences exist
sequences_df[sequences_df.ref_prot==sequences_df.alt_prot][['label','molecular_consequences']]

Unnamed: 0,label,molecular_consequences
463,Likely_benign,"SO:0001583|missense_variant,SO:0001627|intron_..."
464,Benign/Likely_benign,"SO:0001583|missense_variant,SO:0001627|intron_..."
465,Benign/Likely_benign,"SO:0001583|missense_variant,SO:0001627|intron_..."
466,Benign/Likely_benign,"SO:0001583|missense_variant,SO:0001627|intron_..."
467,Likely_benign,"SO:0001583|missense_variant,SO:0001627|intron_..."
723,Likely_pathogenic,SO:0001583|missense_variant
825,Benign,"SO:0001583|missense_variant,SO:0001819|synonym..."
826,Likely_benign,"SO:0001583|missense_variant,SO:0001819|synonym..."
827,Likely_benign,"SO:0001583|missense_variant,SO:0001819|synonym..."
828,Likely_benign,"SO:0001583|missense_variant,SO:0001819|synonym..."


In [27]:
# save
sequences_df.to_csv('/common/zhangz2lab/zhanh/esm-variants/cropped/Clinvar_CM_protein.csv', index=False)

Unnamed: 0,ref_prot,alt_prot,ref_dna,alt_dna,strand,label,gene,ref,alt,POS,is_ref_match,molecular_consequences,labels
0,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,TCCACAGCACGGTGAAGCCTTTCATATGTGAGGTCTGCCACAAGTC...,TCCACAGCACGGTGAAGCCTTTCATATGTGAGGTCTGCCACAAGTC...,+,Pathogenic,PRDM16,T,C,3414616,1,SO:0001583|missense_variant,1
1,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GGGAACCCAGCCCTGCCCCTGGTCTCCGCCGTCAGCAACAGCAGCC...,GGGAACCCAGCCCTGCCCCTGGTCTCCGCCGTCAGCAACAGCAGCC...,+,Likely_benign,PRDM16,G,A,3426095,1,SO:0001583|missense_variant,0
2,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GGCAGTGACTTTGAGGACGTCAACACCACCACGGGGACCGACCTGG...,GGCAGTGACTTTGAGGACGTCAACACCACCACGGGGACCGACCTGG...,+,Benign,PRDM16,G,A,3430888,1,SO:0001583|missense_variant,0
3,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDETPAIRNHDKVLQRLC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,+,Benign,PLEKHM2,T,C,15716271,1,SO:0001583|missense_variant,0
4,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,+,Benign,PLEKHM2,G,A,15718548,1,SO:0001583|missense_variant,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Pathogenic/Likely_pathogenic,TAFAZZIN,G,A,154413544,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1
1641,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Likely_pathogenic,TAFAZZIN,G,A,154413552,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1
1642,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Benign/Likely_benign,TAFAZZIN,T,C,154414113,0,"SO:0001583|missense_variant,SO:0001627|intron_...",0
1643,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Likely_pathogenic,TAFAZZIN,G,T,154420212,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1


In [28]:
print(sequences_df['ref_dna'].iloc[0])
print(sequences_df['alt_dna'].iloc[0])

TCCACAGCACGGTGAAGCCTTTCATATGTGAGGTCTGCCACAAGTCCTACACGCAGTTCTCCAACCTGTGCCGGCACAAGCGGATGCACGCCGACTGCCGCACGCAGATCAAGTGCAAGGACTGTGGCCAGATGTTCAGCACTACCTCCTCCCTCAACAAGCACCGGCGCTTCTGCGAGGGCAAGAACCATTACACGCCGGGCGGCATCTTTGCCCCGGGCCTGCCCTTGACCCCCAGCCCCATGATGGACAAGGCAAAACCCTCCCCCAGCCTCAATCACGCCAGCCTGGGCTTCAACGAGTACTTTCCCTCCAGGCCGCACCCGGGGAGCCTGCCCTTCTCCACGGCGCCTCCCACGTTCCCCGCACTCACCCCCGGCTTCCCGGGCATCTTCCCTCCATCCTTGTACCCCCGGCCGCCTCTGCTACCTCCCACATCGCTGCTCAAGAGCCCCCTGAACCACACCCAGGACGCCAAGCTCCCCAGTCCCCTGGGGAACCCAGCCCTGCCCCTGGTCTCCGCCGTCAGCAACAGCAGCCAGGGCACGACGGCAGCTGCGGGGCCCGAGGAGAAGTTCGAGAGCCGCCTGGAGGACTCCTGTGTGGAGAAGCTGAAGACCAGGAGCAGCGACATGTCGGACGGCAGTGACTTTGAGGACGTCAACACCACCACGGGGACCGACCTGGACACGACCacggggacgggctcggacctggacagcgacgtggacagcgaccctgacaaggacaagggcaagggcaAGTCCGCCGAGGGCCAGCCCAAGTTTGGGGGCGGCTTGGCGCCCCCGGGGGCCCCGAACAGCGTGGCCGAGGTGCCTGTCTTCTATTCCCAGCACTCATTCTTCCCGCCACCCGACGAGCAGCTGCTGACTGCAACGGGCGCCGCCGGGGACTCCATCAAGGCCATCGCATCCATTGCCGAGAAGTACTTTGGCCCCGGCTTCATGGGGATGCAGGAGAAGAAGCTGGGCTCGCTCCC

In [29]:
sequences_df['ref_prot'].iloc[0] == sequences_df['alt_prot'].iloc[0]

False

In [30]:
sequences_df

Unnamed: 0,ref_prot,alt_prot,ref_dna,alt_dna,strand,label,gene,ref,alt,POS,is_ref_match,molecular_consequences,labels
0,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,TCCACAGCACGGTGAAGCCTTTCATATGTGAGGTCTGCCACAAGTC...,TCCACAGCACGGTGAAGCCTTTCATATGTGAGGTCTGCCACAAGTC...,+,Pathogenic,PRDM16,T,C,3414616,1,SO:0001583|missense_variant,1
1,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GGGAACCCAGCCCTGCCCCTGGTCTCCGCCGTCAGCAACAGCAGCC...,GGGAACCCAGCCCTGCCCCTGGTCTCCGCCGTCAGCAACAGCAGCC...,+,Likely_benign,PRDM16,G,A,3426095,1,SO:0001583|missense_variant,0
2,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GGCAGTGACTTTGAGGACGTCAACACCACCACGGGGACCGACCTGG...,GGCAGTGACTTTGAGGACGTCAACACCACCACGGGGACCGACCTGG...,+,Benign,PRDM16,G,A,3430888,1,SO:0001583|missense_variant,0
3,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDETPAIRNHDKVLQRLC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,+,Benign,PLEKHM2,T,C,15716271,1,SO:0001583|missense_variant,0
4,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,ATGGAGCCGGGGGAGGTGAAGGACCGGATCCTGGAGAACATCTCGC...,+,Benign,PLEKHM2,G,A,15718548,1,SO:0001583|missense_variant,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1640,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Pathogenic/Likely_pathogenic,TAFAZZIN,G,A,154413544,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1
1641,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Likely_pathogenic,TAFAZZIN,G,A,154413552,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1
1642,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Benign/Likely_benign,TAFAZZIN,T,C,154414113,0,"SO:0001583|missense_variant,SO:0001627|intron_...",0
1643,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,ATGCCTCTGCACGTGAAGTGGCCGTTCCCCGCGGTGCCGCCGCTCA...,+,Likely_pathogenic,TAFAZZIN,G,T,154420212,1,"SO:0001583|missense_variant,SO:0001619|non-cod...",1


In [41]:
# Adjust the mapping to account for variations in the 'cln_significance' descriptions
def map_significance(value):
    value = value.lower()  # Convert to lower case to handle case variations uniformly
    if 'benign' in value:  # This will match "Benign" and "Likely_benign"
        return 0
    else:  # Matches "Pathogenic"
        return 1
#sequences_df_rc['seq_a'] = sequences_df_rc['seq_a'].str.upper()
#sequences_df_rc['seq_b'] = sequences_df_rc['seq_b'].str.upper()
# Apply the mapping function to the 'cln_significance' column
sequences_df['labels'] = sequences_df['label'].apply(map_significance)
#sequences_df_rc['POS'] = sequences_df['POS']
#sequences_df_reset = sequences_df_rc.reset_index(drop=True)
print(sequences_df)
# Now sequences_df_reset contains the appended 'cln_significance_mapped' column
sequences_df_rc = sequences_df[['ref_prot','alt_prot','labels']]
sequences_df_rc_gene = sequences_df[['ref_prot','alt_prot','labels','gene','label']]
sequences_df_rc = sequences_df_rc.rename(columns={'ref_prot': 'wt_seq', 'alt_prot': 'mut_seq'})

                                               ref_prot  \
0     IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...   
1     GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...   
2     GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...   
3     MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...   
4     MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...   
...                                                 ...   
1640  MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...   
1641  MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...   
1642  MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...   
1643  MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...   
1644  MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...   

                                               alt_prot  \
0     IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...   
1     GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...   
2     GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...   
3     MEPGEVKDRILENISLSVKKLQSYFAACEDETPAIRNHDKVLQRLC...

In [39]:
sequences_df_rc.to_csv('/common/zhangz2lab/zhanh/esm-variants/cropped/Clinvar_CM_protein.csv', index=False)

Unnamed: 0,wt_seq,mut_seq,labels
0,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,1
1,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,0
2,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,0
3,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDETPAIRNHDKVLQRLC...,0
4,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,0
...,...,...,...
1640,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1
1641,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1
1642,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,0
1643,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1


In [43]:
sequences_df_rc = sequences_df_rc[sequences_df_rc['mut_seq'] != '']
sequences_df_rc = sequences_df_rc[sequences_df_rc['wt_seq'] != '']
sequences_df_rc_gene = sequences_df_rc_gene[sequences_df_rc_gene['ref_prot'] != '']
sequences_df_rc_gene = sequences_df_rc_gene[sequences_df_rc_gene['alt_prot'] != '']

In [34]:
sequences_df_reset = sequences_df_rc.reset_index(drop=True)
sequences_df_reset

Unnamed: 0,wt_seq,mut_seq,labels
0,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,1
1,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,0
2,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,0
3,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDETPAIRNHDKVLQRLC...,0
4,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,0
...,...,...,...
1630,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1
1631,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1
1632,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,0
1633,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1


In [35]:
sequences_df_rc.to_csv('/common/zhangz2lab/zhanh/esm-variants/cropped/Clinvar_CM_protein.csv', index=False)

In [44]:
sequences_df_rc_gene

Unnamed: 0,ref_prot,alt_prot,labels,gene,label
0,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,IHSTVKPFICEVCHKSYTQFSNLCRHKRMHADCRTQIKCKDCGQMF...,1,PRDM16,Pathogenic
1,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,GNPALPLVSAVSNSSQGTTAAAGPEEKFESRLEDSCVEKLKTRSSD...,0,PRDM16,Likely_benign
2,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,GSDFEDVNTTTGTDLDTTTGTGSDLDSDVDSDPDKDKGKGKSAEGQ...,0,PRDM16,Benign
3,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDETPAIRNHDKVLQRLC...,0,PLEKHM2,Benign
4,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,MEPGEVKDRILENISLSVKKLQSYFAACEDEIPAIRNHDKVLQRLC...,0,PLEKHM2,Benign
...,...,...,...,...,...
1640,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1,TAFAZZIN,Pathogenic/Likely_pathogenic
1641,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1,TAFAZZIN,Likely_pathogenic
1642,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,0,TAFAZZIN,Benign/Likely_benign
1643,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,MPLHVKWPFPAVPPLTWTLASSVVMGLVGTYSCFWTKYMNHLTVHN...,1,TAFAZZIN,Likely_pathogenic


In [59]:
df = pd.read_csv("/common/zhangz2lab/zhanh/Propath-2/PLLR_CM.csv")
# For the DataFrame loaded from CSV

# For the sequences_df DataFrame
sequences_df_rc_gene.reset_index(drop=True, inplace=True)
df['gene'] = sequences_df_rc_gene['gene']
df['label'] = sequences_df_rc_gene['label']
df.to_csv('/common/zhangz2lab/zhanh/Propath-2/PLLR_CM.csv', index=False)

In [5]:
import pandas as pd
df = pd.read_csv('/common/zhangz2lab/zhanh/Propath-2/PLLR_CM.csv')
value_counts = df['true_labels_callback'].value_counts()

# Print the counts
print(value_counts)

0    1113
1     522
Name: true_labels_callback, dtype: int64


0         PRDM16
1         PRDM16
2         PRDM16
3        PLEKHM2
4        PLEKHM2
          ...   
1630    TAFAZZIN
1631    TAFAZZIN
1632    TAFAZZIN
1633    TAFAZZIN
1634    TAFAZZIN
Name: gene, Length: 1635, dtype: object