In [6]:
import os

# os.chdir("/scratch/project/tcr_neoantigen/misc/jaz/scripts")

from pathlib import Path
from pyfaidx import Fasta
from tqdm import tqdm

from _utils import (
    find,
    read_and_filter,
    create_result_list,
    extract_result,
    extract_exon_info,
    filter_exon_pos,
    get_sequences_indel,
    get_sequences_substitution,
    complementary_sequence,
    reverse_complement,
    flanking_lower_positions,
    # print_windows,
)

# define head paths
HG38FOLDER = Path("/Users/Ali/Desktop/Long Covid/minigene_design/Reference/GRCh38.d1.vd1/fasta")
INPUTFOLDER = Path("/Users/Ali/Desktop/Long Covid/minigene_design/input")

In [7]:
# import reference and data
fasta_file = HG38FOLDER / "GRCh38.d1.vd1.fa"
gtf_file = HG38FOLDER / "gencode.v33.primary_assembly.annotation.gtf"

# Ensure the path exists
if not fasta_file.exists():
    raise FileNotFoundError(f"The specified FASTA file was not found: {fasta_file}")

if not gtf_file.exists():
    raise FileNotFoundError(f"The specified GTF file was not found: {gtf_file}")

refgen = Fasta(filename=fasta_file)
exon_info = extract_exon_info(gtf_file)

In [8]:
samples = [
    "2020135",
    "2020239_WO1",
    "2020246_NO1",
    "2020260_WO1",
    "2020281_WO1",
    "2021111_MO1",
    "DES001",
    "DES002",
    "DES002_001",
    "DES002_002",
    "DES010",
]

In [9]:
results = {}
for sample in samples:
    file_input_path = (
        INPUTFOLDER
        / sample
        / "analyses"
        / sample
        / "05_vep"
        / "tables"
        / "high_confidence"
        / f"{sample}_hc_vep.txt"
    )
    results[sample] = read_and_filter(file_input_path)

for sample in samples:
    print(sample, results[sample].shape)

for sample in samples:
    print(sample, results[sample].VARIANT_CLASS.unique())

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Ali/Desktop/Long Covid/minigene_design/input/2020135/analyses/2020135/05_vep/tables/high_confidence/2020135_hc_vep.txt'

In [None]:
for sample in samples:
    df = results[sample].copy()
    mut_dict = create_result_list(df)
    mutations = find(mut_dict["variant_class"], "insertion")
    if len(mutations) > 0:
        for mut in tqdm(mutations):
            ref_seqs, var_seqs = get_sequences_indel(
                mut_info=extract_result(mut_dict, mut),
                exon_info=exon_info,
                fasta=refgen,
            )
    mutations = find(mut_dict["variant_class"], "deletion")
    if len(mutations) > 0:
        for mut in tqdm(mutations):
            ref_seqs, var_seqs = get_sequences_indel(
                mut_info=extract_result(mut_dict, mut),
                exon_info=exon_info,
                fasta=refgen,
            )
    mutations = find(mut_dict["variant_class"], "SNV")
    if len(mutations) > 0:
        for mut in tqdm(mutations):
            ref_seqs, var_seqs = get_sequences_substitution(
                mut_info=extract_result(mut_dict, mut),
                exon_info=exon_info,
                fasta=refgen,
            )
    mutations = find(mut_dict["variant_class"], "substitution")
    if len(mutations) > 0:
        for mut in tqdm(mutations):
            ref_seqs, var_seqs = get_sequences_substitution(
                mut_info=extract_result(mut_dict, mut),
                exon_info=exon_info,
                fasta=refgen,
            )

100%|██████████| 5/5 [00:00<00:00, 1646.37it/s]
100%|██████████| 102/102 [00:00<00:00, 2337.83it/s]
100%|██████████| 7/7 [00:00<00:00, 970.45it/s]
100%|██████████| 88/88 [00:00<00:00, 3856.23it/s]
100%|██████████| 3/3 [00:00<00:00, 1848.53it/s]
100%|██████████| 67/67 [00:00<00:00, 4835.39it/s]
100%|██████████| 2/2 [00:00<00:00, 1430.53it/s]
100%|██████████| 25/25 [00:00<00:00, 3808.71it/s]
100%|██████████| 1/1 [00:00<00:00, 1326.47it/s]
100%|██████████| 4/4 [00:00<00:00, 1059.03it/s]
100%|██████████| 24/24 [00:00<00:00, 3078.48it/s]
100%|██████████| 1/1 [00:00<00:00, 604.19it/s]
100%|██████████| 31/31 [00:00<00:00, 3525.01it/s]
100%|██████████| 4/4 [00:00<00:00, 731.26it/s]
100%|██████████| 17/17 [00:00<00:00, 835.38it/s]
100%|██████████| 3711/3711 [00:00<00:00, 4249.83it/s]
100%|██████████| 432/432 [00:00<00:00, 5268.17it/s]
100%|██████████| 1/1 [00:00<00:00, 1392.99it/s]
100%|██████████| 10/10 [00:00<00:00, 1009.39it/s]
100%|██████████| 1107/1107 [00:00<00:00, 4044.25it/s]
100%|█████

In [None]:
seq = "TTCGCTCAAGGCTGTGGGTCCACCACCCCAAACCCCTCCACGAAGACACAGGGGCCTGCAGGCTGCCCGGCCAGCGGAGCCCACCCTACCCAGT"


In [2]:
import os

# os.chdir("/scratch/project/tcr_neoantigen/misc/jaz/scripts")

from pathlib import Path
from pyfaidx import Fasta
from tqdm import tqdm
import pandas as pd

varient_csv = Path("/Users/Ali/Desktop/Long Covid/minigene_design/input/final_minigenes_variant_v2.csv")
control_csv = Path("/Users/Ali/Desktop/Long Covid/minigene_design/input/final_minigenes_control_v2.csv")
df1 = pd.read_csv(varient_csv)
df2 = pd.read_csv(control_csv)

In [27]:
# situation 1: GAA GAC, CTT CTG, GTC TTC, GTC TTC, CAG AAG
dict1={"GAA" : "GAG",
"CTT" : "CTA",
"GTC" : "GTA",
"CAG" : "CAA"}
# situation 2: _GA AGAC, _CT TCT G, _GT CTT C, _CA GAA G
dict2={
"AGA" : "AGG",
"GGA" : "GGG",
"CGA" : "CGG",
"TGA" : "TAG",
"ACT" : "ACC",
"GCT" : "GCC",
"CCT" : "CCC",
"TCT" : "TCC",
"AGT" : "AGC",
"GGT" : "GGC",
"CGT" : "CGC",
"TGT" : "TGC",
"ACA" : "ACC",
"GCA" : "GCC",
"CCA" : "CCC",
"TCA" : "TCC"}

# situation 3: __G AAG AC, __C TTC TG, __G TCT TC, __C AGA AG
dict3={"AAG" : "AAA",
"ACG" : "ACT",
"AGG" : "AGA",
"TAG" : "TGA",
"TTG" : "TTA",
"TCG" : "TCT",
"CAG" : "CAA",
"CTG" : "CTA",
"CGG" : "CGA",
"GAG" : "GAA",
"GTG" : "GTA",
"GCG" : "GCT",
"GGG" : "GGA",
"AAC" : "AAT",
"ACC" : "ACT",
"AGC" : "AGT",
"ATC" : "ATT",
"TAC" : "TAT",
"TTC" : "TTT",
"TCC" : "TCT",
"TGC" : "TGT",
"CAC" : "CAT",
"CTC" : "CTT",
"CGC" : "CGT",
"CCC" : "CCT",
"GAC" : "GAT",
"GTC" : "GTT",
"GCC" : "GCT",
"GGC" : "GGT"}

# situation 4: ATG, TGG
dict4={"AAG" : "AAA",
"TCT" : "TCC"}


In [29]:
import re
bbs1_varient = []
for seq in df1["minigene"]:
	for x in ["GAAGAC","CTTCTG","GTCTTC","CAGAAG",]:
		if re.search(x, seq):
			# first need to find the location number of the sequence and see if that number is divisible by 3
			position = seq.find(x)
			if position % 3 == 0:
				# situation 1
				pos = position
				new_seq = seq[:pos] + dict1[seq[pos:pos+3]] + seq[pos+3:]
			else:
				if (position - 1) % 3 == 0:
					# situation 2
					pos = position - 1
					new_seq = seq[:pos] + dict2[seq[pos:pos+3]] + seq[pos+3:]
				else:
					# situation 3
					pos = position - 2
					if seq[pos:pos+3] in dict2:
						new_seq = seq[:pos] + dict3[seq[pos:pos+3]] + seq[pos+3:]
					else:
						# becomes situation 2
						pos = position + 1
						new_seq = seq[:pos] + dict4[seq[pos:pos+3]] + seq[pos+3:]
								
		else:
			new_seq = "N/A"
	bbs1_varient.append(new_seq)


df1["modified_minigene"] = bbs1_varient
# Save the DataFrame to a CSV file
output_csv = Path("/Users/Ali/Desktop/Long Covid/minigene_design/output/modified_minigenes_variant_v2.csv")
df1.to_csv(output_csv, index=False)


KeyError: 'TTC'

In [None]:
df1

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id,modified_minigene
0,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...,2020135_14_ENSP00000311684.7_0_0_var,
1,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGGA...,2020135_14_ENSP00000311684.7_0_1_var,
2,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAGC...,2020135_14_ENSP00000311684.7_0_2_var,
3,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTGA...,2020135_14_ENSP00000311684.7_0_3_var,
4,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGCA...,2020135_14_ENSP00000311684.7_0_4_var,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190644,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGA...,DES010_195_ENSP00000334733.7_1_0_var,
190645,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GGCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGG...,DES010_195_ENSP00000334733.7_2_0_var,
190646,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGAT...,DES010_197_ENSP00000164640.4_0_0_var,
190647,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GGCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGA...,DES010_197_ENSP00000164640.4_1_0_var,


In [None]:
import re
bbs1_varient = []
for seq in df2["minigene"]:
	for x in ["GAAGAC","CTTCTG","GTCTTC","CAGAAG",]:
		if re.search(x, seq):
			# first need to find the location number of the sequence and see if that number is divisible by 3
			position = seq.find(x)
			if position % 3 == 0:
				# situation 1
				pos = position
				new_seq = seq[:pos] + dict1[seq[pos:pos+3]] + seq[pos+3:]
				# print("1",new_seq )
			else:
				if (position - 1) % 3 == 0:
					# situation 2
					pos = position - 1
					new_seq = seq[:pos] + dict2[seq[pos:pos+3]] + seq[pos+3:]
					# print("2",new_seq )
				else:
					# situation 3
					pos = position - 2
					if seq[pos:pos+3] in dict2:
						new_seq = seq[:pos] + dict3[seq[pos:pos+3]] + seq[pos+3:]
						# print("3",new_seq )
					else:
						# becomes situation 2
						pos = position - 1
						new_seq = seq[:pos] + dict2[seq[pos:pos+3]] + seq[pos+3:]
						# print("4",new_seq )		
		else:
			new_seq = "N/A"
	bbs1_varient.append(new_seq)


df2["modified_minigene"] = bbs1_varient
# Save the DataFrame to a CSV file
output_csv = Path("/Users/Ali/Desktop/Long Covid/minigene_design/output/final_minigenes_control_v2.csv")
df2.to_csv(output_csv, index=False)



In [None]:
df2

Unnamed: 0,reference,variant,amino_acid_ref,amino_acid_var,codon_ref,codon_var,chromosome,mutation_location,mutation_location2,variant_class,gene_id,protein_id,transcript_id,gene_symbol,strand,minigene,minigene_id,modified_minigene
0,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCACTGAGGATGAGGCCCTGGGGACCCCAGAGACTGGGGCTGCCAA...,2020135_14_ENSP00000311684.7_0_0_ref,
1,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,CCCATGGACTGGCAGGAGCAGGGAAGGGCTCCCTCTCAGGACCAGG...,2020135_14_ENSP00000311684.7_0_1_ref,
2,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GCTCCCAGCCCAGAGGCCCTCCCCTCCCCAGGCCAGGAGCCCGCAG...,2020135_14_ENSP00000311684.7_0_2_ref,
3,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,GGGGCTAGCCCCAGGCGGGGAGAGCTCCGCAGGGGCAGCTCGGCTG...,2020135_14_ENSP00000311684.7_0_3_ref,
4,C,-,T,X,aCc,ac,chr2,219483369,219483369,deletion,ENSG00000072195,ENSP00000311684,ENST00000312358,SPEG,1,AGCGCCCTGCCCCGGGCCGGGCCGCGGGAGCTGGGCCGGGGCCTGC...,2020135_14_ENSP00000311684.7_0_4_ref,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190649,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGGG...,DES010_195_ENSP00000334733.7_1_0_ref,
190650,CC,TT,G,K,GGa,AAa,chrX,108221301,108221302,substitution,ENSG00000197565,ENSP00000334733,ENST00000334504,COL4A6,-1,GGCCCAACAGGTCCTCAAGGATTCACTGGCTCTACTGGTTTATCGG...,DES010_195_ENSP00000334733.7_2_0_ref,
190651,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGGG...,DES010_197_ENSP00000164640.4_0_0_ref,
190652,CC,AT,G,I,GGc,ATc,chrX,153804140,153804141,substitution,ENSG00000067840,ENSP00000164640,ENST00000164640,PDZD4,-1,GGCGGGCCATGGCCGGCAACTCCAACTTGAACCGGACCCCTCCCGG...,DES010_197_ENSP00000164640.4_1_0_ref,
