In [2]:
import re

import pandas as pd

In [3]:
# Source table with precomputed embeddings; the relative path is resolved
# against the kernel's working directory.
embeddings_path = "../data/swissprot-embeddings.parquet"
df = pd.read_parquet(embeddings_path)

In [34]:
# Keep only rows in which every column is populated; .copy() yields an
# independent frame for the column assignments in later cells.
df = df.dropna(axis="index", how="any").copy()

In [35]:
# Length of each sequence string, computed with the vectorised string accessor
# rather than a per-element Python callback.
df["sequence_length"] = df["sequence"].str.len()

In [37]:
# Preview the table keyed by accession. The result is not assigned back to df,
# so "accession" stays a regular column (it is selected by name when the export
# frame is built further down).
df.set_index("accession")

Unnamed: 0_level_0,name,description,sequence,embedding,sequence_length
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Q6GZX4,001R_FRG3G,Putative transcription factor 001R OS=Frog vir...,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,"[-0.082492255, -0.1319762, 0.115043, 0.2082809...",256
Q6GZX3,002L_FRG3G,Uncharacterized protein 002L OS=Frog virus 3 (...,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[0.035897277, 0.056123216, 0.06703286, 0.14852...",320
Q197F8,002R_IIV3,Uncharacterized protein 002R OS=Invertebrate i...,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,"[-0.07549502, -0.033397272, -0.051575065, -0.0...",458
Q197F7,003L_IIV3,Uncharacterized protein 003L OS=Invertebrate i...,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,"[0.05108296, -0.36502212, -0.09640725, 0.16490...",156
Q6GZX2,003R_FRG3G,Uncharacterized protein 3R OS=Frog virus 3 (is...,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,"[0.02195684, 0.010273599, 0.14135766, 0.000406...",438
...,...,...,...,...,...
Q6UY62,Z_SABVB,RING finger protein Z OS=Sabia mammarenavirus ...,MGNSKSKSKLSANQYEQQTVNSTKQVAILKRQAEPSLYGRHNCRCC...,"[-0.010571569, -0.08823345, 0.0424352, 0.06595...",100
P08105,Z_SHEEP,Putative uncharacterized protein Z OS=Ovis ari...,MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPH...,"[-0.039514273, -0.3319587, 0.09509317, 0.11412...",79
Q88470,Z_TACVF,RING finger protein Z OS=Tacaribe virus (strai...,MGNCNRTQKPSSSSNNLEKPPQAAEFRRTAEPSLYGRYNCKCCWFA...,"[0.0062101833, -0.14893793, -0.0045073153, 0.0...",95
A9JR22,Z_TAMVU,RING finger protein Z OS=Tamiami mammarenaviru...,MGLRYSKEVRDRHGDKDPEGRIPITQTMPQTLYGRYNCKSCWFANK...,"[-0.018034836, -0.11024573, 0.015278675, 0.199...",95


In [42]:
# Take the first row's header text as a worked example for the parser.
desc = df["description"].iloc[0]

In [43]:
# Display the sample header so its field layout (OS=, OX=, GN=, PE=, SV=) is visible.
desc

'Putative transcription factor 001R OS=Frog virus 3 (isolate Goorha) OX=654924 GN=FV3-001R PE=4 SV=1'

In [134]:
# Layout: "<protein name> OS=<organism> OX=<taxon id> [GN=<gene>] PE=<n> SV=<n>".
# Each tag must start the string or follow whitespace, so a tag-like substring
# inside a word (e.g. "XOS=" in a protein name) is not mistaken for a field.
_UNIPROT_HEADER_RE = re.compile(
    r"""
    ^(?P<description>.*?)
    (?<!\S)OS=(?P<OS>.*?)
    (?<!\S)OX=(?P<OX>.*?)
    (?:(?<!\S)GN=(?P<GN>.*?))?
    (?<!\S)PE=(?P<PE>.*?)
    (?<!\S)SV=(?P<SV>.*)$
    """,
    re.VERBOSE | re.DOTALL,
)


def parse_uniprot_fasta_header(d):
    """Split the description part of a UniProt FASTA header into its fields.

    Parameters
    ----------
    d : str
        Header text of the form
        ``"<protein name> OS=<organism> OX=<id> [GN=<gene>] PE=<n> SV=<n>"``.
        The ``GN=`` field is optional.

    Returns
    -------
    dict
        ``"OS"`` (str), ``"OX"`` (int), ``"GN"`` (str, or ``None`` when the
        header has no gene name), ``"PE"`` (int), ``"description"`` (str, the
        text before ``OS=``) and ``"SV"`` (int). The keys are always present
        and always in this order.

    Raises
    ------
    ValueError
        If a mandatory field is missing or ``OX``/``PE``/``SV`` is not an
        integer.
    """
    match = _UNIPROT_HEADER_RE.match(d)
    if match is None:
        raise ValueError(f"not a UniProt FASTA header description: {d!r}")

    # Trim the whitespace that separates consecutive fields.
    fields = {
        key: (value.strip() if value is not None else None)
        for key, value in match.groupdict().items()
    }

    return {
        "OS": fields["OS"],
        "OX": int(fields["OX"]),
        # GN is optional in the header; it is reported as None rather than
        # omitted so every result has the same set of keys.
        "GN": fields["GN"],
        "PE": int(fields["PE"]),
        "description": fields["description"],
        "SV": int(fields["SV"]),
    }
		
# Run the parser on the sample header and display the resulting dict.
parse_uniprot_fasta_header(desc)

{'OS': 'Frog virus 3 (isolate Goorha)',
 'OX': 654924,
 'GN': 'FV3-001R',
 'PE': 4,
 'description': 'Putative transcription factor 001R',
 'SV': 1}

In [131]:
# Parse every header once and reuse the parsed dicts for all derived columns,
# instead of re-running the parser over the whole column for each new column.
parsed_headers = df["description"].map(parse_uniprot_fasta_header)
header_field_by_column = {
    "OrganismName": "OS",
    "OrganismIdentifier": "OX",
    "ProteinExistence": "PE",
    "SequenceVersion": "SV",
    "ProteinName": "description",
}
for column_name, header_field in header_field_by_column.items():
    df[column_name] = parsed_headers.map(lambda fields: fields[header_field])

In [135]:
# Gene name taken from each header; the parser reports None when a header has
# no GN= field.
df["GeneName"] = df["description"].map(
    lambda header: parse_uniprot_fasta_header(header)["GN"]
)

In [142]:
# Preview of the snake_case column names. rename() returns a new frame and the
# result is not assigned, so df itself keeps its current column names here.
snake_case_preview = {
    "OrganismName": "organism_name",
    "OrganismIdentifier": "organism_identifier",
    "ProteinExistence": "protein_existence",
    "SequenceVersion": "sequence_version",
    "ProteinName": "protein_name",
    "GeneName": "gene_name",
}
df.rename(columns=snake_case_preview)

Unnamed: 0,accession,name,description,sequence,embedding,sequence_length,organism_name,organism_identifier,protein_existence,sequence_version,protein_name,gene_name
0,Q6GZX4,001R_FRG3G,Putative transcription factor 001R OS=Frog vir...,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,"[-0.082492255, -0.1319762, 0.115043, 0.2082809...",256,Frog virus 3 (isolate Goorha),654924,4,1,Putative transcription factor 001R,FV3-001R
1,Q6GZX3,002L_FRG3G,Uncharacterized protein 002L OS=Frog virus 3 (...,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,"[0.035897277, 0.056123216, 0.06703286, 0.14852...",320,Frog virus 3 (isolate Goorha),654924,4,1,Uncharacterized protein 002L,FV3-002L
2,Q197F8,002R_IIV3,Uncharacterized protein 002R OS=Invertebrate i...,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,"[-0.07549502, -0.033397272, -0.051575065, -0.0...",458,Invertebrate iridescent virus 3,345201,4,1,Uncharacterized protein 002R,IIV3-002R
3,Q197F7,003L_IIV3,Uncharacterized protein 003L OS=Invertebrate i...,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,"[0.05108296, -0.36502212, -0.09640725, 0.16490...",156,Invertebrate iridescent virus 3,345201,4,1,Uncharacterized protein 003L,IIV3-003L
4,Q6GZX2,003R_FRG3G,Uncharacterized protein 3R OS=Frog virus 3 (is...,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,"[0.02195684, 0.010273599, 0.14135766, 0.000406...",438,Frog virus 3 (isolate Goorha),654924,3,1,Uncharacterized protein 3R,FV3-003R
...,...,...,...,...,...,...,...,...,...,...,...,...
570152,Q6UY62,Z_SABVB,RING finger protein Z OS=Sabia mammarenavirus ...,MGNSKSKSKLSANQYEQQTVNSTKQVAILKRQAEPSLYGRHNCRCC...,"[-0.010571569, -0.08823345, 0.0424352, 0.06595...",100,Sabia mammarenavirus (isolate Human/Brasil/SPH...,2169992,1,1,RING finger protein Z,Z
570153,P08105,Z_SHEEP,Putative uncharacterized protein Z OS=Ovis ari...,MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPH...,"[-0.039514273, -0.3319587, 0.09509317, 0.11412...",79,Ovis aries,9940,4,1,Putative uncharacterized protein Z,
570154,Q88470,Z_TACVF,RING finger protein Z OS=Tacaribe virus (strai...,MGNCNRTQKPSSSSNNLEKPPQAAEFRRTAEPSLYGRYNCKCCWFA...,"[0.0062101833, -0.14893793, -0.0045073153, 0.0...",95,Tacaribe virus (strain Franze-Fernandez),928313,1,3,RING finger protein Z,Z
570155,A9JR22,Z_TAMVU,RING finger protein Z OS=Tamiami mammarenaviru...,MGLRYSKEVRDRHGDKDPEGRIPITQTMPQTLYGRYNCKSCWFANK...,"[-0.018034836, -0.11024573, 0.015278675, 0.199...",95,Tamiami mammarenavirus (isolate Rat/United Sta...,45223,3,1,RING finger protein Z,Z


In [144]:
# Switch the derived columns to snake_case and give the embedding column a
# more specific name; this time the renamed frame replaces df.
column_renames = {
    "OrganismName": "organism_name",
    "OrganismIdentifier": "organism_identifier",
    "ProteinExistence": "protein_existence",
    "SequenceVersion": "sequence_version",
    "ProteinName": "protein_name",
    "GeneName": "gene_name",
    "embedding": "embedding_esm2_t6_8M_UR50D",
}
df = df.rename(columns=column_renames)

In [149]:
# Columns to export, in output order; the raw "name" and "description" columns
# are left out.
export_columns = [
    "accession",
    "protein_name",
    "organism_name",
    "organism_identifier",
    "gene_name",
    "protein_existence",
    "sequence_version",
    "sequence",
    "sequence_length",
    "embedding_esm2_t6_8M_UR50D",
]
formatted = df.loc[:, export_columns].copy()

In [151]:
# Persist the reshaped table next to the input file.
output_path = "../data/swissprot-embeddings-v2.parquet"
formatted.to_parquet(output_path)