# Training dataset: download, clean, examine

## Download and clean the UniProt database

In [64]:
# Run the project's download-and-clean entry point. According to the log output of
# this cell, it parses the UniProt FASTA into a table, drops duplicates, fragment
# entries and sequences with stops/non-standard amino acids, and writes the cleaned
# records under uniprot_data/.
# NOTE(review): the cleaned TSV read by the next cell is presumably written by this
# call as well -- confirm in uniprot_download_and_clean.
from uniprot_download_and_clean import main as download_and_clean_main
download_and_clean_main()


INFO: Current working directory: /workspace/uniprot_data
INFO: Parsing FASTA into DataFrame (may take a while for large files)...
INFO: Parsed 573661 sequences from FASTA
INFO: Reading TSV: ./.__tmp_uniprot_in.tsv
INFO: TSV loaded: 573661 rows, 3 cols
INFO: Dropped duplicates: 573661 -> 573661
INFO: Dropping 9318 fragment entries (protein_name contains 'fragment')
INFO: Dropping 1698 invalid sequences (stops or non-standard AA).
INFO: Cleaning complete: 562645 rows remaining
INFO: Writing 562645 FASTA records to uniprot_data/uniprot_sprot_cleaned.fasta
INFO: Bulk-download cleaning finished; outputs in uniprot_data


## Add species names

In [65]:
import pandas as pd
import numpy as np
# Load the cleaned table with every column read as a string; na_filter=False keeps
# empty fields as '' rather than converting them to NaN.
df = pd.read_csv(
    'uniprot_data/uniprot_sprot_cleaned.tsv',
    sep="\t",
    dtype=str,
    na_filter=False,
)


def _organism_from_header(header):
    """Return the text between the last 'OS=' and the next ' OX=' tag, or '' without 'OS='."""
    if 'OS=' not in header:
        return ''
    after_os = header.rpartition('OS=')[2]
    return after_os.partition(' OX=')[0].strip()


def _first_two_words(organism):
    """Keep only the first two space-separated tokens of the organism string."""
    tokens = organism.split(' ')
    return ' '.join(tokens[:2])


# species_raw: full organism string from the header; species: its first two words.
df['species_raw'] = df['protein_name'].apply(_organism_from_header)
df['species'] = df['species_raw'].apply(_first_two_words)
df

Unnamed: 0,accession,protein_name,sequence,species_raw,species
0,001R_FRG3G,sp|Q6GZX4|001R_FRG3G Putative transcription fa...,MAFSAEDVLKEYDRRRRMEALLLSLYYPNDRKLLDYKEWSPPRVQV...,Frog virus 3 (isolate Goorha),Frog virus
1,002L_FRG3G,sp|Q6GZX3|002L_FRG3G Uncharacterized protein 0...,MSIIGATRLQNDKSDTYSAGPCYAGGCSAFTPRGTCGKDWDLGEQT...,Frog virus 3 (isolate Goorha),Frog virus
2,002R_IIV3,sp|Q197F8|002R_IIV3 Uncharacterized protein 00...,MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDPIWNEQPGSIVPWK...,Invertebrate iridescent virus 3,Invertebrate iridescent
3,003L_IIV3,sp|Q197F7|003L_IIV3 Uncharacterized protein 00...,MYQAINPCPQSWYGSPQLEREIVCKMSGAPHYPNYYPVHPNALGGA...,Invertebrate iridescent virus 3,Invertebrate iridescent
4,003R_FRG3G,sp|Q6GZX2|003R_FRG3G Uncharacterized protein 3...,MARPLLGKTSSVRRRLESLSACSIFFFLRKFCQKMASLVFLNSPVY...,Frog virus 3 (isolate Goorha),Frog virus
...,...,...,...,...,...
562640,Z_SABVB,sp|Q6UY62|Z_SABVB RING finger protein Z OS=Sab...,MGNSKSKSKLSANQYEQQTVNSTKQVAILKRQAEPSLYGRHNCRCC...,Sabia mammarenavirus (isolate Human/Brasil/SPH...,Sabia mammarenavirus
562641,Z_SHEEP,sp|P08105|Z_SHEEP Putative uncharacterized pro...,MSSSLEITSFYSFIWTPHIGPLLFGIGLWFSMFKEPSHFCPCQHPH...,Ovis aries,Ovis aries
562642,Z_TACVF,sp|Q88470|Z_TACVF RING finger protein Z OS=Tac...,MGNCNRTQKPSSSSNNLEKPPQAAEFRRTAEPSLYGRYNCKCCWFA...,Tacaribe virus (strain Franze-Fernandez),Tacaribe virus
562643,Z_TAMVU,sp|A9JR22|Z_TAMVU RING finger protein Z OS=Tam...,MGLRYSKEVRDRHGDKDPEGRIPITQTMPQTLYGRYNCKSCWFANK...,Tamiami mammarenavirus (isolate Rat/United Sta...,Tamiami mammarenavirus


## Select the species to keep (most frequent plus hand-picked)

In [66]:
import numpy as np
def constrain_species_names(df, n_top=10, extra_species=None):
    """Keep only rows whose species is among the most frequent or explicitly requested.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'species' column.
    n_top : int, default 10
        Number of most frequent 'species' values (as ranked by ``value_counts``)
        to keep.
    extra_species : iterable of str, optional
        Additional species names to keep regardless of how frequent they are.
        When omitted, the three hand-picked names below are used.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose 'species' value is selected; the original index
        labels are preserved.
    """
    if extra_species is None:
        # 'Mesorhizobium opportunistum' was also wanted, but the original code
        # listed it as missing from the dataset, so it is not included here.
        extra_species = (
            'Rhodotorula toruloides',
            'Staphylococcus aureus',
            'Saccharolobus solfataricus',
        )

    top_species = df['species'].value_counts().index[:n_top]
    selected = set(top_species).union(extra_species)

    return df[df['species'].isin(selected)]

# Restrict the table to the selected species: with the default n_top=10 this keeps
# the ten most frequent species plus the extra names listed in constrain_species_names.
df_spec = constrain_species_names(df)

# Display the filtered table as the cell output.
df_spec

Unnamed: 0,accession,protein_name,sequence,species_raw,species
165,10D1B_MOUSE,sp|Q60888|10D1B_MOUSE Olfactory receptor 10D1B...,MKNLSVVTQFILLGIPHTEGVETMLFVLFFSFYIFTLVGNLLILLA...,Mus musculus,Mus musculus
166,10H28_MOUSE,sp|Q8VBW9|10H28_MOUSE Olfactory receptor 10H28...,MPGQNYSTISEFILFGFSAFPHQMLPALFLLYLLMYLFTLLGNLVI...,Mus musculus,Mus musculus
169,10P22_MOUSE,sp|Q60885|10P22_MOUSE Olfactory receptor 10P22...,MGDDNDTDITEFILLGFSGYGFLQGHLFWGVLCIYVVTLLGNSLIV...,Mus musculus,Mus musculus
277,14310_ARATH,sp|P48347|14310_ARATH 14-3-3-like protein GF14...,MENEREKQVYLAKLSEQTERYDEMVEAMKKVAQLDVELTVEERNLV...,Arabidopsis thaliana,Arabidopsis thaliana
279,14311_ARATH,sp|Q9S9Z8|14311_ARATH 14-3-3-like protein GF14...,MENERAKQVYLAKLNEQAERYDEMVEAMKKVAALDVELTIEERNLL...,Arabidopsis thaliana,Arabidopsis thaliana
...,...,...,...,...,...
562617,ZYX_MOUSE,sp|Q62523|ZYX_MOUSE Zyxin OS=Mus musculus OX=1...,MAAPRPPPAISVSVSAPAFYAPQKKFAPVVAPKPKVNPFRPGDSEP...,Mus musculus,Mus musculus
562620,ZZEF1_HUMAN,sp|O43149|ZZEF1_HUMAN Zinc finger ZZ-type and ...,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,Homo sapiens,Homo sapiens
562621,ZZEF1_MOUSE,sp|Q5SSH7|ZZEF1_MOUSE Zinc finger ZZ-type and ...,MGNAPSNSSEDEAAAAGGEGWSPHQDWAADSGTTPGPGPAAAVLPS...,Mus musculus,Mus musculus
562622,ZZZ3_HUMAN,sp|Q8IYH5|ZZZ3_HUMAN ZZ-type zinc finger-conta...,MAASRSTRVTRSTVGLNGLDESFCGRTLRNRSIAHPEEISSNSQVR...,Homo sapiens,Homo sapiens


## Save to TSV

In [67]:
# Write the species-filtered table as tab-separated text; index=False leaves the
# row index out of the file.
df_spec.to_csv('uniprot_data/uniprot_sprot_cleaned_selected_species.tsv', sep="\t", index=False)