Imports

In [14]:
import functools
import os
import re

import torch
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

Constants

In [3]:
# Sequence file passed to preprocess_dataset(); it is the final output of the length-filtering cell.
UNIREF50_PATH = 'uniref50_20_512_oneliner_noheader.fasta'
# Directory the tokenized dataset is saved to and later reloaded from.
SAVE_PATH = 'uniref50/'

Functions

In [4]:
def preprocess_dataset(dataset_path):
    """Load a text dataset, shuffle it and derive the columns used for tokenization.

    Args:
        dataset_path: Path of the file handed to the ``text`` loader of
            ``load_dataset``.

    Returns:
        The shuffled dataset after mapping, where every example gains:
          * ``Seqs``   - the characters of ``text`` joined by single spaces,
                         with the letters U, Z, O and B replaced by X.
          * ``length`` - the number of characters in ``text``.
    """
    def to_spaced_sequence(example):
        # Separate every character with a space, then map the letters
        # U/Z/O/B to X.
        residues = example['text']
        spaced = " ".join(residues)
        return {
            'Seqs': re.sub(r"[UZOB]", "X", spaced),
            'length': len(residues),
        }

    raw = load_dataset("text", data_files=[dataset_path])
    # Fixed seed keeps the shuffled order the same between runs.
    shuffled = raw.shuffle(seed=42)
    return shuffled.map(to_spaced_sequence)

@functools.lru_cache(maxsize=1)
def _get_tokenizer():
    """Load the Rostlab/prot_bert tokenizer once and reuse it on later calls."""
    return AutoTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)


def tokenize_function(examples):
    """Tokenize the ``Seqs`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``"Seqs"`` entry holding the sequence text(s)
            to tokenize.

    Returns:
        The tokenizer output for ``examples["Seqs"]``, with special tokens
        added and the special-tokens mask included.
    """
    # The tokenizer comes from a cached helper: Dataset.map(batched=True)
    # calls this function once per batch, and loading the tokenizer on each
    # call would repeat the from_pretrained work every time.
    tokenizer = _get_tokenizer()
    return tokenizer(
        examples["Seqs"],
        add_special_tokens=True,
        return_special_tokens_mask=True,
    )

Get UniRef50 from UniProt

In [5]:
# -nc (no-clobber): skip the download when uniref50.fasta.gz is already present, so re-running
# this cell does not fetch the 9.8 GB archive again and save it as uniref50.fasta.gz.1.
!wget -nc https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz
# -d decompresses; -k keeps the original .gz next to the extracted uniref50.fasta.
!gzip -dk uniref50.fasta.gz

--2022-04-20 07:38:50--  https://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref50/uniref50.fasta.gz
Resolving ftp.uniprot.org (ftp.uniprot.org)... 128.175.240.195
Connecting to ftp.uniprot.org (ftp.uniprot.org)|128.175.240.195|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10492981301 (9.8G) [application/x-gzip]
Saving to: ‘uniref50.fasta.gz’


2022-04-20 07:49:09 (16.2 MB/s) - ‘uniref50.fasta.gz’ saved [10492981301/10492981301]



Install seqkit (https://github.com/shenwei356/seqkit)

In [6]:
# Fetch the seqkit v2.2.0 Linux/amd64 release; -nc skips the download if the tarball already exists.
!wget -nc https://github.com/shenwei356/seqkit/releases/download/v2.2.0/seqkit_linux_amd64.tar.gz
# Unpack the archive, then copy the seqkit binary into /usr/local/bin/ (requires sudo rights).
!tar -xf seqkit_linux_amd64.tar.gz
!sudo cp seqkit /usr/local/bin/

--2022-04-20 07:58:49--  https://github.com/shenwei356/seqkit/releases/download/v2.2.0/seqkit_linux_amd64.tar.gz
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/52715040/35c1f176-98aa-4d0c-98b5-2c4dd0e04da9?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20220420%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20220420T075850Z&X-Amz-Expires=300&X-Amz-Signature=426cb3b859309a79d2edbf1f111d2805e4aa9dc38f06605f54c157cd4034d497&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=52715040&response-content-disposition=attachment%3B%20filename%3Dseqkit_linux_amd64.tar.gz&response-content-type=application%2Foctet-stream [following]
--2022-04-20 07:58:50--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/52715040/35c1f176-98aa-4d0c-98b5-2c4

Keep only sequences between 20 and 512 amino acids long

In [12]:
# Single pass over uniref50.fasta (no intermediate FASTA files are written):
#   -m 20 / -M 512  keep sequences of 20 to 512 residues
#   -w 0            write each sequence on one line (no line wrapping)
#   -s              print only the sequences, dropping the ">" header lines
# The output is the file that UNIREF50_PATH points to.
!seqkit seq -m 20 -M 512 -w 0 -s uniref50.fasta > uniref50_20_512_oneliner_noheader.fasta

[33m[WARN][0m you may switch on flag -g/--remove-gaps to remove spaces
[33m[WARN][0m you may switch on flag -g/--remove-gaps to remove spaces


Preprocessing

In [None]:
# Load, shuffle and reformat the filtered sequences, then tokenize them in batches.
# The "text" and "Seqs" columns are dropped from the tokenized result.
uniref50_proccessed = preprocess_dataset(UNIREF50_PATH)
uniref50_tokenized = uniref50_proccessed.map(
    tokenize_function,
    batched=True,
    remove_columns=["text", "Seqs"],
)

Save

In [None]:
# exist_ok=True creates SAVE_PATH when it is missing and does nothing when it already exists.
os.makedirs(SAVE_PATH, exist_ok=True)

# Persist the tokenized dataset so that later sessions can reload it from SAVE_PATH.
uniref50_tokenized.save_to_disk(SAVE_PATH)

Load

In [None]:
# Reload the dataset stored under SAVE_PATH and print it as a quick check.
uniref50 = load_from_disk(SAVE_PATH)
print(uniref50)