In [2]:
from datasets import load_dataset 

In [3]:
# Load the "laion/biorXiv_metadata" dataset and print its structure
# (available splits, column names and row counts).
ds = load_dataset("laion/biorXiv_metadata")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['doi', 'title', 'authors', 'author_corresponding', 'author_corresponding_institution', 'date', 'version', 'type', 'license', 'category', 'jatsxml', 'abstract', 'published', 'server'],
        num_rows: 353648
    })
})


In [7]:
# The dataset has a single split, "train"; keep a direct handle to it.
train = ds["train"]
# Bare name as the last expression so the notebook displays the split summary.
train

Dataset({
    features: ['doi', 'title', 'authors', 'author_corresponding', 'author_corresponding_institution', 'date', 'version', 'type', 'license', 'category', 'jatsxml', 'abstract', 'published', 'server'],
    num_rows: 353648
})

In [39]:
import re

# Characters that survive normalize(). Built once at definition time rather
# than on every call, since normalize() is meant to run over many titles.
_NORMALIZE_ALLOWED_CHARS = frozenset(" abcdefghijklmnopqrstuvwxyz?<>!.,:;()-+=&$*%^@1234567890/–'\"")


def normalize(x: str) -> str:
	"""Lowercase ``x`` and drop every character outside a fixed whitelist.

	Steps, in order:
	  1. Each whitespace character matched by the regex ``\\s`` (tab, newline,
	     non-breaking space, ...) is replaced by one plain space. The
	     replacement is one-for-one, so runs of whitespace are NOT collapsed.
	  2. The text is lowercased.
	  3. Characters not in ``_NORMALIZE_ALLOWED_CHARS`` are removed outright
	     (not replaced), so e.g. accented or Greek letters simply vanish.

	Args:
		x: Text to normalize.

	Returns:
		The normalized text; may be shorter than ``x`` and may be empty.
	"""
	lowered = re.sub(r'\s', ' ', x).lower()
	# join() over a generator avoids repeated string re-allocation from "+=".
	return "".join(c for c in lowered if c in _NORMALIZE_ALLOWED_CHARS)

In [40]:
# Slice the first ten rows once and reuse the result for both views,
# instead of materialising the same slice of the dataset twice.
sample_titles = train[:10]["title"]
print(sample_titles)
# Last expression: the same titles after normalize(), for side-by-side comparison.
[normalize(title) for title in sample_titles]

['Population genomics of Saccharomyces cerevisiae human isolates: passengers, colonizers, invaders.', 'Estimating seed bank accumulation and dynamics in three obligate-seeder Proteaceae species', 'How and where to look for tRNAs in Metazoan mitochondrial genomes, and what you might find when you get there', 'How and where to look for tRNAs in Metazoan mitochondrial genomes, and what you might find when you get there', 'Tracking global changes induced in the CD4 T cell receptor repertoire by immunization with a complex antigen using short stretches of CDR3 protein sequence.', 'The shrinking human protein coding complement: are there fewer than 20,000 genes?', 'Emergence of structural and dynamical properties of ecological mutualistic networks', 'Expertly validated models suggest responses to climate change are related to species traits: a phylogenetically-controlled analysis of the Order Lagomorpha', 'Expertly validated models suggest responses to climate change are related to species t

['population genomics of saccharomyces cerevisiae human isolates: passengers, colonizers, invaders.',
 'estimating seed bank accumulation and dynamics in three obligate-seeder proteaceae species',
 'how and where to look for trnas in metazoan mitochondrial genomes, and what you might find when you get there',
 'how and where to look for trnas in metazoan mitochondrial genomes, and what you might find when you get there',
 'tracking global changes induced in the cd4 t cell receptor repertoire by immunization with a complex antigen using short stretches of cdr3 protein sequence.',
 'the shrinking human protein coding complement: are there fewer than 20,000 genes?',
 'emergence of structural and dynamical properties of ecological mutualistic networks',
 'expertly validated models suggest responses to climate change are related to species traits: a phylogenetically-controlled analysis of the order lagomorpha',
 'expertly validated models suggest responses to climate change are related to s

In [45]:
from tqdm import tqdm
# Collect every distinct character that occurs in any title.
# Only the "title" column is read: iterating over `train` itself would build a
# full row dict (all 14 columns) for each of the rows just to use one field.
unique_chars = set()
for title in tqdm(train["title"]):
	# set.update() adds each character of the string in one call.
	unique_chars.update(title)

100%|██████████| 353648/353648 [00:49<00:00, 7141.56it/s]


In [46]:
# Display every distinct character collected from the titles.
unique_chars

{'\t',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 '\xa0',
 '®',
 '°',
 '´',
 '·',
 'Å',
 'Ô',
 '×',
 'á',
 'æ',
 'ç',
 'è',
 'é',
 'í',
 'ï',
 'ô',
 'ö',
 'ü',
 'ı',
 'ĸ',
 'Ō',
 'ɛ',
 'ʟ',
 'ʹ',
 '̈',
 'Β',
 'Γ',
 'Δ',
 'Ρ',
 'Τ',
 'Φ',
 'α',
 'β',
 'γ',
 'δ',
 'ε',
 'ζ',
 'η',
 'θ',
 'κ',
 'μ',
 'π',
 'σ',
 'φ',
 'χ',
 'ω',
 'ϵ',
 'қ',
 '\u200b',
 '\u200e',
 '‐',
 '‑',
 '–',
 '—',
 '‘',
 '’',
 '“',
 '”',
 '…',
 '\u2029',
 '′',
 '″',
 '℃',
 '™',
 '→',
 '∆',
 '−',
 '

In [65]:
def tokenizer_encode(strings: list[str], alphabet: str):
	"""Turn each string into a list of integer token ids, one id per character.

	Ids 0 and 1 are reserved for the "<s>" (start) and "<e>" (end) markers;
	the character at position ``i`` of ``alphabet`` receives id ``i + 2``.

	Args:
		strings: Texts to encode.
		alphabet: Characters that may appear in ``strings``. Annotated as
			``str``, but it is only walked in order, so a list of
			single-character strings (as in the demo below) works too.

	Returns:
		One list of ids per input string, each starting with 0 and ending
		with 1. An empty string encodes to ``[0, 1]``.

	Raises:
		KeyError: If a string contains a character missing from ``alphabet``.
	"""
	vocab = {"<s>": 0, "<e>": 1}
	# Offset by 2 to leave room for the two marker ids above.
	vocab.update((symbol, position + 2) for position, symbol in enumerate(alphabet))

	start_id, end_id = 0, 1
	return [[start_id, *(vocab[ch] for ch in text), end_id] for text in strings]

def tokenizer_decode(idxs: list[list[int]], alphabet: str):
	"""Map lists of token ids back to strings.

	Id 0 decodes to the literal text "<s>", id 1 to "<e>", and id ``i + 2`` to
	the character at position ``i`` of ``alphabet``. The markers are kept in
	the output rather than stripped.

	Args:
		idxs: One list of token ids per string to reconstruct.
		alphabet: The character table the ids refer to. Annotated as ``str``,
			but any iterable of characters (e.g. a list) is accepted.

	Returns:
		One decoded string per inner list; an empty inner list gives "".

	Raises:
		IndexError: If an id is negative or not below ``len(alphabet) + 2``.
	"""
	reverse_index_map = ["<s>", "<e>"]
	reverse_index_map.extend(alphabet)

	res = []
	for sub in idxs:
		pieces = []
		for i in sub:
			# A negative id would otherwise wrap around to the end of the table
			# and silently decode to the wrong character; reject it the same
			# way an id past the end of the table is rejected.
			if i < 0:
				raise IndexError(f"token id {i} is out of range")
			pieces.append(reverse_index_map[i])
		res.append("".join(pieces))
	return res

# Character vocabulary for the tokenizer: space, lowercase letters, a little
# punctuation, then digits. Position in this list decides the token id.
# NOTE(review): normalize() keeps characters that are absent here (e.g. "(",
# ")", "-", "/", "<", ">"), so encoding normalized titles with this alphabet
# would raise KeyError. Confirm which of the two sets is the intended one.
alphabet = list(
	" abcdefghijklmnopqrstuvwxyz"
	"?!.,:;&%"
	"1234567890"
)

# Round-trip check: encode two sample strings, show the ids, then decode them.
enc = tokenizer_encode(["hello there!", "what is up!"], alphabet)
print(enc)
tokenizer_decode(enc, alphabet)

[[0, 10, 7, 14, 14, 17, 2, 22, 10, 7, 20, 7, 30, 1], [0, 25, 10, 3, 22, 2, 11, 21, 2, 23, 18, 30, 1]]


['<s>hello there!<e>', '<s>what is up!<e>']