In [1]:
import os
import pickle
import numpy as np
import tiktoken

input_file_path = 'dataset/cmu_plots.txt'
output_dir = "dataset/processed"
os.makedirs(output_dir, exist_ok=True)

with open(input_file_path, 'r', encoding='utf-8') as f:
    text = f.read()

print(f"Duljina korpusa: {len(text):,} znakova")

Duljina korpusa: 75,507,588 znakova


In [2]:
# Skup svih jedinstvenih znakova
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(f"Vokabular: {vocab_size} znakova")

Vokabular: 587 znakova


In [3]:
chars = sorted(list(set(text)))
print(chars)

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\x92', '\xa0', '¡', '¢', '£', '¥', '¦', '«', '\xad', '¯', '°', '²', '´', '»', '¼', '½', '¿', 'À', 'Á', 'Â', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', 'Í', 'Î', 'Ñ', 'Ó', 'Ô', 'Ö', '×', 'Ø', 'Ú', 'Ü', 'Þ', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'å', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'õ', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'ý', 'Ā', 'ā', 'ă', 'ą', 'Ć', 'ć', 'Ċ', 'Č', 'č', 'Đ', 'đ', 'ē', 'ę', 'ě', 'ğ', 'ġ', 'Ħ', 'ĩ', 'ī', 'İ', 'ı', 'ĺ', 'Ł', 'ł', 'ń', 'ņ', 'ŋ', 'Ō', 'ō', 'ŏ', 'Ő', 'ő', 'œ'

In [4]:
# Analiza vokabulara
import string
import unicodedata

alphabetic = [c for c in chars if c.isalpha()]
numeric = [c for c in chars if c.isnumeric()]
punctuation = [c for c in chars if c in string.punctuation]
whitespace = [c for c in chars if c.isspace()]
other = [c for c in chars if not (c.isalpha() or c.isnumeric() or c in string.punctuation or c.isspace())]

print(f"Alfabetski znakovi: {len(alphabetic)} ({len(alphabetic)/len(chars)*100:.1f}%)")
print(f"Numerički znakovi: {len(numeric)} ({len(numeric)/len(chars)*100:.1f}%)")
print(f"Interpunkcija: {len(punctuation)} ({len(punctuation)/len(chars)*100:.1f}%)")
print(f"Razmaci: {len(whitespace)} ({len(whitespace)/len(chars)*100:.1f}%)")
print(f"Ostali znakovi: {len(other)} ({len(other)/len(chars)*100:.1f}%)")

print(f"\nPrvih 20 alfabetskih: {alphabetic[:20]}")
print(f"Svi numerički: {numeric}")
print(f"Prvih 20 interpunkcijskih: {punctuation[:20]}")
print(f"Svi razmaci (repr): {[repr(c) for c in whitespace]}")
print(f"Prvih 30 ostalih: {other[:30]}")

Alfabetski znakovi: 455 (77.5%)
Numerički znakovi: 18 (3.1%)
Interpunkcija: 32 (5.5%)
Razmaci: 4 (0.7%)
Ostali znakovi: 81 (13.8%)

Prvih 20 alfabetskih: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T']
Svi numerički: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '²', '¼', '½', '⅓', '⅞', '一', '三', '五']
Prvih 20 interpunkcijskih: ['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>']
Svi razmaci (repr): ["'\\n'", "' '", "'\\xa0'", "'\\u3000'"]
Prvih 30 ostalih: ['\x92', '¡', '¢', '£', '¥', '¦', '«', '\xad', '¯', '°', '´', '»', '¿', '×', '˝', '̧', 'ँ', 'ं', 'ा', 'ि', 'ी', 'ु', 'ू', 'े', 'ै', 'ो', '्', 'ি', 'ু', '্']


In [6]:
# Tokenizacija teksta
enc = tiktoken.get_encoding("gpt2")
ids = enc.encode(text)
print(f"Ukupno tokena: {len(ids):,}")

Ukupno tokena: 16,661,114


In [7]:
print(ids[:20])
print(enc.decode(ids[:20]))

[2484, 306, 21862, 11, 257, 1327, 12, 16090, 17536, 4639, 290, 9334, 418, 3099, 11, 257, 46909, 48982, 396, 11]
Shlykov, a hard-working taxi driver and Lyosha, a saxophonist,


In [8]:
print(f"Tiktoken GPT-2 vokabular: {enc.n_vocab:,} tokena")

# Najčešći tokeni
from collections import Counter
token_counts = Counter(ids)
most_common = token_counts.most_common(20)
least_common = token_counts.most_common()[-20:]

print("\nNajčešći tokeni:")
for token_id, count in most_common:
    token_str = enc.decode([token_id])
    print(f"Token {token_id}: '{token_str}' -> {count:,} puta")

print("\nNajrijeđi tokeni:")
for token_id, count in least_common:
    token_str = enc.decode([token_id])
    print(f"Token {token_id}: '{token_str}' -> {count:,} puta")


Tiktoken GPT-2 vokabular: 50,257 tokena

Najčešći tokeni:
Token 11: ',' -> 738,159 puta
Token 262: ' the' -> 736,100 puta
Token 13: '.' -> 655,506 puta
Token 284: ' to' -> 478,402 puta
Token 290: ' and' -> 455,337 puta
Token 257: ' a' -> 362,907 puta
Token 286: ' of' -> 261,135 puta
Token 318: ' is' -> 224,176 puta
Token 287: ' in' -> 201,670 puta
Token 465: ' his' -> 190,688 puta
Token 607: ' her' -> 148,273 puta
Token 338: ''s' -> 145,131 puta
Token 339: ' he' -> 140,012 puta
Token 326: ' that' -> 137,095 puta
Token 351: ' with' -> 134,778 puta
Token 683: ' him' -> 96,938 puta
Token 329: ' for' -> 93,884 puta
Token 416: ' by' -> 91,253 puta
Token 12: '-' -> 84,094 puta
Token 319: ' on' -> 77,868 puta

Najrijeđi tokeni:
Token 46968: ' convol' -> 1 puta
Token 30228: 'Personal' -> 1 puta
Token 39814: 'olulu' -> 1 puta
Token 43720: ' rand' -> 1 puta
Token 41570: 'forestation' -> 1 puta
Token 41752: ' PASS' -> 1 puta
Token 40744: 'Manchester' -> 1 puta
Token 29050: ' exported' -> 1 puta
T

In [9]:
n = len(ids)
train_ids = np.array(ids[:int(n * 0.9)], dtype=np.uint16)
val_ids = np.array(ids[int(n * 0.9):], dtype=np.uint16)

train_ids.tofile(os.path.join(output_dir, "train.bin"))
val_ids.tofile(os.path.join(output_dir, "val.bin"))

print("Tokenizacija završena i spremljena.")


Tokenizacija završena i spremljena.
