TEST TOKENIZER

In [1]:
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import qualified Data.ByteString.Lazy.Char8 as BL
import qualified Data.ByteString.UTF8 as BSU
import Data.Aeson (decode)
import Data.Word
import qualified Data.Map as Map
import Data.Maybe (fromMaybe)
import System.IO
import Data.List.Split (splitOn)
import Distribution.Simple
import qualified Data.Text as T
import qualified Data.Text.IO as TIO
import Data.Char (chr, ord)
import Data.Word (Word8)

-- | Vocabulary table: maps a token string to its integer id.
type CharMap = Map.Map String Int

In [2]:
-- | Read a JSON file lazily and try to decode it as a 'CharMap'.
--
-- Yields 'Nothing' when 'decode' rejects the content.
loadJSON :: FilePath -> IO (Maybe CharMap)
loadJSON filePath = fmap decode (B.readFile filePath)
  
-- | Read a text file and turn each line into a pair by splitting it on
-- @delimiter@.
--
-- Only lines that split into exactly two parts produce a pair. Any other
-- line (blank line, missing delimiter, more than one delimiter) is skipped.
-- The previous version matched on @[first, second]@ with no fallback branch,
-- so the first such line raised a pattern-match failure at run time.
readFileToPairs :: FilePath -> String -> IO [(String, String)]
readFileToPairs filePath delimiter = do
    content <- readFile filePath
    -- A generator pattern that fails to match simply drops that element,
    -- which is what filters out the malformed lines here.
    return
        [ (first, second)
        | [first, second] <- map (splitOn delimiter) (lines content)
        ]
      
-- Load the vocabulary (Nothing when the JSON cannot be decoded) and the merge
-- rules, one space-separated pair per line of merges.txt.
maybeVocab <- loadJSON "vocab.json"
pairs <- readFileToPairs "merges.txt" " "

-- | Unwrap an optionally loaded vocabulary.
--
-- When no vocabulary is available, an error line is printed and an empty
-- map is returned instead of failing.
getVocab :: Maybe CharMap -> IO CharMap
getVocab (Just vocab) = return vocab
getVocab Nothing =
    putStrLn "Erreur: vocab non chargé" >> return Map.empty
    
-- Vocabulary used by the later cells; empty if loading failed above.
fullVocab <- getVocab maybeVocab



In [3]:
-- | Read a file strictly as raw bytes and convert them to a 'String' with
-- 'BSU.toString'.
toByte :: FilePath -> IO String
toByte path = BSU.toString <$> BS.readFile path
    
-- Manual check: decode the sample file and print it next to the expected text.
resultToByte <- toByte "testTexte.txt"

-- The decoded text is bound as `resultToByte`; the cell previously printed
-- `result`, a name that is not bound anywhere before this point.
putStrLn ("result: " ++ resultToByte)
putStrLn "expect: Lights flicker. She waits. Time forgets to move."


: 

In [4]:
-- | Replace every ASCII space in the input with the marker character "Ġ".
--
-- The work is done on 'T.Text' and converted back to 'String' at the end.
replaceSpace :: String -> String
replaceSpace = T.unpack . T.replace blank marker . T.pack
  where
    blank  = T.pack " "
    marker = T.pack "Ġ"

-- Manual check: the first line printed should equal the second one, i.e. the
-- sentence with every space replaced by "Ġ".
putStrLn ("result: " ++ replaceSpace "Lights flicker. She waits. Time forgets to move.")
putStrLn "expect: LightsĠflicker.ĠSheĠwaits.ĠTimeĠforgetsĠtoĠmove."

result: LightsĠflicker.ĠSheĠwaits.ĠTimeĠforgetsĠtoĠmove.

expect: LightsĠflicker.ĠSheĠwaits.ĠTimeĠforgetsĠtoĠmove.

In [5]:
-- | Split a string into a list of one-character strings, one per character.
makeStrArray :: String -> [String]
makeStrArray text = [[c] | c <- text]

-- Manual check: the sample string should come back as one single-character
-- string per character, spaces and punctuation included.
putStrLn "result: "
print (makeStrArray "Hi, lets tokenize")
putStrLn "expect: "
print (["H","i",","," ","l","e","t","s"," ","t","o","k","e","n","i","z","e"])

result:

["H","i",","," ","l","e","t","s"," ","t","o","k","e","n","i","z","e"]

expect:

["H","i",","," ","l","e","t","s"," ","t","o","k","e","n","i","z","e"]

In [6]:
-- | Apply one merge rule in a single left-to-right pass.
--
-- Whenever two adjacent tokens equal the rule's left and right parts, they
-- are replaced by their concatenation and scanning resumes after them, so
-- matches never overlap. Lists with fewer than two tokens come back unchanged.
merge :: (String, String) -> [String] -> [String]
merge rule@(left, right) tokens =
    case tokens of
        t1 : t2 : rest
            | t1 == left && t2 == right -> (left ++ right) : merge rule rest
            | otherwise                 -> t1 : merge rule (t2 : rest)
        _ -> tokens
  
-- Manual checks for `merge`: each group prints the actual result followed by
-- the expected list, so the two printed lists should be identical.

-- Single match at the start of the list.
putStrLn "result: "
print (merge ("a", "b") ["a", "b", "c"])
putStrLn "expect: "
print ["ab", "c"]

-- The same pair occurs twice; both occurrences are merged.
putStrLn "result: "
print (merge ("a", "b") ["a", "b", "a", "b", "d"])
putStrLn "expect: "
print ["ab", "ab", "d"]

-- The rule does not occur: the list is unchanged.
putStrLn "result: "
print (merge ("x", "y") ["a", "b", "c"])
putStrLn "expect: "
print ["a", "b", "c"]


-- Single-element list: nothing to merge.
putStrLn "result: "
print (merge ("a", "b") ["a"])
putStrLn "expect: "
print ["a"]


-- Empty list.
putStrLn "result: "
print (merge ("a", "b") [])
putStrLn "expect: "
print []

-- After a merge, the token that follows is not merged again.
putStrLn "result: "
print (merge ("a", "b") ["a", "b", "b", "c"])
putStrLn "expect: "
print ["ab", "b", "c"]




result:

["ab","c"]

expect:

["ab","c"]

result:

["ab","ab","d"]

expect:

["ab","ab","d"]

result:

["a","b","c"]

expect:

["a","b","c"]

result:

["a"]

expect:

["a"]

result:

[]

expect:

[]

result:

["ab","b","c"]

expect:

["ab","b","c"]

In [7]:
-- | Apply a list of merge rules to a token list, one rule after another in
-- list order.
--
-- Each rule gets exactly one pass ('merge') over the output of the previous
-- rule, so a rule cannot use a token that only a later rule would create.
merges :: [(String, String)] -> [String] -> [String]
merges rules tokens = foldl (flip merge) tokens rules

-- Manual checks for `merges`: each test prints its title, then the actual
-- result, then the expected list; the two lists should be identical.

-- One rule, one match.
putStrLn "\nTest 1 – fusion simple"
print (merges [("a", "b")] ["a", "b", "c"])
print ["ab", "c"]

-- One rule matching twice.
putStrLn "\nTest 2 – fusion multiple identique"
print (merges [("a", "b")] ["a", "b", "a", "b", "d"])
print ["ab", "ab", "d"]

-- Rule that never matches.
putStrLn "\nTest 3 – aucune fusion possible"
print (merges [("x", "y")] ["a", "b", "c"])
print ["a", "b", "c"]

-- Single-element token list.
putStrLn "\nTest 4 – liste avec un seul élément"
print (merges [("a", "b")] ["a"])
print ["a"]

-- Empty token list.
putStrLn "\nTest 5 – liste vide"
print (merges [("a", "b")] [])
print []

-- Merge followed by tokens that must stay as they are.
putStrLn "\nTest 6 – fusion + éléments restants"
print (merges [("a", "b")] ["a", "b", "b", "c"])
print ["ab", "b", "c"]

-- The second rule consumes the token produced by the first.
putStrLn "\nTest 7 – deux règles qui s’enchaînent"
print (merges [("a", "b"), ("ab", "c")] ["a", "b", "c", "d"])
print ["abc", "d"]

-- Same rules in reverse order: the first rule finds nothing to merge.
putStrLn "\nTest 8 – ordre des règles inversé"
print (merges [("ab", "c"), ("a", "b")] ["a", "b", "c", "d"])
print ["ab", "c", "d"] -- because "ab" does not exist yet during the first pass

-- Several rules building on each other.
putStrLn "\nTest 9 – fusion récursive sur plusieurs niveaux"
print (merges [("a", "b"), ("b", "c"), ("ab", "c"), ("abc", "d")] ["a", "b", "c"])
print ["abc"] -- or ["a", "bc"] if some rules are badly ordered

-- Chain of rules that rebuilds a whole word.
putStrLn "\nTest 10 – chaîne très longue"
print (merges [("h", "e"), ("he", "l"), ("hel", "l"), ("hell", "o")] ["h", "e", "l", "l", "o"])
print ["hello"]

-- Only one of the two rules matches.
putStrLn "\nTest 11 – fusion partielle seulement"
print (merges [("a", "b"), ("x", "y")] ["a", "b", "x", "z"])
print ["ab", "x", "z"]

-- No rules at all: the tokens are returned unchanged.
putStrLn "\nTest 12 – fusion qui ne change rien"
print (merges [] ["a", "b", "c"])
print ["a", "b", "c"]


Test 1 – fusion simple

["ab","c"]

["ab","c"]


Test 2 – fusion multiple identique

["ab","ab","d"]

["ab","ab","d"]


Test 3 – aucune fusion possible

["a","b","c"]

["a","b","c"]


Test 4 – liste avec un seul élément

["a"]

["a"]


Test 5 – liste vide

[]

[]


Test 6 – fusion + éléments restants

["ab","b","c"]

["ab","b","c"]


Test 7 – deux règles qui s’enchaînent

["abc","d"]

["abc","d"]


Test 8 – ordre des règles inversé

["ab","c","d"]

["ab","c","d"]


Test 9 – fusion récursive sur plusieurs niveaux

["abc"]

["abc"]


Test 10 – chaîne très longue

["hello"]

["hello"]


Test 11 – fusion partielle seulement

["ab","x","z"]

["ab","x","z"]


Test 12 – fusion qui ne change rien

["a","b","c"]

["a","b","c"]

In [12]:
import qualified Data.Map as Map

-- Types
-- | Vocabulary table: maps a token string to its integer id.
-- Repeats the alias already declared in the first cell.
type CharMap = Map.Map String Int

-- Function to build a simple test vocabulary (no definition follows in this cell)


-- YOUR ORIGINAL FUNCTIONS (exactly as you wrote them)
-- | Translate tokens into ids by looking each one up in the global
-- @fullVocab@.
--
-- A token missing from the vocabulary is mapped to 50257.
-- NOTE(review): 50257 looks like an out-of-vocabulary sentinel; confirm
-- against the vocabulary size.
changeToIndex :: [String] -> [Int]
changeToIndex tokens =
    [Map.findWithDefault unknownId token fullVocab | token <- tokens]
  where
    unknownId = 50257

-- | Invert a vocabulary so that ids map back to token strings.
--
-- Pairs are taken in ascending token order and rebuilt with 'Map.fromList',
-- so when several tokens share an id the last one in that order is kept.
reverseMap :: CharMap -> Map.Map Int String
reverseMap vocabMap =
    Map.fromList [(tokenId, token) | (token, tokenId) <- Map.toList vocabMap]

-- | Turn a list of token ids back into text.
--
-- Each id is looked up in the inverted vocabulary (an unknown id yields "?"),
-- the token strings are concatenated, and every "Ġ" marker becomes a space.
untokenizer :: [Int] -> CharMap -> String
untokenizer tokensId strKey =
    T.unpack (T.replace (T.pack "Ġ") (T.pack " ") joined)
  where
    idToToken           = reverseMap strKey
    lookupToken tokenId = Map.findWithDefault "?" tokenId idToToken
    joined              = T.pack (concatMap lookupToken tokensId)

-- | Trivial tokenizer: the whole input string becomes one single token.
simpleTokenize :: String -> [String]
simpleTokenize = (: [])

-- Tests for different languages

-- | Character-level round trip on an English sample: mark spaces, split into
-- characters, map to ids, rebuild the text, and print every stage together
-- with whether the rebuilt text equals the input.
testEnglish :: IO ()
testEnglish =
    mapM_ putStrLn
        [ "=== TEST ANGLAIS ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Token IDs: " ++ show ids
        , "Reconstructed: " ++ show rebuilt
        , "Match: " ++ show (sample == rebuilt)
        , ""
        ]
  where
    sample  = "hello world"
    pieces  = makeStrArray (replaceSpace sample)
    ids     = changeToIndex pieces
    rebuilt = untokenizer ids fullVocab

-- | Character-level round trip on a French sample; prints every stage and
-- whether the rebuilt text equals the input.
testFrench :: IO ()
testFrench =
    mapM_ putStrLn
        [ "=== TEST FRANÇAIS ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Token IDs: " ++ show ids
        , "Reconstructed: " ++ show rebuilt
        , "Match: " ++ show (sample == rebuilt)
        , ""
        ]
  where
    sample  = "bonjour le monde"
    pieces  = makeStrArray (replaceSpace sample)
    ids     = changeToIndex pieces
    rebuilt = untokenizer ids fullVocab

-- | Character-level round trip on a Chinese sample; prints every stage and
-- whether the rebuilt text equals the input.
testChinese :: IO ()
testChinese =
    mapM_ putStrLn
        [ "=== TEST CHINOIS ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Token IDs: " ++ show ids
        , "Reconstructed: " ++ show rebuilt
        , "Match: " ++ show (sample == rebuilt)
        , ""
        ]
  where
    sample  = "你好 世界"
    pieces  = makeStrArray (replaceSpace sample)
    ids     = changeToIndex pieces
    rebuilt = untokenizer ids fullVocab

-- | Character-level round trip on a sample mixing English, French and
-- Chinese; prints every stage and whether the rebuilt text equals the input.
testMixed :: IO ()
testMixed =
    mapM_ putStrLn
        [ "=== TEST MIXTE ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Token IDs: " ++ show ids
        , "Reconstructed: " ++ show rebuilt
        , "Match: " ++ show (sample == rebuilt)
        , ""
        ]
  where
    sample  = "hello bonjour 你好"
    pieces  = makeStrArray (replaceSpace sample)
    ids     = changeToIndex pieces
    rebuilt = untokenizer ids fullVocab
    
-- | Character-level round trip on accented letters and "&"; prints every
-- stage and whether the rebuilt text equals the input.
testChar :: IO ()
testChar =
    mapM_ putStrLn
        [ "=== TEST SPECIAL CHARACTER ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Token IDs: " ++ show ids
        , "Reconstructed: " ++ show rebuilt
        , "Match: " ++ show (sample == rebuilt)
        , ""
        ]
  where
    sample  = "é & à è ê â û"
    pieces  = makeStrArray (replaceSpace sample)
    ids     = changeToIndex pieces
    rebuilt = untokenizer ids fullVocab


-- | Compact round-trip report: for each sample text, print one line showing
-- the original, the text rebuilt from its ids, and a check mark or cross
-- depending on whether the two are equal.
testRoundTrip :: IO ()
testRoundTrip = do
    putStrLn "=== TEST ROUND-TRIP (aller-retour) ==="
    mapM_ (putStrLn . report) samples
    putStrLn ""
  where
    samples =
        [ "hello world"
        , "bonjour monde"
        , "你好 世界"
        , "hello bonjour 你好"
        , "é & à è ê â û"
        ]
    -- Build the one-line report for a single sample.
    report sample = sample ++ " -> " ++ rebuilt ++ " [" ++ verdict ++ "]"
      where
        ids     = changeToIndex (makeStrArray (replaceSpace sample))
        rebuilt = untokenizer ids fullVocab
        verdict
            | sample == rebuilt = "✓"
            | otherwise         = "✗"

-- Main test entry point
-- | Print the banner, then run every round-trip test of this cell in order.
main :: IO ()
main = do
    mapM_ putStrLn
        [ "TESTS DE TOKENIZATION MULTILINGUE"
        , "=================================="
        , ""
        ]
    sequence_
        [ testEnglish
        , testFrench
        , testChinese
        , testMixed
        , testChar
        , testRoundTrip
        ]


-- Run the tests defined in this cell (top-level notebook statement).
main

TESTS DE TOKENIZATION MULTILINGUE

=== TEST ANGLAIS ===
Texte original: "hello world"
Tokens: ["h","e","l","l","o","\288","w","o","r","l","d"]
Token IDs: [71,68,75,75,78,220,86,78,81,75,67]
Reconstructed: "hello world"
Match: True

=== TEST FRANÇAIS ===
Texte original: "bonjour le monde"
Tokens: ["b","o","n","j","o","u","r","\288","l","e","\288","m","o","n","d","e"]
Token IDs: [65,78,77,73,78,84,81,220,75,68,220,76,78,77,67,68]
Reconstructed: "bonjour le monde"
Match: True

=== TEST CHINOIS ===
Texte original: "\20320\22909 \19990\30028"
Tokens: ["\20320","\22909","\288","\19990","\30028"]
Token IDs: [50257,50257,220,50257,50257]
Reconstructed: "smileysmiley smileysmiley"
Match: False

=== TEST MIXTE ===
Texte original: "hello bonjour \20320\22909"
Tokens: ["h","e","l","l","o","\288","b","o","n","j","o","u","r","\288","\20320","\22909"]
Token IDs: [71,68,75,75,78,220,65,78,77,73,78,84,81,220,50257,50257]
Reconstructed: "hello bonjour smileysmiley"
Match: False

=== TEST SPECIAL CHARACT

In [24]:
import qualified Data.Map as Map

-- Types
-- | Vocabulary table: maps a token string to its integer id.
-- Repeats the alias already declared in the first cell.
type CharMap = Map.Map String Int


-- Tests for different languages

-- | Run the merge rules on an English sample and compare the merged tokens
-- with a hard-coded reference list (printed under the "Token GPT2" label).
testEnglish :: IO ()
testEnglish =
    mapM_ putStrLn
        [ "=== TEST ANGLAIS ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Merged: " ++ show fused
        , "Token IDs: " ++ show (changeToIndex fused)
        , "Token GPT2: " ++ show reference
        , "Match: " ++ show (fused == reference)
        , ""
        ]
  where
    sample    = "hello world"
    pieces    = makeStrArray (replaceSpace sample)
    fused     = merges pairs pieces
    reference = ["hello", "Ġworld"]

-- | Run the merge rules on a French sample and compare the merged tokens
-- with a hard-coded reference list (printed under the "Token GPT2" label).
testFrench :: IO ()
testFrench =
    mapM_ putStrLn
        [ "=== TEST FRANÇAIS ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Merged: " ++ show fused
        , "Token IDs: " ++ show (changeToIndex fused)
        , "Token GPT2: " ++ show reference
        , "Match: " ++ show (fused == reference)
        , ""
        ]
  where
    sample    = "bonjour le monde"
    pieces    = makeStrArray (replaceSpace sample)
    fused     = merges pairs pieces
    reference = ["bon", "j", "our", "Ġle", "Ġm", "onde"]

-- | Run the merge rules on a Chinese sample and compare the merged tokens
-- with a hard-coded reference list (printed under the "Token GPT2" label).
testChinese :: IO ()
testChinese =
    mapM_ putStrLn
        [ "=== TEST CHINOIS ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Merged: " ++ show fused
        , "Token IDs: " ++ show (changeToIndex fused)
        , "Token GPT2: " ++ show reference
        , "Match: " ++ show (fused == reference)
        , ""
        ]
  where
    sample    = "你好 世界"
    pieces    = makeStrArray (replaceSpace sample)
    fused     = merges pairs pieces
    reference = ["ä½", "ł", "å¥", "½", "Ġ", "ä¸", "ĸ", "çķ", "Į"]

-- | Run the merge rules on a sample mixing English, French and Chinese and
-- compare the merged tokens with a hard-coded reference list (printed under
-- the "Token GPT2" label).
testMixed :: IO ()
testMixed =
    mapM_ putStrLn
        [ "=== TEST MIXTE ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Merged: " ++ show fused
        , "Token IDs: " ++ show (changeToIndex fused)
        , "Token GPT2: " ++ show reference
        , "Match: " ++ show (fused == reference)
        , ""
        ]
  where
    sample    = "hello bonjour 你好"
    pieces    = makeStrArray (replaceSpace sample)
    fused     = merges pairs pieces
    reference = ["hello", "Ġbon", "j", "our", "Ġ", "ä½", "ł", "å¥", "½"]
    
-- | Run the merge rules on accented letters and "&" and compare the merged
-- tokens with a hard-coded reference list (printed under the "Token GPT2"
-- label).
testChar :: IO ()
testChar =
    mapM_ putStrLn
        [ "=== TEST SPECIAL CHARACTER ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Merged: " ++ show fused
        , "Token IDs: " ++ show (changeToIndex fused)
        , "Token GPT2: " ++ show reference
        , "Match: " ++ show (fused == reference)
        , ""
        ]
  where
    sample    = "é & à è ê â û"
    pieces    = makeStrArray (replaceSpace sample)
    fused     = merges pairs pieces
    reference = ["Ã©", "Ġ&", "ĠÃł", "ĠÃ", "¨", "ĠÃ", "ª", "ĠÃ", "¢", "ĠÃ", "»"]

-- | Run the merge rules on a long French paragraph with accents and
-- typographic apostrophes, and compare the merged tokens with a hard-coded
-- reference list (printed under the "Token GPT2" label).
testLongAccent :: IO ()
testLongAccent =
    mapM_ putStrLn
        [ "=== TEST LONG TEXTE ==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Merged: " ++ show fused
        , "Token IDs: " ++ show (changeToIndex fused)
        , "Token GPT2: " ++ show reference
        , "Match: " ++ show (fused == reference)
        , ""
        ]
  where
    sample = "Ce matin, j’ai été à l’école avec Élodie. Il faisait beau et le ciel était d’un bleu profond. À la pause, nous avons bu un café très fort. C’était vraiment une belle journée d’été!"
    pieces = makeStrArray (replaceSpace sample)
    fused  = merges pairs pieces
    -- NOTE(review): the "j" that follows "Ġbel", "le" has no leading "Ġ",
    -- unlike every other word-initial token here and unlike "Ġjour" in the
    -- accent-free test; possibly a typo in the reference data, please confirm.
    reference = ["C", "e", "Ġmat", "in", ",", "Ġj", "âĢ", "Ļ", "ai", "Ġ", "Ã©t", "Ã©", "ĠÃł", "Ġl", "âĢ", "Ļ", "Ã©", "co", "le", "Ġa", "vec", "ĠÃī", "l", "od", "ie", ".", "ĠIl", "Ġf", "ais", "ait", "Ġbe", "au", "Ġet", "Ġle", "Ġc", "iel", "Ġ", "Ã©t", "ait", "Ġd", "âĢ", "Ļ", "un", "Ġble", "u", "Ġprof", "ond", ".", "ĠÃ", "Ģ", "Ġla", "Ġpause", ",", "Ġn", "ous", "Ġav", "ons", "Ġbu", "Ġun", "ĠcafÃ©", "Ġtr", "Ã¨", "s", "Ġfort", ".", "ĠC", "âĢ", "Ļ", "Ã©t", "ait", "Ġv", "ra", "iment", "Ġune", "Ġbel", "le", "j", "ourn", "Ã©e", "Ġd", "âĢ", "Ļ", "Ã©t", "Ã©", "!"]
    
-- | Run the merge rules on the accent-free version of the long French
-- paragraph and compare the merged tokens with a hard-coded reference list
-- (printed under the "Token GPT2" label).
testLong :: IO ()
testLong =
    mapM_ putStrLn
        [ "=== TEST LONG TEXTE SANS ACCENT==="
        , "Texte original: " ++ show sample
        , "Tokens: " ++ show pieces
        , "Merged: " ++ show fused
        , "Token IDs: " ++ show (changeToIndex fused)
        , "Token GPT2: " ++ show reference
        , "Match: " ++ show (fused == reference)
        , ""
        ]
  where
    sample = "Ce matin, j'ai ete a l'ecole avec Elodie. Il faisait beau et le ciel etait d'un bleu profond. A la pause, nous avons bu un cafe tres fort. C'etait vraiment une belle journee d'ete!"
    pieces = makeStrArray (replaceSpace sample)
    fused  = merges pairs pieces
    reference = ["C", "e", "Ġmat", "in", ",", "Ġj", "'", "ai", "Ġe", "te", "Ġa", "Ġl", "'", "ec", "ole", "Ġa", "vec", "ĠEl", "od", "ie", ".", "ĠIl", "Ġf", "ais", "ait", "Ġbe", "au", "Ġet", "Ġle", "Ġc", "iel", "Ġet", "ait", "Ġd", "'", "un", "Ġble", "u", "Ġprof", "ond", ".", "ĠA", "Ġla", "Ġpause", ",", "Ġn", "ous", "Ġav", "ons", "Ġbu", "Ġun", "Ġcafe", "Ġt", "res", "Ġfort", ".", "ĠC", "'", "et", "ait", "Ġv", "ra", "iment", "Ġune", "Ġbel", "le", "Ġjour", "nee", "Ġd", "'", "ete", "!"]
    
    
-- Main test entry point
-- | Print the banner, then run every merge test of this cell in order,
-- followed by @testRoundTrip@ from the earlier round-trip cell.
main :: IO ()
main = do
    mapM_ putStrLn
        [ "TESTS DE TOKENIZATION MULTILINGUE"
        , "=================================="
        , ""
        ]
    sequence_
        [ testEnglish
        , testFrench
        , testChinese
        , testMixed
        , testChar
        , testLongAccent
        , testLong
        , testRoundTrip
        ]


-- Run the tests defined in this cell (top-level notebook statement).
main

TESTS DE TOKENIZATION MULTILINGUE

=== TEST ANGLAIS ===
Texte original: "hello world"
Tokens: ["h","e","l","l","o","\288","w","o","r","l","d"]
Merged: ["hello","\288world"]
Token IDs: [31373,995]
Token GPT2: ["hello","\288world"]
Match: True

=== TEST FRANÇAIS ===
Texte original: "bonjour le monde"
Tokens: ["b","o","n","j","o","u","r","\288","l","e","\288","m","o","n","d","e"]
Merged: ["bon","j","our","\288le","\288m","onde"]
Token IDs: [4189,73,454,443,285,14378]
Token GPT2: ["bon","j","our","\288le","\288m","onde"]
Match: True

=== TEST CHINOIS ===
Texte original: "\20320\22909 \19990\30028"
Tokens: ["\20320","\22909","\288","\19990","\30028"]
Merged: ["\20320","\22909","\288","\19990","\30028"]
Token IDs: [50257,50257,220,50257,50257]
Token GPT2: ["\228\189","\322","\229\165","\189","\288","\228\184","\312","\231\311","\302"]
Match: False

=== TEST MIXTE ===
Texte original: "hello bonjour \20320\22909"
Tokens: ["h","e","l","l","o","\288","b","o","n","j","o","u","r","\288","\20320","