From 312193ac757d7382577ffe05efe3c93195685134 Mon Sep 17 00:00:00 2001 From: Maike Reis Date: Wed, 5 Feb 2025 18:46:55 -0300 Subject: [PATCH] Fix tokenizer files search on Hugging Face --- unsloth/models/loader.py | 46 ++++++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/unsloth/models/loader.py b/unsloth/models/loader.py index e9caad0e60..183cf6511f 100644 --- a/unsloth/models/loader.py +++ b/unsloth/models/loader.py @@ -245,15 +245,22 @@ def from_pretrained( ) pass - # Check if this is local model since the tokenizer gets overwritten - if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \ - os.path.exists(os.path.join(old_model_name, "tokenizer.json")) and \ - os.path.exists(os.path.join(old_model_name, "special_tokens_map.json")): - - tokenizer_name = old_model_name + # Check if this is a local model or on Hugging Face + if os.path.isdir(old_model_name): # Check if old_model_name is a local directory + # Check for tokenizer files locally + tokenizer_files_exist = all( + os.path.exists(os.path.join(old_model_name, file)) + for file in ["tokenizer_config.json", "tokenizer.json", "special_tokens_map.json"] + ) + tokenizer_name = old_model_name if tokenizer_files_exist else None else: - tokenizer_name = None - pass + # Check for tokenizer files on Hugging Face + fs = HfFileSystem(token=token) + tokenizer_files_exist = all( + fs.exists(f"{old_model_name}/{file}") + for file in ["tokenizer_config.json", "tokenizer.json", "special_tokens_map.json"] + ) + tokenizer_name = old_model_name if tokenizer_files_exist else None model, tokenizer = dispatch_model.from_pretrained( model_name = model_name, @@ -500,15 +507,22 @@ def from_pretrained( pass if do_logging: redirector.close() - # Check if this is local model since the tokenizer gets overwritten - if os.path.exists(os.path.join(old_model_name, "tokenizer_config.json")) and \ - os.path.exists(os.path.join(old_model_name, "tokenizer.json")) and \ - 
os.path.exists(os.path.join(old_model_name, "special_tokens_map.json")): - - tokenizer_name = old_model_name + # Check if this is a local model or on Hugging Face + if os.path.isdir(old_model_name): # Check if old_model_name is a local directory + # Check for tokenizer files locally + tokenizer_files_exist = all( + os.path.exists(os.path.join(old_model_name, file)) + for file in ["tokenizer_config.json", "tokenizer.json", "special_tokens_map.json"] + ) + tokenizer_name = old_model_name if tokenizer_files_exist else None else: - tokenizer_name = None - pass + # Check for tokenizer files on Hugging Face + fs = HfFileSystem(token=token) + tokenizer_files_exist = all( + fs.exists(f"{old_model_name}/{file}") + for file in ["tokenizer_config.json", "tokenizer.json", "special_tokens_map.json"] + ) + tokenizer_name = old_model_name if tokenizer_files_exist else None model, tokenizer = FastBaseVisionModel.from_pretrained( model_name = model_name,