Test data files

wordslab-org · Apr 1, 2024 · 3d5c452 · 3d5c452
1 parent de4b905
commit 3d5c452
Show file tree

Hide file tree

Showing 8 changed files with 219 additions and 129 deletions.
diff --git a/_proc/index.ipynb b/_proc/index.ipynb
@@ -66,15 +66,18 @@
    },
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "wordslab_llms models\n"
-     ]
+     "data": {
+      "text/plain": [
+       "mistral_7b: mistralai/Mistral-7B-v0.1 => params: 7.3 B | disk: 13.49 GB | vram: 14.324 GB (16 bits)"
+      ]
+     },
+     "execution_count": null,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "test()"
+    "base_models['mistral_7b']"
    ]
   },
   {

diff --git a/nbs/00_models.ipynb b/nbs/00_models.ipynb
@@ -45,21 +45,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-04-01T10:36:32.293423Z",
-     "iopub.status.busy": "2024-04-01T10:36:32.281748Z",
-     "iopub.status.idle": "2024-04-01T10:36:32.319374Z",
-     "shell.execute_reply": "2024-04-01T10:36:32.318903Z",
-     "shell.execute_reply.started": "2024-04-01T10:36:32.293405Z"
-    },
-    "tags": []
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [],
    "source": [
     "#| export\n",
-    "import csv\n",
+    "import os, csv\n",
+    "from pathlib import Path\n",
     "from datetime import datetime, timedelta\n",
     "\n",
     "base_models = {}\n",
@@ -71,8 +63,11 @@
     "    def __repr__(self):\n",
     "        return f\"{self.name}: {self.huggingface_repo} => params: {self.params_B} B | disk: {self.disk_size_GB} GB | vram: {self.memory_size_MB/1000} GB ({self.precision} bits)\"\n",
     "\n",
-    "    def print_identification_properties(self):\n",
-    "        print(f\"{self.name}: {self.huggingface_repo}\")\n",
+    "    def print_name_and_url(self):\n",
+    "        print(f\"{self.name}: https://huggingface.co/{self.huggingface_repo}\")\n",
+    "    \n",
+    "    def print_model_card(self, print_name=True):\n",
+    "        if print_name: self.print_name_and_url()\n",
     "        if self.is_best_model:\n",
     "            print(f\"** Best model - languages performance: {self.languages_perf} **\")\n",
     "        if self.moe_activated_params_B == 0:\n",
@@ -86,17 +81,38 @@
     "        print(f\"- model weights license: {self.license}\")\n",
     "        print(f\"- publication date: {self.date}\")\n",
     "        \n",
-    "    def print_download_on_disk_properties(self):\n",
+    "    def print_download_properties(self, print_name=True):\n",
     "        print(f\"{self.name}: {self.huggingface_repo}\")\n",
+    "        if self.gated_access:\n",
+    "            print(f\"** WARNING - Gated access: you need to request access on the Huggingface website **\")\n",
+    "        print(f\"- huggingface repo: {self.huggingface_repo}\")\n",
+    "        if self.gated_access:\n",
+    "            print(f\"- huggingface read access token: mandatory\")\n",
+    "        print(f\"- disk size: {self.disk_size_GB} GB\")\n",
+    "        if self.safetensors:\n",
+    "            print(f\"- weights format: Huggingface safetensors\")\n",
+    "        else:\n",
+    "            print(f\"- weights format: Pytorch .bin (pickle)\")\n",
+    "        if self.install_commands:\n",
+    "            print(\"- model dependencies installation commands\")\n",
+    "            self.print_install_commands(line_prefix=\"  - \")\n",
+    "            \n",
+    "    def print_install_commands(self, line_prefix=\"- \"):\n",
+    "        for line in self.install_commands.splitlines():\n",
+    "            print(line_prefix + line)\n",
     "        \n",
-    "    def print_load_in_memory_properties(self):\n",
+    "    def print_load_properties(self, print_name=True):\n",
     "        print(f\"{self.name}: {self.huggingface_repo}\")\n",
     "        \n",
-    "    def print_perplexity_properties(self):\n",
+    "    def print_perplexity_test(self, print_name=True):\n",
     "        print(f\"{self.name}: {self.huggingface_repo}\")\n",
     "    \n",
+    "try:\n",
+    "    libdata_path = Path(__file__).parent / \"data\"\n",
+    "except NameError:\n",
+    "    libdata_path = Path(os.getcwd()).parent / \"wordslab_llms\" / \"data\"\n",
     "    \n",
-    "with open('base_models.csv', 'r') as file:\n",
+    "with open(libdata_path / 'base_models.csv', 'r') as file:\n",
     "    csv_reader = csv.reader(file)\n",
     "    # Skip first line with column titles\n",
     "    next(csv_reader)\n",
@@ -198,17 +214,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-04-01T10:36:32.910090Z",
-     "iopub.status.busy": "2024-04-01T10:36:32.909674Z",
-     "iopub.status.idle": "2024-04-01T10:36:32.922029Z",
-     "shell.execute_reply": "2024-04-01T10:36:32.921499Z",
-     "shell.execute_reply.started": "2024-04-01T10:36:32.910063Z"
-    },
-    "tags": []
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
@@ -251,10 +258,10 @@
       "solar_10b: upstage/SOLAR-10.7B-v1.0 => params: 10.7 B | disk: 19.99 GB | vram: 10.588 GB (8 bits)\n",
       "qwen1.5_14b: Qwen/Qwen1.5-14B => params: 14.0 B | disk: 26.4 GB | vram: 0.0 GB (8 bits)\n",
       "qwen1.5_moe_14bx2b: Qwen/Qwen1.5-MoE-A2.7B => params: 14.3 B | disk: 26.68 GB | vram: 14.901 GB (8 bits)\n",
+      "internlm2_20b: internlm/internlm2-20b => params: 20.0 B | disk: 37.0 GB | vram: 20.803 GB (8 bits)\n",
       "mpt_30b: mosaicml/mpt-30b => params: 30.0 B | disk: 55.8 GB | vram: 16.567 GB (4 bits)\n",
       "codellama_34b: codellama/CodeLlama-34b-hf => params: 34.0 B | disk: 62.86 GB | vram: 19.218 GB (4 bits)\n",
       "yi_34b: 01-ai/Yi-34B => params: 34.0 B | disk: 64.06 GB | vram: 0.0 GB (4 bits)\n",
-      "internlm2_20b: internlm/internlm2-20b => params: 20.0 B | disk: 0.0 GB | vram: 0.0 GB (8 bits)\n",
       "command-r_35b: CohereForAI/c4ai-command-r-v01-4bit => params: 35.0 B | disk: 21.15 GB | vram: 21.76 GB (4 bits)\n",
       "falcon_40b: tiiuae/falcon-40b => params: 40.0 B | disk: 77.93 GB | vram: 0.0 GB (4 bits)\n",
       "alfred_40b: lightonai/alfred-40b-1023 => params: 40.0 B | disk: 77.93 GB | vram: 22.889 GB (4 bits)\n",
@@ -273,7 +280,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "`ModelInfo` attributes - Model identification: \n",
+    "`ModelInfo` attributes - **Model card**\n",
     "- `name`: simplified id to easily reference the model '(model)*(version)*_ (params)*_(contextlength)*'\n",
     "- `is_best_model`: True if the model is one of the best among models of similar size, either for performance, speed, or openness (as a first approach, you can ignore all other models)\n",
     "- `languages_perf`: \n",
@@ -282,29 +289,20 @@
     "- `context_size`: maximum sequence length natively supported\n",
     "- `vocabulary`: number of token types in the vocabulary of the tokenizer\n",
     "- `training_tokens_T`: when the information was disclosed, number of tokens on which the base model was trained (in trillions) \n",
-    "- `license`: license for **the weights** of the model\n",
+    "- `license`: license for the weights of the model\n",
     "- `date`: model publication date"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-04-01T10:36:34.289022Z",
-     "iopub.status.busy": "2024-04-01T10:36:34.277805Z",
-     "iopub.status.idle": "2024-04-01T10:36:34.292804Z",
-     "shell.execute_reply": "2024-04-01T10:36:34.292284Z",
-     "shell.execute_reply.started": "2024-04-01T10:36:34.288984Z"
-    },
-    "tags": []
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "mistral_7b: mistralai/Mistral-7B-v0.1\n",
+      "mistral_7b: https://huggingface.co/mistralai/Mistral-7B-v0.1\n",
       "** Best model - languages performance: EN>ES,FR,DE **\n",
       "- parameters: 7.3 B\n",
       "- context size: 8192 tokens\n",
@@ -315,47 +313,38 @@
     }
    ],
    "source": [
-    "base_models['mistral_7b'].print_identification_properties()"
+    "base_models['mistral_7b'].print_model_card()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 71,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-04-01T10:33:27.759682Z",
-     "iopub.status.busy": "2024-04-01T10:33:27.758833Z",
-     "iopub.status.idle": "2024-04-01T10:33:27.768676Z",
-     "shell.execute_reply": "2024-04-01T10:33:27.767768Z",
-     "shell.execute_reply.started": "2024-04-01T10:33:27.759643Z"
-    },
-    "tags": []
-   },
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "mixtral_moe_46bx13b_hqq3bit: mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-3bit-metaoffload-HQQ\n",
+      "mixtral_moe_46bx13b_hqq3bit: https://huggingface.co/mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-3bit-metaoffload-HQQ\n",
       "** Best model - languages performance: multilingual **\n",
       "- activated parameters: 12.9 B, total parameters: 46.7 B (mixture of experts)\n",
       "- context size: 32768 tokens\n",
       "- vocabulary: 32000 token types\n",
-      "- training tokens: 0.0 T\n",
       "- model weights license: Apache 2.0\n",
       "- publication date: 2024-02-29\n"
      ]
     }
    ],
    "source": [
-    "base_models['mixtral_moe_46bx13b_hqq3bit'].print_identification_properties()"
+    "base_models['mixtral_moe_46bx13b_hqq3bit'].print_model_card()"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "`ModelInfo` attributes - Download model weights and code from the Huggingface hub:\n",
+    "`ModelInfo` attributes - **Download model** weights and code from the Huggingface hub\n",
+    "\n",
     "- `huggingface_repo`: path of the model in the Huggingface hub, prepend 'https://huggingface.co/' to get the repo URL\n",
     "- `disk_size_GB`: total files size (in GB) which will be downloaded and stored on your disk in the Huggingface models cache directory (see 'HF_HOME' environment variable)\n",
     "- `gated_access`: if True, you will need to be granted access before you can download the model\n",
@@ -377,14 +366,53 @@
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "stablelm_3b: stabilityai/stablelm-3b-4e1t\n",
+      "** WARNING - Gated access: you need to request access on the Huggingface website **\n",
+      "- huggingface repo: stabilityai/stablelm-3b-4e1t\n",
+      "- huggingface read access token: mandatory\n",
+      "- disk size: 5.21 GB\n",
+      "- weights format: Huggingface safetensors\n"
+     ]
+    }
+   ],
+   "source": [
+    "base_models['stablelm_3b'].print_download_properties()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "mixtral_moe_46bx13b_hqq3bit: mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-3bit-metaoffload-HQQ\n",
+      "- huggingface repo: mobiuslabsgmbh/Mixtral-8x7B-Instruct-v0.1-hf-attn-4bit-moe-3bit-metaoffload-HQQ\n",
+      "- disk size: 20.88 GB\n",
+      "- weights format: Huggingface safetensors\n",
+      "- model dependencies installation commands\n",
+      "  - pip install --upgrade hqq\n",
+      "  - git clone https://github.com/mobiusml/hqq/\n",
+      "  - cd hqq/hqq/kernels && python setup_cuda.py install\n"
+     ]
+    }
+   ],
+   "source": [
+    "base_models['mixtral_moe_46bx13b_hqq3bit'].print_download_properties()"
+   ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "`ModelInfo` attributes - Load model in memory with Huggingface transformers:\n",
+    "`ModelInfo` attributes - **Load model** in memory with Huggingface transformers\n",
     "- `tokenizer_remote_code`: False if the tokenizer code is included in the Huggingface transformers library, True if you need to trust the tokenizer code downloaded from the model repository\n",
     "- `vocab_size_code`: additional code to execute after load to set the tokenizer vocabulary size\n",
     "- `padding_token_code`: additional code to execute after load to set the tokenizer padding token\n",
@@ -394,35 +422,45 @@
     "- `model_load_dtype`: specific torch.dtype to use to load the model without wasting memory (for example fp16 if the model weights are saved in fp32)\n",
     "- `quantization_type`: quantization algorithm to use to get the best performance on a local machine with a limited amount of VRAM\n",
     "- `precision`: number of bits per parameter after quantization, average rounded to the closest integer  \n",
-    "- `memory_size_MB`: VRAM necessary just to load the model in memory (in MB)\n",
-    "\n",
-    "`ModelInfo` attributes - Performance tests on real-world business language in English, French, German, and Spanish (retail banking websites, approx 10 millions tokens per language)\n",
+    "- `memory_size_MB`: VRAM necessary just to load the model in memory (in MB)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`ModelInfo` attributes - **Performance tests** on real-world business language in English, French, German, and Spanish (retail banking websites, approx. 10 million tokens per language)\n",
     "- `ppl_sequence_length`: max sequence length used to measure perplexity (optimized for 24 GB VRAM)\n",
     "- `ppl_batch_size`:  batch size used during the perplexity test (optimized for 24 GB VRAM)\n",
     "- `ppl_memory_size_MB`: total VRAM used during the perplexity test (in MB)\n",
     "- `fr_tokens_M`: FRENCH dataset - millions of tokens tested\n",
-    "- `fr_tokens_duration`: FRENCH dataset - duration to tokenize the dataset (timedelta object)\n",
+    "- `fr_tokens_duration`: FRENCH dataset - time to tokenize the dataset (timedelta object)\n",
     "- `fr_pplu_x1000`: FRENCH dataset - unigram-normalized perplexity (x1000)\n",
     "- `fr_ppl`: FRENCH dataset - perplexity\n",
-    "- `fr_ppl_duration`: FRENCH dataset - duration to compute the model perplexity (timedelta object)\n",
+    "- `fr_ppl_duration`: FRENCH dataset - time to compute the model perplexity (timedelta object)\n",
     "- `en_tokens_M` / `en_tokens_duration` / `en_pplu_x1000` / `en_ppl` / `en_ppl_duration`: same metrics for the ENGLISH dataset \n",
     "- `de_tokens_M` / `de_tokens_duration` / `de_pplu_x1000` / `de_ppl` / `de_ppl_duration`: same metrics for the GERMAN dataset \n",
     "- `es_tokens_M` / `es_tokens_duration` / `es_pplu_x1000` / `es_ppl` / `es_ppl_duration`: same metrics for the SPANISH dataset"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {
-    "execution": {
-     "iopub.execute_input": "2024-03-30T14:14:53.551802Z",
-     "iopub.status.busy": "2024-03-30T14:14:53.551454Z",
-     "iopub.status.idle": "2024-03-30T14:14:53.560662Z",
-     "shell.execute_reply": "2024-03-30T14:14:53.560168Z",
-     "shell.execute_reply.started": "2024-03-30T14:14:53.551787Z"
-    },
-    "tags": []
-   },
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
    "outputs": [
     {
      "ename": "SyntaxError",
@@ -459,18 +497,6 @@
    "display_name": "wordslab-llms-lib",
    "language": "python",
    "name": "wordslab-llms-lib"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
   }
  },
  "nbformat": 4,

diff --git a/nbs/base_models.xlsx b/nbs/base_models.xlsx