diff --git a/tasks/incentive_classifier/notebooks/BinaryClassifierGoogleColab.ipynb b/tasks/incentive_classifier/notebooks/BinaryClassifierGoogleColab.ipynb index 114409e8..f6079d70 100644 --- a/tasks/incentive_classifier/notebooks/BinaryClassifierGoogleColab.ipynb +++ b/tasks/incentive_classifier/notebooks/BinaryClassifierGoogleColab.ipynb @@ -1,1436 +1,1462 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pqEZnDaKkKzF" - }, - "outputs": [], - "source": [ - "! pip install \\\n", - " scprep\\\n", - " spacy==2.3.2 \\\n", - " sentence_transformers==0.4.0 \\\n", - " phate==1.0.4 && \\\n", - " python -m spacy download es_core_news_lg" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H41XHYlrkctL" - }, - "source": [ - "WARNING! Once you installed the packages in the previous cell you must restart your runtime and then import the library and load the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "RWngKIdpkPPg", - "outputId": "84e989ef-8d85-44bd-8e13-e90aae903469" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using the GPU\n" - ] - } - ], - "source": [ - "import spacy\n", - "if spacy.prefer_gpu():\n", - " print(\"Using the GPU\")\n", - "else:\n", - " print(\"Using the CPU\")\n", - "es_nlp = spacy.load('es_core_news_lg')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "55q23OCMkvD5" - }, - "source": [ - "For development work, in case you want to update the files in your GitHub branch by rerunning the clone, you first have to empty the folder." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Sr1ACGUikx5K" - }, - "outputs": [], - "source": [ - "!rm -rf policy-data-analyzer/" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FDSRezbdkxyI", - "outputId": "7594b505-daf5-4489-e872-89171aaadb8d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cloning into 'policy-data-analyzer'...\n", - "remote: Enumerating objects: 498, done.\u001b[K\n", - "remote: Counting objects: 100% (498/498), done.\u001b[K\n", - "remote: Compressing objects: 100% (319/319), done.\u001b[K\n", - "remote: Total 2900 (delta 318), reused 337 (delta 178), pack-reused 2402\u001b[K\n", - "Receiving objects: 100% (2900/2900), 126.76 MiB | 26.37 MiB/s, done.\n", - "Resolving deltas: 100% (1476/1476), done.\n", - "Checking out files: 100% (843/843), done.\n" - ] - } - ], - "source": [ - "# Define branch to clone\n", - "! 
branch_name='#50_dfq_sbert_fine_tuning' && \\\n", - " git clone --branch $branch_name https://github.com/wri-dssg/policy-data-analyzer.git" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "56PH58KwkxsS", - "outputId": "533d36b7-e741-4e38-a3ed-f130f04bb4cd" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Mounted at /content/drive\n" - ] - } - ], - "source": [ - "import pandas as pd\n", - "import sys\n", - "import os\n", - "import csv\n", - "from sklearn.model_selection import train_test_split\n", - "from sentence_transformers import SentencesDataset, SentenceTransformer, InputExample, losses\n", - "from sentence_transformers.evaluation import LabelAccuracyEvaluator\n", - "from torch import nn, Tensor\n", - "from typing import Iterable, Dict\n", - "from torch.utils.data import DataLoader\n", - "import math\n", - "import time\n", - "import cupy as cp\n", - "\n", - "# os.chdir(\"policy-data-analyzer\") #If you run this cell more than once, comment out this line because you are ready in this folder and you will get an error\n", - "from tasks.data_loader.src.utils import *\n", - "from tasks.data_augmentation.src.zero_shot_classification.latent_embeddings_classifier import *\n", - "from tasks.evaluate_model.src.model_evaluator import *\n", - "from tasks.data_visualization.src.plotting import *\n", - "\n", - "from google.colab import drive\n", - "drive.mount('/content/drive')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0E6y9u8rmllu" - }, - "source": [ - "## 1. Fine tune SBERT on Binary Classification" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uEEBX_uPnQRN" - }, - "source": [ - "Similar setup for fine tuning multi-class. Maybe we should have this section in the other notebook, and in this one we just load the model? 
\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zWb8sqiXkxme" - }, - "outputs": [], - "source": [ - "class SoftmaxClassifier(nn.Module):\n", - " \"\"\"\n", - " This loss adds a softmax classifier on top of the output of the transformer network. \n", - " It takes a sentence embedding and learns a mapping between it and the corresponding category.\n", - " :param model: SentenceTransformer model\n", - " :param sentence_embedding_dimension: Dimension of your sentence embeddings\n", - " :param num_labels: Number of different labels\n", - " \"\"\"\n", - " def __init__(self,\n", - " model: SentenceTransformer,\n", - " sentence_embedding_dimension: int,\n", - " num_labels: int):\n", - " super(SoftmaxClassifier, self).__init__()\n", - " self.model = model\n", - " self.num_labels = num_labels\n", - " self.classifier = nn.Linear(sentence_embedding_dimension, num_labels)\n", - "\n", - " def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):\n", - " # Get batch sentence embeddings\n", - " features = self.model(sentence_features[0])['sentence_embedding']\n", - " \n", - " # Get batch loss\n", - " output = self.classifier(features)\n", - " loss_fct = nn.CrossEntropyLoss()\n", - "\n", - " if labels is not None:\n", - " loss = loss_fct(output, labels.view(-1))\n", - " return loss\n", - " else:\n", - " return features, output" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "plcg1u8kqCCR" - }, - "source": [ - "### Load data\n", - "For now, EXP9 will be binary classification" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "liOnrCm-kCVg" - }, - "outputs": [], - "source": [ - "rater = \"Rater3\" # TODO: Change accordingly to what is the dataset you want to analyze\n", - "Experiment = \"EXP9\" \n", - "\n", - "# This first one is the one used by David and Daniel\n", - "path = 
\"/content/drive/MyDrive/WRI-LatinAmerica-Talent/Cristina_Policy_Files/Tagged_sentence_lists/datasets/\"\n", - "# This one is the one used by Jordi\n", - "# path = \"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Cristina_Policy_Files/Tagged_sentence_lists/datasets/\"\n", - "\n", - "model_names = ['stsb-xlm-r-multilingual', 'paraphrase-xlm-r-multilingual-v1']#, 'quora-distilbert-multilingual''distiluse-base-multilingual-cased-v2', " - ] + "name": "BinaryClassifierGoogleColab.ipynb", + "provenance": [], + "collapsed_sections": [], + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" + "cells": [ + { + "cell_type": "code", + "metadata": { + "id": "pqEZnDaKkKzF" + }, + "source": [ + "! pip install \\\n", + " scprep\\\n", + " spacy==2.3.2 \\\n", + " sentence_transformers==0.4.0 \\\n", + " phate==1.0.4 && \\\n", + " python -m spacy download es_core_news_lg" + ], + "execution_count": null, + "outputs": [] }, - "id": "CnN_0CAVkJRt", - "outputId": "4d822392-3d5a-4956-a61a-20ffc9dcf5ee" - }, - "outputs": [ { - "data": { - "text/plain": [ - "['not_Incentive', 'Incentive']" + "cell_type": "markdown", + "metadata": { + "id": "H41XHYlrkctL" + }, + "source": [ + "WARNING! 
Once you have installed the packages in the previous cell, you must restart your runtime and then import the library and load the model" ] - }, - "execution_count": 18, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "filename = \"dataset_\" + rater + \"_incentive_sentences.csv\"\n", - "file = path + filename\n", - "with open(file, newline='') as f:\n", - " reader = csv.reader(f)\n", - " all_sents = list(reader)[0]\n", - "\n", - "filename = \"dataset_\" + rater + \"_incentive_labels.csv\"\n", - "file = path + filename\n", - "with open(file, newline='') as f:\n", - " reader = csv.reader(f)\n", - " all_labels = list(reader)[0]\n", - "\n", - "filename = \"testset_\" + rater + \"_incentive_sentences.csv\"\n", - "file = path + filename\n", - "with open(file, newline='') as f:\n", - " reader = csv.reader(f)\n", - " test_sents = list(reader)[0]\n", - "\n", - "filename = \"testset_\" + rater + \"_incentive_labels.csv\"\n", - "file = path + filename\n", - "with open(file, newline='') as f:\n", - " reader = csv.reader(f)\n", - " test_labels = list(reader)[0]\n", - "\n", - "label_names = unique_labels(all_labels)\n", - "numeric_labels = labels2numeric(all_labels, label_names)\n", - "label_names" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "jmj6mDkxo42F", - "outputId": "35cf0641-0ce3-4e1e-94dd-8668679bfa84" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "('El pago de los recibos se hara en todo caso correlativamente, no siendo admisible el pago de uno de ellos dejando pendiente el anterior o anteriores',\n", - " 'not_Incentive',\n", - " 'Facilitar el acceso a mercados, servicios financieros, programas y proyectos.',\n", - " 'Incentive')" + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "RWngKIdpkPPg", + "outputId": "d48bc45c-616b-4c5a-c432-c84630978360" + }, + 
"source": [ + "import spacy\n", + "if spacy.prefer_gpu():\n", + " print(\"Using the GPU\")\n", + "else:\n", + " print(\"Using the CPU\")\n", + "es_nlp = spacy.load('es_core_news_lg')" + ], + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Using the GPU\n" + ], + "name": "stdout" + } ] - }, - "execution_count": 19, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "all_sents[2], all_labels[2], test_sents[2], test_labels[2]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 387 - }, - "id": "ij2Z9Hi4tWT4", - "outputId": "79ddc730-885f-4f0e-9c1d-cb25768c0cf3" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "55q23OCMkvD5" + }, + "source": [ + "For development work, in case you want to update the files in your GitHub branch by rerunning the clone, you first have to empty the folder." ] - }, - "metadata": { - "needs_background": "light", - "tags": [] - }, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Label counts:\n", - "{'not_Incentive': 0.2532786885245902, 'Incentive': 0.7467213114754099}\n" - ] - } - ], - "source": [ - "plot_data_distribution(numeric_labels, label_names)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bE911nx4uzfq" - }, - "source": [ - "As we can see, we have 3/4 of the data as Incentives and 1/4 as Non-incentives. Hopefully our model will be able to differentiate!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7iwJ0mc8uzNH" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Pm9OVVBXqFVj" - }, - "source": [ - "### Replicate Fine Tuning from Multi Class\n", - "\n", - "However, for time purposes, we will use the minimum number of parameters necessary so we can get to the binary classification setup part! When actually fine tuning, we should explore more hyperparameters. Things to change when running full:\n", - "- Add more test/validation data percentages in `all_test_perc`\n", - "- Include more models in `model_names`\n", - "- Comment out `num_epochs=10`, replace parameter `epochs` in `model.fit()` to be 2, uncomment `max_num_epochs` and uncomment for loop that iterates through epochs. 
\n", - "- Comment out the parameter `output_path` in `model.fit()`\n", - "- **IMPORTANT:** Since we don't store the models during normal training, we should create the directory in which the confusion matrix and t-SNE plots should be written to" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FrAaQLo6pI4m" - }, - "outputs": [], - "source": [ - "# Train test split stratified\n", - "all_test_perc = [0.2]\n", - "model_names = ['paraphrase-xlm-r-multilingual-v1']#, 'stsb-xlm-r-multilingual', 'quora-distilbert-multilingual', 'distiluse-base-multilingual-cased-v2', \n", - "\n", - "# Output setup\n", - "output = {}\n", - "\n", - "for test_perc in all_test_perc:\n", - " output[f\"test_perc={test_perc}\"] = {}\n", - " X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)\n", - "\n", - " # Load data samples into batches\n", - " train_batch_size = 16\n", - " label2int = dict(zip(label_names, range(len(label_names))))\n", - " train_samples = []\n", - " for sent, label in zip(X_train, y_train):\n", - " label_id = label2int[label]\n", - " train_samples.append(InputExample(texts=[sent], label=label_id))\n", - "\n", - " # Configure the dev set evaluator - still need to test whether this works\n", - " dev_samples = []\n", - " for sent, label in zip(X_test, y_test):\n", - " label_id = label2int[label]\n", - " dev_samples.append(InputExample(texts=[sent], label=label_id))\n", - " \n", - " for model_name in model_names:\n", - " # Setup\n", - " model_preds = []\n", - " model_scores = []\n", - " output[f\"test_perc={test_perc}\"][model_name] = []\n", - " \n", - " # Train set config\n", - " model = SentenceTransformer(model_name)\n", - " train_dataset = SentencesDataset(train_samples, model=model)\n", - " train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)\n", - " \n", - " # Define the way the loss is computed\n", - " classifier = 
SoftmaxClassifier(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))\n", - " \n", - " # Dev set config\n", - " dev_dataset = SentencesDataset(dev_samples, model=model)\n", - " dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)\n", - " dev_evaluator = LabelAccuracyEvaluator(dataloader=dev_dataloader, softmax_model=classifier, name='lae-dev')\n", - "\n", - " # Configure the training\n", - " # max_num_epochs = 10\n", - " num_epochs = 10 # For now we will make the model train straight up to 10 epochs\n", - " \n", - " # for num_epochs in range(4, max_num_epochs + 2, 2):\n", - " print(\"Num epochs:\", num_epochs)\n", - " \n", - " warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) # 10% of train data for warm-up\n", - " model_deets = f\"model={model_name}_test-perc={test_perc}_n-epoch={num_epochs}\"\n", - " \n", - " model_save_path = f\"/content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/BinaryFineTuning_{model_deets}\"\n", - " # model_save_path = f\"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/FineTuning_{model_deets}\"\n", - " \n", - "\n", - " # Train the model\n", - " start = time.time()\n", - "\n", - " # WARNING: epochs should be changed to 2 when testing multiple epochs... 
or maybe we should have another variable for it\n", - " model.fit(train_objectives=[(train_dataloader, classifier)],\n", - " evaluator=dev_evaluator,\n", - " epochs=num_epochs, # We always tune on an extra epoch to see the performance gain\n", - " evaluation_steps=1000,\n", - " warmup_steps=warmup_steps,\n", - " output_path=model_save_path\n", - " )\n", - " \n", - " end = time.time()\n", - " hours, rem = divmod(end-start, 3600)\n", - " minutes, seconds = divmod(rem, 60)\n", - " print(\"Time taken for fine-tuning:\", \"{:0>2}:{:0>2}:{:05.2f}\".format(int(hours),int(minutes),seconds))\n", - " \n", - " ### Classify sentences\n", - " # Projection matrix Z low-dim projection\n", - " print(\"Classifying sentences...\")\n", - " proj_matrix = cp.asnumpy(calc_proj_matrix(test_sents, 50, es_nlp, model, 0.01))\n", - " all_sent_embs = encode_all_sents(test_sents, model, proj_matrix)\n", - " all_label_embs = encode_labels(label_names, model, proj_matrix)\n", - " visualize_embeddings_2D(np.vstack(all_sent_embs), test_labels, tsne_perplexity=50, store_name=f\"{model_save_path}/{model_deets}\")\n", - " model_preds, model_scores = calc_all_cos_similarity(all_sent_embs, all_label_embs, label_names)\n", - " \n", - " ### Evaluate the model\n", - " numeric_preds = labels2numeric(model_preds, label_names)\n", - " numeric_test_labels = labels2numeric(test_labels, label_names)\n", - " evaluator = ModelEvaluator(label_names, y_true=numeric_test_labels, y_pred=numeric_preds)\n", - " \n", - " output[f\"test_perc={test_perc}\"][model_name].append({\"num_epochs\": num_epochs, \"avg_f1\": evaluator.avg_f1.tolist()})\n", - " \n", - " evaluator.plot_confusion_matrix(color_map='Blues', exp_name=f\"{model_save_path}/{model_deets}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VrIcnp1Wse60" - }, - "outputs": [], - "source": [ - "new_json = {}\n", - "\n", - "for key in output.keys():\n", - " new_json[key] = {}\n", - " for subkey in output[key].keys():\n", 
- " new_json[key][subkey] = []\n", - " for element in output[key][subkey]:\n", - " el_copy = {\"avg_f1\": element[\"avg_f1\"], \"num_epochs\": element[\"num_epochs\"]}\n", - " new_json[key][subkey].append(el_copy)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "aPlLyprt0ILI" - }, - "outputs": [], - "source": [ - "import json\n", - "output_path = f\"/content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/\"\n", - "# output_path = f\"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/\"\n", - "with open(f\"{output_path}{Experiment}_FineTuningResults.json\", \"w\") as f:\n", - " json.dump(new_json, f)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wLx3D9nrtrDE" - }, - "source": [ - "## 2. Load fine-tuned model and use embeddings to train a binary classifier" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TepO8Pr3wsOX" - }, - "source": [ - "### Load model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "i_0lKmiGtw0N" - }, - "outputs": [], - "source": [ - "model_name = \"paraphrase-xlm-r-multilingual-v1\"\n", - "test_perc = 0.2\n", - "num_epochs = 10\n", - "model_deets = f\"model={model_name}_test-perc={test_perc}_n-epoch={num_epochs}\"\n", - "experiment = \"EXP9\"\n", - "saved_model_path = f\"/content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{experiment}/BinaryFineTuning_{model_deets}\"\n", - "# saved_model_path = f\"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/FineTuning_{model_deets}\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "dHYaPvP5wZc_" - }, - "outputs": [], - "source": [ - "bin_model = SentenceTransformer(saved_model_path)" - ] - }, - { - "cell_type": "markdown", - 
"metadata": { - "id": "iY4NIIWbwu_M" - }, - "source": [ - "### Encode Sentences" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "t98Vpa-51t2K" - }, - "source": [ - "First, we will check how good are the fine tuned embeddings without the projection matrix addition" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "7Qv5doEyw7_r", - "outputId": "16e98bc1-6994-4996-e382-069ff6d3fe84" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 306/306 [00:05<00:00, 54.29it/s]\n" - ] - } - ], - "source": [ - "all_sent_embs = encode_all_sents(test_sents, bin_model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 412 - }, - "id": "XcEIw-sb1sFm", - "outputId": "0a6d1fad-c4c0-4d54-a905-c27fc005f7d7" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[t-SNE] Computing 151 nearest neighbors...\n", - "[t-SNE] Indexed 306 samples in 0.033s...\n", - "[t-SNE] Computed neighbors for 306 samples in 0.217s...\n", - "[t-SNE] Computed conditional probabilities for sample 306 / 306\n", - "[t-SNE] Mean sigma: 2.491708\n", - "[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.511703\n", - "[t-SNE] KL divergence after 1000 iterations: 0.449039\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + }, + { + "cell_type": "code", + "metadata": { + "id": "Sr1ACGUikx5K" + }, + "source": [ + "!rm -rf policy-data-analyzer/" + ], + "execution_count": 2, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FDSRezbdkxyI", + "outputId": "294fdc80-81e2-41d7-9739-713fd5e523c8" + }, + "source": [ + "# Define branch to clone\n", + "! branch_name='#50_dfq_sbert_fine_tuning' && \\\n", + " git clone --branch $branch_name https://github.com/wri-dssg/policy-data-analyzer.git" + ], + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Cloning into 'policy-data-analyzer'...\n", + "remote: Enumerating objects: 75, done.\u001b[K\n", + "remote: Counting objects: 100% (75/75), done.\u001b[K\n", + "remote: Compressing objects: 100% (52/52), done.\u001b[K\n", + "remote: Total 2994 (delta 47), reused 48 (delta 23), pack-reused 2919\u001b[K\n", + "Receiving objects: 100% (2994/2994), 129.28 MiB | 10.94 MiB/s, done.\n", + "Resolving deltas: 100% (1594/1594), done.\n", + "Checking out files: 100% (843/843), done.\n" + ], + "name": "stdout" + } ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "visualize_embeddings_2D(np.vstack(all_sent_embs), test_labels, tsne_perplexity=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5m3G_4eF2Tw4" - }, - "source": [ - "Ok, doesn't look that bad, but not perfect either... The incentives are scattered too much in the space, and the line between non-incentives and incentives is not clearly defined. 
***For now, it doesn't matter - but we should experiment more with fine tuning.***.\n", - "\n", - "Now, let's check whether the projection matrix helps:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "auWe9D9S10mD", - "outputId": "9795754c-7182-4e75-b33b-e04c31dec83d" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 306/306 [00:07<00:00, 40.82it/s]\n" - ] - } - ], - "source": [ - "proj_matrix = cp.asnumpy(calc_proj_matrix(all_sents, 50, es_nlp, bin_model, 0.01))\n", - "all_sent_embs = encode_all_sents(test_sents, bin_model, proj_matrix)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 412 - }, - "id": "55hNSqtS19JE", - "outputId": "3a17d090-265b-40e0-de35-f2287f11efea" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[t-SNE] Computing 151 nearest neighbors...\n", - "[t-SNE] Indexed 306 samples in 0.006s...\n", - "[t-SNE] Computed neighbors for 306 samples in 0.083s...\n", - "[t-SNE] Computed conditional probabilities for sample 306 / 306\n", - "[t-SNE] Mean sigma: 20.201386\n", - "[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.918625\n", - "[t-SNE] KL divergence after 1000 iterations: 0.734087\n" - ] - }, - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "56PH58KwkxsS", + "outputId": "73fc15d3-ffb5-4ec9-ff8b-fe1b35e5dadb" + }, + "source": [ + "import pandas as pd\n", + "import sys\n", + "import os\n", + "import csv\n", + "from sklearn.model_selection import train_test_split\n", + "from sentence_transformers import SentencesDataset, SentenceTransformer, InputExample, losses\n", + "from sentence_transformers.evaluation import LabelAccuracyEvaluator\n", + "from torch import nn, Tensor\n", + "from typing import Iterable, Dict\n", + "from torch.utils.data import DataLoader\n", + "import math\n", + "import time\n", + "import cupy as cp\n", + "\n", + "os.chdir(\"policy-data-analyzer\") #If you run this cell more than once, comment out this line because you are already in this folder and you will get an error\n", + "from tasks.data_loader.src.utils import *\n", + "from tasks.data_augmentation.src.zero_shot_classification.latent_embeddings_classifier import *\n", + "from tasks.evaluate_model.src.model_evaluator import *\n", + "from tasks.data_visualization.src.plotting import *\n", + "\n", + "from google.colab import drive\n", + "drive.mount('/content/drive')" + ], + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n" + ], + "name": "stdout" + } ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "visualize_embeddings_2D(np.vstack(all_sent_embs), test_labels, tsne_perplexity=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BqqfluJR2-0B" - }, - "source": [ - "Actually, the projection matrix makes things worse. 
***Let's NOT use it for now!!!!!***" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pLPUbG282_SO", - "outputId": "57745c51-8804-4dbf-ff60-9d5b475b8440" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1220/1220 [00:22<00:00, 54.73it/s]\n" - ] - } - ], - "source": [ - "# Simple embeddings, no projection matrix added\n", - "all_sent_embs = encode_all_sents(all_sents, bin_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "3lvfAZQjAHvS", - "outputId": "41e1c935-6308-406a-bef8-014a4d941e5d" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 306/306 [00:05<00:00, 54.31it/s]\n" - ] - } - ], - "source": [ - "all_test_embs = encode_all_sents(test_sents, bin_model)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qyLt6Pl84UHc" - }, - "source": [ - "### Train classifiers" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": { - "id": "vCR_wG18E7aC" - }, - "outputs": [], - "source": [ - "from sklearn.model_selection import cross_val_score\n", - "from sklearn.metrics import classification_report" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rQyQIFVEBZsk" - }, - "source": [ - "1. Let's start with Random Forests!" 
- ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": { - "id": "YW021S1J_JH9" - }, - "outputs": [], - "source": [ - "from sklearn.ensemble import RandomForestClassifier" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "metadata": { - "id": "dbQ3_grn_I82" - }, - "outputs": [], - "source": [ - "clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=69420)" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "fAM7nFv3_IxT", - "outputId": "54c22d6a-7303-42d7-cdd3-819dad2633b8" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", - " criterion='gini', max_depth=3, max_features='auto',\n", - " max_leaf_nodes=None, max_samples=None,\n", - " min_impurity_decrease=0.0, min_impurity_split=None,\n", - " min_samples_leaf=1, min_samples_split=2,\n", - " min_weight_fraction_leaf=0.0, n_estimators=100,\n", - " n_jobs=None, oob_score=False, random_state=69420,\n", - " verbose=0, warm_start=False)" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0E6y9u8rmllu" + }, + "source": [ + "## 1. 
Fine tune SBERT on Binary Classification" ] - }, - "execution_count": 136, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "clf.fit(np.vstack(all_sent_embs), all_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": { - "id": "0MsZbft8_6_1" - }, - "outputs": [], - "source": [ - "clf_preds = [clf.predict(sent_emb)[0] for sent_emb in all_test_embs]" - ] - }, - { - "cell_type": "code", - "execution_count": 138, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2uLnsvS6GVVk", - "outputId": "5346cb46-8f87-4a18-83d8-be9391c26027" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " Incentive 0.89 0.93 0.91 228\n", - "not_Incentive 0.76 0.68 0.72 78\n", - "\n", - " accuracy 0.86 306\n", - " macro avg 0.83 0.80 0.81 306\n", - " weighted avg 0.86 0.86 0.86 306\n", - "\n" - ] - } - ], - "source": [ - " print(classification_report(test_labels, clf_preds))" - ] - }, - { - "cell_type": "code", - "execution_count": 139, - "metadata": { - "id": "Y_k1CWeJATgy" - }, - "outputs": [], - "source": [ - "numeric_preds = labels2numeric(clf_preds, label_names)\n", - "numeric_test_labels = labels2numeric(test_labels, label_names)\n", - "evaluator = ModelEvaluator(label_names, y_true=numeric_test_labels, y_pred=numeric_preds)" - ] - }, - { - "cell_type": "code", - "execution_count": 140, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 455 - }, - "id": "jHI9hf6EAy_N", - "outputId": "1544a62f-3b4c-4aaf-b98b-82d9d3044783" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uEEBX_uPnQRN" + }, + "source": [ + "Similar setup for fine tuning multi-class. Maybe we should have this section in the other notebook, and in this one we just load the model? \n" ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "evaluator.plot_confusion_matrix(color_map='Blues')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yXdaChWFBrWg" - }, - "source": [ - "Honestly, without Grid Search and 5-fold Cross Validation, these are not bad results... We should add those though!" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Y3DoYgdaDHJ6" - }, - "source": [ - "2. Now, we're gonna try Support Vector Machines" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": { - "id": "lcNVNbZVDBX_" - }, - "outputs": [], - "source": [ - "from sklearn import svm" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "metadata": { - "id": "vD4-V3KpAy3k" - }, - "outputs": [], - "source": [ - "clf = svm.SVC(gamma=0.001, C=100.)" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "oeqA23VGAyoK", - "outputId": "3d905ff1-b4e8-4d2b-8887-5a6eb5e497e1" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,\n", - " decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',\n", - " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", - " tol=0.001, verbose=False)" + }, + { + "cell_type": "code", + "metadata": { + "id": "zWb8sqiXkxme" + }, + "source": [ + "class SoftmaxClassifier(nn.Module):\n", + " \"\"\"\n", + " This loss adds a softmax classifier on top of the output of the transformer network. 
\n", + " It takes a sentence embedding and learns a mapping between it and the corresponding category.\n", + " :param model: SentenceTransformer model\n", + " :param sentence_embedding_dimension: Dimension of your sentence embeddings\n", + " :param num_labels: Number of different labels\n", + " \"\"\"\n", + " def __init__(self,\n", + " model: SentenceTransformer,\n", + " sentence_embedding_dimension: int,\n", + " num_labels: int):\n", + " super(SoftmaxClassifier, self).__init__()\n", + " self.model = model\n", + " self.num_labels = num_labels\n", + " self.classifier = nn.Linear(sentence_embedding_dimension, num_labels)\n", + "\n", + " def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor):\n", + " # Get batch sentence embeddings\n", + " features = self.model(sentence_features[0])['sentence_embedding']\n", + " \n", + " # Get batch loss\n", + " output = self.classifier(features)\n", + " loss_fct = nn.CrossEntropyLoss()\n", + "\n", + " if labels is not None:\n", + " loss = loss_fct(output, labels.view(-1))\n", + " return loss\n", + " else:\n", + " return features, output" + ], + "execution_count": 5, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "plcg1u8kqCCR" + }, + "source": [ + "### Load data\n", + "\n", + "- EXP1 = Binary classification fine tuning using the projection matrix\n", + "- EXP2 = Binary classification fine tuning ***without*** the projection matrix" ] - }, - "execution_count": 130, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "clf.fit(np.vstack(all_sent_embs), all_labels)" - ] - }, - { - "cell_type": "code", - "execution_count": 131, - "metadata": { - "id": "tyytG01oEC4G" - }, - "outputs": [], - "source": [ - "clf_preds = [clf.predict(sent_emb)[0] for sent_emb in all_test_embs]" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "metadata": { - "id": "SEneBVYFD3DW" - }, - "outputs": [], - "source": [ - "numeric_preds = 
labels2numeric(clf_preds, label_names)\n", - "numeric_test_labels = labels2numeric(test_labels, label_names)\n", - "evaluator = ModelEvaluator(label_names, y_true=numeric_test_labels, y_pred=numeric_preds)" - ] - }, - { - "cell_type": "code", - "execution_count": 133, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cscCU733D664", - "outputId": "51dd0385-c7ae-44f6-f090-936b4161136a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " precision recall f1-score support\n", - "\n", - " Incentive 0.91 0.92 0.92 228\n", - "not_Incentive 0.76 0.74 0.75 78\n", - "\n", - " accuracy 0.88 306\n", - " macro avg 0.84 0.83 0.84 306\n", - " weighted avg 0.87 0.88 0.88 306\n", - "\n" - ] - } - ], - "source": [ - "print(classification_report(test_labels, clf_preds))" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 455 - }, - "id": "oxYzT51aD60j", - "outputId": "c7af7030-d09a-43a0-c868-5ac36af22b7a" - }, - "outputs": [ - { - "data": { - "image/png": "\n", - "text/plain": [ - "
" + }, + { + "cell_type": "code", + "metadata": { + "id": "liOnrCm-kCVg" + }, + "source": [ + "rater = \"Rater3\" # TODO: Change accordingly to what is the dataset you want to analyze\n", + "Experiment = \"EXP2\" \n", + "\n", + "# This first one is the one used by David and Daniel\n", + "base_path = \"/content/drive/MyDrive/WRI-LatinAmerica-Talent\"\n", + "\n", + "# This one is the one used by Jordi\n", + "# base_path = \"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent\"\n", + "\n", + "data_path = f\"{base_path}/Cristina_Policy_Files/Tagged_sentence_lists/datasets/\"\n", + "results_save_path = f\"{base_path}/Modeling/BinaryClassificationExperiments/{Experiment}\"\n", + "\n", + "if not os.path.exists(results_save_path):\n", + " os.makedirs(results_save_path)" + ], + "execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "CnN_0CAVkJRt", + "outputId": "f2ff82a7-9a12-43cb-dd0e-21fbd0dc33c5" + }, + "source": [ + "filename = \"dataset_\" + rater + \"_incentive_sentences.csv\"\n", + "file = data_path + filename\n", + "with open(file, newline='') as f:\n", + " reader = csv.reader(f)\n", + " all_sents = list(reader)[0]\n", + "\n", + "filename = \"dataset_\" + rater + \"_incentive_labels.csv\"\n", + "file = data_path + filename\n", + "with open(file, newline='') as f:\n", + " reader = csv.reader(f)\n", + " all_labels = list(reader)[0]\n", + "\n", + "filename = \"testset_\" + rater + \"_incentive_sentences.csv\"\n", + "file = data_path + filename\n", + "with open(file, newline='') as f:\n", + " reader = csv.reader(f)\n", + " test_sents = list(reader)[0]\n", + "\n", + "filename = \"testset_\" + rater + \"_incentive_labels.csv\"\n", + "file = data_path + filename\n", + "with open(file, newline='') as f:\n", + " reader = csv.reader(f)\n", + " test_labels = list(reader)[0]\n", + "\n", + "label_names = unique_labels(all_labels)\n", + 
"numeric_labels = labels2numeric(all_labels, label_names)\n", + "label_names" + ], + "execution_count": 7, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['not_Incentive', 'Incentive']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 7 + } ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "evaluator.plot_confusion_matrix(color_map='Blues')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "H-ki8dU5EUaq" - }, - "source": [ - "Ok, so SVMs are slightly better than Random Forests at ***differentiating*** text! There's a 1-2% decrease in performance for the incentive class, but a 6% gain in non-incentives. If these results remain when doing cross validation and grid search, then I'd recommend going for the SVMs.\n", - "\n", - "\n", - "**Next steps:**\n", - "- Add Grid Search Cross Validation from sklearn" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9y34yQNqD6TO" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mbfAvxu0-_Qi" - }, - "source": [ - "### What about... 
Beto?\n", - "I downloaded the weights and placed them in the folder below: " - ] - }, - { - "cell_type": "code", - "execution_count": 142, - "metadata": { - "id": "mxk0ZHUFHnkk" - }, - "outputs": [], - "source": [ - "beto_path = f\"/content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/BETO/pytorch/\"" - ] - }, - { - "cell_type": "code", - "execution_count": 145, - "metadata": { - "id": "7lHUcXtY3PKP" - }, - "outputs": [], - "source": [ - "from transformers import AutoTokenizer, AutoModelForSequenceClassification\n", - "from transformers import BertTokenizer, BertForSequenceClassification" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "TI9IF0UmKYZE" - }, - "source": [ - "***IMPORTANT:*** I was not able to figure out a way of using the fine tuning results from the models above so I'm gonna use BETO out of the box, for both encoding/classification and see how it goes.\n", - "\n", - "The following cells are a demo of how the model should be put to use - once you understand it, feel free to skip this part!" - ] - }, - { - "cell_type": "code", - "execution_count": 146, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "kmJ_mpC37ICQ", - "outputId": "d3d7a3f6-8717-4e50-d554-18779d622b1e" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of the model checkpoint at /content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/BETO/pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", - "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", - "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", - "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/BETO/pytorch/ and are newly initialized: ['classifier.weight', 'classifier.bias']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - } - ], - "source": [ - "tokenizer = BertTokenizer.from_pretrained(beto_path)\n", - "model = BertForSequenceClassification.from_pretrained(beto_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 148, - "metadata": { - "id": "EGRL66pBIzuy" - }, - "outputs": [], - "source": [ - "classes = [\"no es parafrasis\", \"es parafrasis\"]\n", - "\n", - "sequence_0 = \"La compañia Hugging esta basada en Nueva York\"\n", - "sequence_1 = \"Las manzanas son malas para la salud\"\n", - "sequence_2 = \"La sede principal de Hugging esta en Manhattan\"\n", - "\n", - "paraphrase = tokenizer(sequence_0, sequence_2, return_tensors=\"pt\")\n", - "not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors=\"pt\")\n", - "\n", - "paraphrase_classification_logits = model(**paraphrase).logits\n", - "not_paraphrase_classification_logits = model(**not_paraphrase).logits\n", - "\n", - "paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]\n", - "not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 156, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "C-Pkqe-vI3ld", - "outputId": "5d208439-c5ae-4da4-b6a1-e748c3a2f839" - 
}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - ">>> Deberia de ser parafrasis:\n", - "no es parafrasis: 41%\n", - "es parafrasis: 59%\n", - "\n", - ">>> NO deberia de ser parafrasis:\n", - "no es parafrasis: 61%\n", - "es parafrasis: 39%\n" - ] - } - ], - "source": [ - "print(\">>> Deberia de ser parafrasis:\")\n", - "for i in range(len(classes)):\n", - " print(f\"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%\")\n", - "\n", - "print(\"\\n>>> NO deberia de ser parafrasis:\")\n", - "for i in range(len(classes)):\n", - " print(f\"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0dRr9sJQKezT" - }, - "source": [ - "OK! Now it's time to apply it to our data. We will try it out with our test set, just to have a fair comparison" - ] - }, - { - "cell_type": "code", - "execution_count": 159, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 362 - }, - "id": "ybHMwJE-JyvG", - "outputId": "66e60822-56a7-45ff-c01a-96e9c54ded9b" - }, - "outputs": [ - { - "ename": "RuntimeError", - "evalue": "ignored", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtokenized_sents\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_sents\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpadding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreturn_tensors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"pt\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mclf_logits\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mmodel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mtokenized_sents\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlogits\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mclf_results\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msoftmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mclf_logits\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtolist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m 
\u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1495\u001b[0m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1496\u001b[0m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1497\u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1498\u001b[0m )\n\u001b[1;32m 1499\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 966\u001b[0m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 967\u001b[0m \u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0moutput_hidden_states\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 968\u001b[0;31m \u001b[0mreturn_dict\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mreturn_dict\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 969\u001b[0m )\n\u001b[1;32m 970\u001b[0m \u001b[0msequence_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mencoder_outputs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m 
\u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 564\u001b[0m \u001b[0mencoder_attention_mask\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 565\u001b[0m \u001b[0mpast_key_value\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 566\u001b[0;31m \u001b[0moutput_attentions\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 567\u001b[0m )\n\u001b[1;32m 568\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - 
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m \u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)\u001b[0m\n\u001b[1;32m 494\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 495\u001b[0m layer_output = apply_chunking_to_forward(\n\u001b[0;32m--> 496\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeed_forward_chunk\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mchunk_size_feed_forward\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mseq_len_dim\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 497\u001b[0m )\n\u001b[1;32m 498\u001b[0m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mlayer_output\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0moutputs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/modeling_utils.py\u001b[0m in \u001b[0;36mapply_chunking_to_forward\u001b[0;34m(forward_fn, chunk_size, chunk_dim, *input_tensors)\u001b[0m\n\u001b[1;32m 1785\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0moutput_chunks\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdim\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mchunk_dim\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1786\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1787\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mforward_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput_tensors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mfeed_forward_chunk\u001b[0;34m(self, attention_output)\u001b[0m\n\u001b[1;32m 505\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 506\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfeed_forward_chunk\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 507\u001b[0;31m \u001b[0mintermediate_output\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintermediate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 508\u001b[0m \u001b[0mlayer_output\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moutput\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mintermediate_output\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mattention_output\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 509\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mlayer_output\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py\u001b[0m in \u001b[0;36m_call_impl\u001b[0;34m(self, *input, **kwargs)\u001b[0m\n\u001b[1;32m 725\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_slow_forward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 726\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 727\u001b[0;31m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 728\u001b[0m for hook in itertools.chain(\n\u001b[1;32m 729\u001b[0m 
\u001b[0m_global_forward_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/models/bert/modeling_bert.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, hidden_states)\u001b[0m\n\u001b[1;32m 409\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdense\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 411\u001b[0;31m \u001b[0mhidden_states\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mintermediate_act_fn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mhidden_states\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 412\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mhidden_states\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 413\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py\u001b[0m in \u001b[0;36mgelu\u001b[0;34m(input)\u001b[0m\n\u001b[1;32m 1381\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mtype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mTensor\u001b[0m \u001b[0;32mand\u001b[0m 
\u001b[0mhas_torch_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1382\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mhandle_torch_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mgelu\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1383\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mtorch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_C\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_nn\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgelu\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0minput\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1384\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1385\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mRuntimeError\u001b[0m: CUDA out of memory. 
Tried to allocate 768.00 MiB (GPU 0; 14.73 GiB total capacity; 11.33 GiB already allocated; 413.88 MiB free; 12.41 GiB reserved in total by PyTorch)" - ] - } - ], - "source": [ - "tokenized_sents = tokenizer(test_sents, padding=True, return_tensors=\"pt\")\n", - "clf_logits = model(**tokenized_sents).logits\n", - "clf_results = torch.softmax(clf_logits, dim=1).tolist()[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wELlB7iUJ1T3" - }, - "outputs": [], - "source": [ - "# This stores the index of the highest score - in other words, our label\n", - "clf_preds = [np.argmax(logits) for logits in clf_results]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GnrG0AwcM-G5" - }, - "outputs": [], - "source": [ - "print(classification_report(test_labels, clf_preds))" - ] - }, - { - "cell_type": "code", - "execution_count": 161, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" }, - "id": "qdV91B4IMZdz", - "outputId": "c6903dcb-c967-428f-a157-42717317da82" - }, - "outputs": [ { - "data": { - "text/plain": [ - "1" + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "jmj6mDkxo42F", + "outputId": "ab072c4d-f317-46f6-f4f7-e49dce446f45" + }, + "source": [ + "all_sents[2], all_labels[2], test_sents[2], test_labels[2]" + ], + "execution_count": 8, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "('El pago de los recibos se hara en todo caso correlativamente, no siendo admisible el pago de uno de ellos dejando pendiente el anterior o anteriores',\n", + " 'not_Incentive',\n", + " 'Facilitar el acceso a mercados, servicios financieros, programas y proyectos.',\n", + " 'Incentive')" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 8 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 387 + }, + "id": 
"ij2Z9Hi4tWT4", + "outputId": "66e403a1-ec8b-4775-9846-a0efea3d985e" + }, + "source": [ + "plot_data_distribution(numeric_labels, label_names)" + ], + "execution_count": 9, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [], + "needs_background": "light" + } + }, + { + "output_type": "stream", + "text": [ + "Label counts:\n", + "{'not_Incentive': 0.2532786885245902, 'Incentive': 0.7467213114754099}\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bE911nx4uzfq" + }, + "source": [ + "As we can see, we have 3/4 of the data as Incentives and 1/4 as Non-incentives. Hopefully our model will be able to differentiate!" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "7iwJ0mc8uzNH" + }, + "source": [ + "" + ], + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Pm9OVVBXqFVj" + }, + "source": [ + "### Replicate Fine Tuning from Multi Class\n", + "\n", + "However, for time purposes, we will use the minimum number of parameters necessary so we can get to the binary classification setup part! When actually fine tuning, we should explore more hyperparameters. Things to change when running full:\n", + "- Add more test/validation data percentages in `all_test_perc`\n", + "- Include more models in `model_names`\n", + "- Comment out `num_epochs=10`, replace parameter `epochs` in `model.fit()` to be 2, uncomment `max_num_epochs` and uncomment for loop that iterates through epochs. 
\n", + "- Comment out the parameter `output_path` in `model.fit()`\n", + "- **IMPORTANT:** Since we don't store the model output path" ] - }, - "execution_count": 161, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" + }, + { + "cell_type": "code", + "metadata": { + "id": "a5vcnVTrCPa8" + }, + "source": [ + "import json" + ], + "execution_count": 10, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "irVDlABFCRSp" + }, + "source": [ + "output_path = f\"{base_path}/Modeling/BinaryClassificationExperiments/{Experiment}/\"\n", + "# output_path = f\"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/\"" + ], + "execution_count": 11, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "FrAaQLo6pI4m" + }, + "source": [ + "# Configure the training\n", + "all_test_perc = [0.15, 0.2, 0.25]\n", + "model_names = ['paraphrase-xlm-r-multilingual-v1', 'stsb-xlm-r-multilingual', 'quora-distilbert-multilingual', 'distiluse-base-multilingual-cased-v2']\n", + "\n", + "# If you want to train for a set number of epochs instead of a range, set all these numbers to be equal\n", + "start_epochs = 4\n", + "epochs_increment = 2\n", + "max_num_epochs = 12\n", + "\n", + "\n", + "# Output setup - we will update the json as the fine tuning process goes so every result is stored immediately\n", + "with open(f\"{output_path}{Experiment}_FineTuningResults.json\", \"w\") as fw:\n", + " json.dump({}, fw)\n", + "\n", + "for test_perc in all_test_perc:\n", + " with open(f\"{output_path}{Experiment}_FineTuningResults.json\", \"r\") as fr:\n", + " output = json.load(fr)\n", + "\n", + " output[f\"test_perc={test_perc}\"] = {}\n", + " X_train, X_test, y_train, y_test = train_test_split(all_sents, all_labels, test_size=test_perc, stratify=all_labels, random_state=69420)\n", + "\n", + " # Load data samples into batches\n", + " train_batch_size = 16\n", + " 
label2int = dict(zip(label_names, range(len(label_names))))\n", + " train_samples = []\n", + " for sent, label in zip(X_train, y_train):\n", + " label_id = label2int[label]\n", + " train_samples.append(InputExample(texts=[sent], label=label_id))\n", + "\n", + " # Configure the dev set evaluator - still need to test whether this works\n", + " dev_samples = []\n", + " for sent, label in zip(X_test, y_test):\n", + " label_id = label2int[label]\n", + " dev_samples.append(InputExample(texts=[sent], label=label_id))\n", + " \n", + " for model_name in model_names:\n", + " # Setup\n", + " model_preds = []\n", + " model_scores = []\n", + " output[f\"test_perc={test_perc}\"][model_name] = []\n", + " \n", + " # Train set config\n", + " model = SentenceTransformer(model_name)\n", + " train_dataset = SentencesDataset(train_samples, model=model)\n", + " train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)\n", + " \n", + " # Define the way the loss is computed\n", + " classifier = SoftmaxClassifier(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))\n", + " \n", + " # Dev set config\n", + " dev_dataset = SentencesDataset(dev_samples, model=model)\n", + " dev_dataloader = DataLoader(dev_dataset, shuffle=True, batch_size=train_batch_size)\n", + " dev_evaluator = LabelAccuracyEvaluator(dataloader=dev_dataloader, softmax_model=classifier, name='lae-dev')\n", + " \n", + " for num_epochs in range(start_epochs, max_num_epochs + 2, epochs_increment):\n", + " print(\"Num epochs:\", num_epochs)\n", + " \n", + " warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) # 10% of train data for warm-up\n", + " model_deets = f\"model={model_name}_test-perc={test_perc}_n-epoch={num_epochs}\"\n", + " \n", + " # Train the model\n", + " start = time.time()\n", + " \n", + " if num_epochs == start_epochs:\n", + " model.fit(train_objectives=[(train_dataloader, classifier)],\n", + " 
evaluator=dev_evaluator,\n", + " epochs=start_epochs, \n", + " evaluation_steps=1000,\n", + " warmup_steps=warmup_steps,\n", + " output_path=f\"{results_save_path}/BinaryCls_{model_deets}\"\n", + " )\n", + " else:\n", + " model.fit(train_objectives=[(train_dataloader, classifier)],\n", + " evaluator=dev_evaluator,\n", + " epochs=epochs_increment, # We always tune on an extra epoch to see the performance gain\n", + " evaluation_steps=1000,\n", + " warmup_steps=warmup_steps,\n", + " output_path=f\"{results_save_path}/BinaryCls_{model_deets}\"\n", + " )\n", + " \n", + " end = time.time()\n", + " hours, rem = divmod(end-start, 3600)\n", + " minutes, seconds = divmod(rem, 60)\n", + " print(\"Time taken for fine-tuning:\", \"{:0>2}:{:0>2}:{:05.2f}\".format(int(hours),int(minutes),seconds))\n", + " \n", + " ### Classify sentences\n", + " # Projection matrix Z low-dim projection\n", + " print(\"Classifying sentences...\")\n", + " proj_matrix = cp.asnumpy(calc_proj_matrix(test_sents, 50, es_nlp, model, 0.01))\n", + " all_sent_embs = encode_all_sents(test_sents, model, proj_matrix)\n", + " all_label_embs = encode_labels(label_names, model, proj_matrix)\n", + " visualize_embeddings_2D(np.vstack(all_sent_embs), test_labels, tsne_perplexity=50, store_name=f\"{results_save_path}/BinaryCls_{model_deets}/{model_deets}\")\n", + " model_preds, model_scores = calc_all_cos_similarity(all_sent_embs, all_label_embs, label_names)\n", + " \n", + " ### Evaluate the model\n", + " numeric_preds = labels2numeric(model_preds, label_names)\n", + " numeric_test_labels = labels2numeric(test_labels, label_names)\n", + " evaluator = ModelEvaluator(label_names, y_true=numeric_test_labels, y_pred=numeric_preds)\n", + " \n", + " output[f\"test_perc={test_perc}\"][model_name].append({\"num_epochs\": num_epochs, \"avg_f1\": evaluator.avg_f1.tolist()})\n", + " with open(f\"{output_path}{Experiment}_FineTuningResults.json\", \"w\") as fw:\n", + " json.dump(output, fw)\n", + "\n", + " 
evaluator.plot_confusion_matrix(color_map='Blues', exp_name=f\"{results_save_path}/BinaryCls_{model_deets}/{model_deets}\")" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "VrIcnp1Wse60" + }, + "source": [ + "new_json = {}\n", + "\n", + "for key in output.keys():\n", + " new_json[key] = {}\n", + " for subkey in output[key].keys():\n", + " new_json[key][subkey] = []\n", + " for element in output[key][subkey]:\n", + " el_copy = {\"avg_f1\": element[\"avg_f1\"], \"num_epochs\": element[\"num_epochs\"]}\n", + " new_json[key][subkey].append(el_copy)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "aPlLyprt0ILI" + }, + "source": [ + "import json\n", + "output_path = f\"/content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/\"\n", + "# output_path = f\"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/\"\n", + "with open(f\"{output_path}{Experiment}_FineTuningResults.json\", \"w\") as f:\n", + " json.dump(new_json, f)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wLx3D9nrtrDE" + }, + "source": [ + "## 2. 
Load fine-tuned model and use embeddings to train a binary classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TepO8Pr3wsOX" + }, + "source": [ + "### Load model" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "i_0lKmiGtw0N" + }, + "source": [ + "model_name = \"paraphrase-xlm-r-multilingual-v1\"\n", + "test_perc = 0.2\n", + "num_epochs = 10\n", + "model_deets = f\"model={model_name}_test-perc={test_perc}_n-epoch={num_epochs}\"\n", + "experiment = \"EXP9\"\n", + "saved_model_path = f\"{base_path}/Modeling/FineTuningExperiments/BinaryFineTuning_{experiment}\"\n", + "# saved_model_path = f\"/content/drive/MyDrive/Official Folder of WRI Latin America Project/WRI-LatinAmerica-Talent/Modeling/FineTuningExperiments/{Experiment}/FineTuning_{model_deets}\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "dHYaPvP5wZc_" + }, + "source": [ + "bin_model = SentenceTransformer(saved_model_path)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iY4NIIWbwu_M" + }, + "source": [ + "### Encode Sentences" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "t98Vpa-51t2K" + }, + "source": [ + "First, we will check how good are the fine tuned embeddings without the projection matrix addition" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7Qv5doEyw7_r", + "outputId": "16e98bc1-6994-4996-e382-069ff6d3fe84" + }, + "source": [ + "all_sent_embs = encode_all_sents(test_sents, bin_model)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "100%|██████████| 306/306 [00:05<00:00, 54.29it/s]\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 412 + }, + "id": "XcEIw-sb1sFm", + "outputId": 
"0a6d1fad-c4c0-4d54-a905-c27fc005f7d7" + }, + "source": [ + "visualize_embeddings_2D(np.vstack(all_sent_embs), test_labels, tsne_perplexity=50)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[t-SNE] Computing 151 nearest neighbors...\n", + "[t-SNE] Indexed 306 samples in 0.033s...\n", + "[t-SNE] Computed neighbors for 306 samples in 0.217s...\n", + "[t-SNE] Computed conditional probabilities for sample 306 / 306\n", + "[t-SNE] Mean sigma: 2.491708\n", + "[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.511703\n", + "[t-SNE] KL divergence after 1000 iterations: 0.449039\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5m3G_4eF2Tw4" + }, + "source": [ + "Ok, doesn't look that bad, but not perfect either... The incentives are scattered too much in the space, and the line between non-incentives and incentives is not clearly defined. ***For now, it doesn't matter - but we should experiment more with fine tuning.***.\n", + "\n", + "Now, let's check whether the projection matrix helps:" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "auWe9D9S10mD", + "outputId": "9795754c-7182-4e75-b33b-e04c31dec83d" + }, + "source": [ + "proj_matrix = cp.asnumpy(calc_proj_matrix(all_sents, 50, es_nlp, bin_model, 0.01))\n", + "all_sent_embs = encode_all_sents(test_sents, bin_model, proj_matrix)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "100%|██████████| 306/306 [00:07<00:00, 40.82it/s]\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 412 + }, + "id": "55hNSqtS19JE", + "outputId": "3a17d090-265b-40e0-de35-f2287f11efea" + }, + "source": [ + "visualize_embeddings_2D(np.vstack(all_sent_embs), test_labels, tsne_perplexity=50)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "[t-SNE] Computing 151 nearest neighbors...\n", + "[t-SNE] Indexed 306 samples in 0.006s...\n", + "[t-SNE] Computed neighbors for 306 samples in 0.083s...\n", + "[t-SNE] Computed conditional probabilities for sample 306 / 306\n", + "[t-SNE] Mean sigma: 20.201386\n", + "[t-SNE] KL divergence after 250 iterations with early exaggeration: 69.918625\n", + "[t-SNE] KL divergence after 1000 iterations: 0.734087\n" + ], + "name": "stdout" + }, + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BqqfluJR2-0B" + }, + "source": [ + "Actually, the projection matrix makes things worse. ***Let's NOT use it for now!!!!!***" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pLPUbG282_SO", + "outputId": "57745c51-8804-4dbf-ff60-9d5b475b8440" + }, + "source": [ + "# Simple embeddings, no projection matrix added\n", + "all_sent_embs = encode_all_sents(all_sents, bin_model)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "100%|██████████| 1220/1220 [00:22<00:00, 54.73it/s]\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3lvfAZQjAHvS", + "outputId": "41e1c935-6308-406a-bef8-014a4d941e5d" + }, + "source": [ + "all_test_embs = encode_all_sents(test_sents, bin_model)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "100%|██████████| 306/306 [00:05<00:00, 54.31it/s]\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "qyLt6Pl84UHc" + }, + "source": [ + "### Train classifiers" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "vCR_wG18E7aC" + }, + "source": [ + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.metrics import classification_report" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "rQyQIFVEBZsk" + }, + "source": [ + "1. Let's start with Random Forests!" 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "YW021S1J_JH9" + }, + "source": [ + "from sklearn.ensemble import RandomForestClassifier" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "dbQ3_grn_I82" + }, + "source": [ + "clf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=69420)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "fAM7nFv3_IxT", + "outputId": "54c22d6a-7303-42d7-cdd3-819dad2633b8" + }, + "source": [ + "clf.fit(np.vstack(all_sent_embs), all_labels)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,\n", + " criterion='gini', max_depth=3, max_features='auto',\n", + " max_leaf_nodes=None, max_samples=None,\n", + " min_impurity_decrease=0.0, min_impurity_split=None,\n", + " min_samples_leaf=1, min_samples_split=2,\n", + " min_weight_fraction_leaf=0.0, n_estimators=100,\n", + " n_jobs=None, oob_score=False, random_state=69420,\n", + " verbose=0, warm_start=False)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 136 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "0MsZbft8_6_1" + }, + "source": [ + "clf_preds = [clf.predict(sent_emb)[0] for sent_emb in all_test_embs]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2uLnsvS6GVVk", + "outputId": "5346cb46-8f87-4a18-83d8-be9391c26027" + }, + "source": [ + " print(classification_report(test_labels, clf_preds))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " Incentive 0.89 0.93 0.91 228\n", + "not_Incentive 0.76 0.68 0.72 78\n", + 
"\n", + " accuracy 0.86 306\n", + " macro avg 0.83 0.80 0.81 306\n", + " weighted avg 0.86 0.86 0.86 306\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "Y_k1CWeJATgy" + }, + "source": [ + "numeric_preds = labels2numeric(clf_preds, label_names)\n", + "numeric_test_labels = labels2numeric(test_labels, label_names)\n", + "evaluator = ModelEvaluator(label_names, y_true=numeric_test_labels, y_pred=numeric_preds)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 455 + }, + "id": "jHI9hf6EAy_N", + "outputId": "1544a62f-3b4c-4aaf-b98b-82d9d3044783" + }, + "source": [ + "evaluator.plot_confusion_matrix(color_map='Blues')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "yXdaChWFBrWg" + }, + "source": [ + "Honestly, without Grid Search and 5-fold Cross Validation, these are not bad results... We should add those though!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Y3DoYgdaDHJ6" + }, + "source": [ + "2. Now, we're gonna try Support Vector Machines" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "lcNVNbZVDBX_" + }, + "source": [ + "from sklearn import svm" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "vD4-V3KpAy3k" + }, + "source": [ + "clf = svm.SVC(gamma=0.001, C=100.)" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "oeqA23VGAyoK", + "outputId": "3d905ff1-b4e8-4d2b-8887-5a6eb5e497e1" + }, + "source": [ + "clf.fit(np.vstack(all_sent_embs), all_labels)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "SVC(C=100.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,\n", + " decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',\n", + " max_iter=-1, probability=False, random_state=None, shrinking=True,\n", + " tol=0.001, verbose=False)" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 130 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "tyytG01oEC4G" + }, + "source": [ + "clf_preds = [clf.predict(sent_emb)[0] for sent_emb in all_test_embs]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "SEneBVYFD3DW" + }, + "source": [ + "numeric_preds = labels2numeric(clf_preds, label_names)\n", + "numeric_test_labels = labels2numeric(test_labels, label_names)\n", + "evaluator = ModelEvaluator(label_names, y_true=numeric_test_labels, y_pred=numeric_preds)" 
+ ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cscCU733D664", + "outputId": "51dd0385-c7ae-44f6-f090-936b4161136a" + }, + "source": [ + "print(classification_report(test_labels, clf_preds))" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " Incentive 0.91 0.92 0.92 228\n", + "not_Incentive 0.76 0.74 0.75 78\n", + "\n", + " accuracy 0.88 306\n", + " macro avg 0.84 0.83 0.84 306\n", + " weighted avg 0.87 0.88 0.88 306\n", + "\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 455 + }, + "id": "oxYzT51aD60j", + "outputId": "c7af7030-d09a-43a0-c868-5ac36af22b7a" + }, + "source": [ + "evaluator.plot_confusion_matrix(color_map='Blues')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "display_data", + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "tags": [] + } + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "H-ki8dU5EUaq" + }, + "source": [ + "Ok, so SVMs are slightly better than Random Forests at ***differentiating*** text! There's a 1-2% decrease in performance for the incentive class, but a 6% gain in non-incentives. If these results remain when doing cross validation and grid search, then I'd recommend going for the SVMs.\n", + "\n", + "\n", + "**Next steps:**\n", + "- Add Grid Search Cross Validation from sklearn" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "9y34yQNqD6TO" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mbfAvxu0-_Qi" + }, + "source": [ + "### What about... Beto?\n", + "I downloaded the weights and placed them in the folder below: " + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "n8I9IyxRrCm5", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "82ed8f16-16d3-4b71-c0e2-31304ae96327" + }, + "source": [ + "!pip install transformers" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: transformers in /usr/local/lib/python3.6/dist-packages (4.2.1)\n", + "Requirement already satisfied: importlib-metadata; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from transformers) (3.3.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)\n", + "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.8)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.19.5)\n", + "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)\n", + "Requirement already satisfied: 
tokenizers==0.9.4 in /usr/local/lib/python3.6/dist-packages (from transformers) (0.9.4)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)\n", + "Requirement already satisfied: sacremoses in /usr/local/lib/python3.6/dist-packages (from transformers) (0.0.43)\n", + "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.8)\n", + "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)\n", + "Requirement already satisfied: typing-extensions>=3.6.4; python_version < \"3.8\" in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.7.4.3)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata; python_version < \"3.8\"->transformers) (3.4.0)\n", + "Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.0.0)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)\n", + "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)\n", + "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)\n", + "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.12.5)\n", + "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from 
requests->transformers) (3.0.4)\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "mxk0ZHUFHnkk" + }, + "source": [ + "beto_path = f\"{base_path}/Modeling/BETO/pytorch/\"" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "7lHUcXtY3PKP" + }, + "source": [ + "from transformers import BertTokenizer, BertForSequenceClassification" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TI9IF0UmKYZE" + }, + "source": [ + "***IMPORTANT:*** I was not able to figure out a way of using the fine tuning results from the models above so I'm gonna use BETO out of the box, for both encoding/classification and see how it goes.\n", + "\n", + "The following cells are a demo of how the model should be put to use - once you understand it, feel free to skip this part!" + ] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "kmJ_mpC37ICQ", + "outputId": "f350a9e6-e3a8-476e-846f-43b387302ce1" + }, + "source": [ + "tokenizer = BertTokenizer.from_pretrained(beto_path)\n", + "model = BertForSequenceClassification.from_pretrained(beto_path)" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Some weights of the model checkpoint at /content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/BETO/pytorch/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']\n", + "- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", + "- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", + "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/WRI-LatinAmerica-Talent/Modeling/BETO/pytorch/ and are newly initialized: ['classifier.weight', 'classifier.bias']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" + ], + "name": "stderr" + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "EGRL66pBIzuy" + }, + "source": [ + "classes = [\"no es parafrasis\", \"es parafrasis\"]\n", + "\n", + "sequence_0 = \"La compañia Hugging esta basada en Nueva York\"\n", + "sequence_1 = \"Las manzanas son malas para la salud\"\n", + "sequence_2 = \"La sede principal de Hugging esta en Manhattan\"\n", + "\n", + "paraphrase = tokenizer(sequence_0, sequence_2, return_tensors=\"pt\")\n", + "not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors=\"pt\")\n", + "\n", + "paraphrase_classification_logits = model(**paraphrase).logits\n", + "not_paraphrase_classification_logits = model(**not_paraphrase).logits\n", + "\n", + "paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]\n", + "not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "C-Pkqe-vI3ld", + "outputId": "5d208439-c5ae-4da4-b6a1-e748c3a2f839" + }, + "source": [ + "print(\">>> Deberia de ser parafrasis:\")\n", + "for i in range(len(classes)):\n", + " print(f\"{classes[i]}: 
{int(round(paraphrase_results[i] * 100))}%\")\n", + "\n", + "print(\"\\n>>> NO deberia de ser parafrasis:\")\n", + "for i in range(len(classes)):\n", + " print(f\"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%\")" + ], + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "text": [ + ">>> Deberia de ser parafrasis:\n", + "no es parafrasis: 41%\n", + "es parafrasis: 59%\n", + "\n", + ">>> NO deberia de ser parafrasis:\n", + "no es parafrasis: 61%\n", + "es parafrasis: 39%\n" + ], + "name": "stdout" + } + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0dRr9sJQKezT" + }, + "source": [ + "OK! Now it's time to apply it to our data. We will try it out with our test set, just to have a fair comparison" + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ybHMwJE-JyvG" + }, + "source": [ + "tokenized_sents = tokenizer(test_sents, padding=True, return_tensors=\"pt\")\n", + "clf_logits = model(**tokenized_sents).logits\n", + "clf_results = torch.softmax(clf_logits, dim=1).tolist()[0]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "wELlB7iUJ1T3" + }, + "source": [ + "# This stores the index of the highest score - in other words, our label\n", + "clf_preds = [np.argmax(logits) for logits in clf_results]" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "id": "GnrG0AwcM-G5" + }, + "source": [ + "print(classification_report(test_labels, clf_preds))" + ], + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "qdV91B4IMZdz", + "outputId": "c6903dcb-c967-428f-a157-42717317da82" + }, + "source": [ + "evaluator = ModelEvaluator(label_names, y_true=numeric_test_labels, y_pred=clf_preds)\n", + "evaluator.plot_confusion_matrix(color_map='Blues')" + ], + "execution_count": null, + "outputs": [ + { + "output_type": 
"execute_result", + "data": { + "text/plain": [ + "1" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 161 + } + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "eMEJ67HrMYtN" + }, + "source": [ + "" + ], + "execution_count": null, + "outputs": [] } - ], - "source": [ - "evaluator = ModelEvaluator(label_names, y_true=numeric_test_labels, y_pred=clf_preds)\n", - "evaluator.plot_confusion_matrix(color_map='Blues')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0cuHUa-MMZJR" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eMEJ67HrMYtN" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "BinaryClassifierGoogleColab.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.9" - }, - "toc": { - "base_numbering": 1, - "nav_menu": {}, - "number_sections": true, - "sideBar": true, - "skip_h1_title": false, - "title_cell": "Table of Contents", - "title_sidebar": "Contents", - "toc_cell": false, - "toc_position": {}, - "toc_section_display": true, - "toc_window_display": false - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} + ] +} \ No newline at end of file