# GEC Bert - Inference and Analysis

## Imports and Setup

In [2]:
%load_ext autoreload
%autoreload 2
PLATFORM='GCP' # 'GCP' or 'AWS' or 'LOCAL'

# Disable HuggingFace's parallel tokenization feature to avoid any deadlock with our small dataset.
%env TOKENIZERS_PARALLELISM=false

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
env: TOKENIZERS_PARALLELISM=false


In [3]:
# GCP specific setup.
if PLATFORM == 'GCP':
    # Connect to google drive
    # from google.colab import drive
    # drive.mount('/content/drive')

    # Clone repo and install required libraries
    !git clone https://ram-senth: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@github.com/team-langbot/model_gec.git

    # !git checkout -b model origin/model

!git config --global user.email "ram.senth@berkeley.edu"
!git config --global user.name "Ram S"

Cloning into 'model_gec'...
remote: Enumerating objects: 300, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 300 (delta 31), reused 41 (delta 18), pack-reused 227[K
Receiving objects: 100% (300/300), 21.17 MiB | 15.00 MiB/s, done.
Resolving deltas: 100% (140/140), done.


In [4]:
%cd /content/model_gec
!git status

/content/model_gec
On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [5]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wandb>=0.10.32 (from simpletransformers)
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m23.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting streamlit (from simpletransformers)
  Downloading

In [8]:
import wandb
from utils import Config, Training_config
import pickle

WANDB_PROJECT_NAME = "langbot_gec"
ECC_TRAIN_CONFIG = 'cfgs/beto_2classes.py'
train_args = Training_config(ECC_TRAIN_CONFIG)

In [None]:
def load_model_from_wandb(project_name, args):
    MODEL_LOC = f'outputs/{args.exp_name}/{args.exp_name}.pkl'
    api = wandb.Api()
    runs = api.runs(
        path=f'langbot/{project_name}',
        filters={'config.wandb_kwargs.name': args.exp_name},
        order="-created_at")
    print(f'Found {len(runs)} runs under {project_name}/{args.exp_name}')
    last_run = runs[0]
    model_file = last_run.file(MODEL_LOC)
    print(f'Downloading {MODEL_LOC}')
    model_file.download(replace=True)

    with open(MODEL_LOC, 'rb') as f:  # open a text file
        reloaded = pickle.load(f) # serialize the list
    return reloaded

In [9]:
# Load pickled pre-trained model from google drive
def testLoading():
    reloaded = load_model_from_wandb(WANDB_PROJECT_NAME, train_args)
    model_outputs, predictions = reloaded.predict(["Voy a ver un montón de gente."])
    print(f'model_outputs: {model_outputs}')
    print(f'predictions: {predictions}')

testLoading()

Found 4 runs under langbot_gec/beto_cows_l2h_two_classes
Downloading outputs/beto_cows_l2h_two_classes/beto_cows_l2h_two_classes.pkl


  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

model_outputs: [[{'Voy': 'O'}, {'a': 'O'}, {'ver': 'O'}, {'un': 'O'}, {'montón': 'B-ga'}, {'de': 'O'}, {'gente.': 'O'}]]
predictions: [[{'Voy': [[-0.0932, -1.502, -0.6562, -2.14, 4.09]]}, {'a': [[0.04865, -1.576, -0.4631, -2.178, 3.197]]}, {'ver': [[0.928, -0.541, -0.893, -1.905, 2.322]]}, {'un': [[0.05115, -1.65, -0.1198, -1.408, 2.076]]}, {'montón': [[1.921, -1.125, 0.253, -1.291, 0.3008]]}, {'de': [[0.4702, 0.147, -0.9663, -1.299, 1.625]]}, {'gente.': [[0.752, -0.057, -0.922, -2.01, 2.6], [0.6055, -0.6123, -1.263, -2.197, 3.844]]}]]


In [None]:
model = load_model_from_wandb(WANDB_PROJECT_NAME, train_args)

In [18]:
lines = ["Estoy bienes, gracias.",
         "Sí, tengo algo de tiempos hoy.",
         "Sí, necesito comprar uno chaqueta.",
         "A los diez.",
         "Hastas luego."]
model_outputs, predictions = model.predict(lines)
[print(op) for op in model_outputs]
print("")
[print(pred) for pred in predictions]


  0%|          | 0/5 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[{'Estoy': 'O'}, {'bienes,': 'O'}, {'gracias.': 'O'}]
[{'Sí,': 'O'}, {'tengo': 'O'}, {'algo': 'O'}, {'de': 'O'}, {'tiempos': 'O'}, {'hoy.': 'O'}]
[{'Sí,': 'O'}, {'necesito': 'O'}, {'comprar': 'O'}, {'uno': 'O'}, {'chaqueta.': 'O'}]
[{'A': 'O'}, {'los': 'O'}, {'diez.': 'B-na'}]
[{'Hastas': 'O'}, {'luego.': 'O'}]

[{'Estoy': [[-0.318, -2.115, 0.2556, -1.109, 3.256]]}, {'bienes,': [[0.1362, -1.846, 0.1792, -1.328, 3.105], [0.2218, -2.156, 0.4514, -0.618, 2.41]]}, {'gracias.': [[-0.029, -1.308, -1.044, -0.8457, 3.941], [0.4526, -2.37, 0.2466, -1.577, 3.82]]}]
[{'Sí,': [[-0.3533, -0.7466, -0.3884, -1.009, 2.688], [0.3232, -0.6133, -0.6567, -1.175, 1.9375]]}, {'tengo': [[-1.039, -1.16, -0.2534, -0.9014, 1.222]]}, {'algo': [[-0.3894, -0.3003, 0.1295, -0.5586, 0.7017]]}, {'de': [[-0.883, -0.1818, -0.2098, 0.2462, 0.939]]}, {'tiempos': [[-0.5176, -0.1959, -0.4763, -0.512, 1.949]]}, {'hoy.': [[-0.6084, -0.704, -0.4253, -0.708, 2.193], [-0.1956, -0.5024, -0.555, -0.8784, 2.572]]}]
[{'Sí,': [[0.66

[None, None, None, None, None]