# Import

In [None]:
import mlflow
# Hard-coded S3 location of the logged MLflow model artifact to load.
logged_model = 's3://deep-mlflow-artifact/2/9f216acf38d54ff6b185441a0f80e8b7/artifacts/model'

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

In [1]:
import sys
# Put the directory two levels up on the import path
# (presumably the repository root, so project packages resolve — confirm).
sys.path.append('../../')

import os
import sys
import logging
import argparse
from pathlib import Path
from ast import literal_eval
from collections import Counter
from typing import Any, Dict, Optional

In [2]:
from icecream import ic
from tqdm.auto import tqdm

import torchmetrics
from torchmetrics.functional import accuracy, f1, auroc

import sagemaker
from sagemaker import get_execution_role
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.core.decorators import auto_move_data
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import MLFlowLogger


import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix


import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    T5ForConditionalGeneration,
    T5Tokenizer,
    AutoTokenizer,
)
from transformers.optimization import (
    Adafactor,
    get_linear_schedule_with_warmup,
)

Local constants (data locations, MLflow server, paths, etc.): use them rather than hard-coding values.

In [3]:
# Make the sector-pl example scripts importable; the `data` and `model` modules
# imported below are presumably resolved from this directory — confirm.
sys.path.append('../../scripts/examples/sector-pl/')
from data import SectorsDataset
from model import SectorsTransformer

In [4]:
from deep.constants import *
from deep.utils import *

# Data

In [6]:
import boto3
 
def check_status(app_name, region_name='us-east-1'):
    """Return the current status of a SageMaker endpoint.

    Args:
        app_name: Name of the deployed endpoint to look up.
        region_name: AWS region hosting the endpoint. Defaults to
            'us-east-1', the region this function previously hard-coded.

    Returns:
        The "EndpointStatus" value from the describe_endpoint response.
    """
    sage_client = boto3.client('sagemaker', region_name=region_name)
    endpoint_description = sage_client.describe_endpoint(EndpointName=app_name)
    return endpoint_description["EndpointStatus"]
 
# Look up and print the live status of the SageMaker endpoint named 'prova5'.
print("Application status is: {}".format(check_status('prova5')))

Application status is: InService


In [7]:
import json
 
def query_endpoint(app_name, input_json, region_name='us-east-1',
                   content_type='application/json; format=pandas-split'):
    """Send a JSON payload to a SageMaker endpoint and return the parsed reply.

    Args:
        app_name: Name of the deployed endpoint to invoke.
        input_json: Request body, already serialized to match ``content_type``.
        region_name: AWS region hosting the endpoint. Defaults to
            'us-east-1', the region this function previously hard-coded.
        content_type: Value sent as the request ContentType. Defaults to the
            pandas-split JSON format this function previously hard-coded.

    Returns:
        The endpoint's response body parsed from JSON.
    """
    client = boto3.session.Session().client("sagemaker-runtime", region_name=region_name)

    response = client.invoke_endpoint(
        EndpointName=app_name,
        Body=input_json,
        ContentType=content_type,
    )
    # Decode as UTF-8 (a superset of ASCII): an ASCII decode raises
    # UnicodeDecodeError as soon as the response contains a non-ASCII character.
    preds = json.loads(response['Body'].read().decode("utf-8"))
    print("Received response: {}".format(preds))
    return preds

In [9]:
import pandas as pd
# Three question rows that all share the same context passage in "text".
test_data = pd.DataFrame(
    [
        ("My name is Alex, I am 32 and live in Copenhagen.", question)
        for question in ("What is my name?", "How old am I?", "Where do I live?")
    ],
    columns=["text", "question"],
)

In [3]:
import pkg_resources
# Snapshot of the installed distributions as {project name: version}.
# str(dist) is "<name> <version>", but becomes "<name> [unknown version]" when
# the version metadata is missing. Splitting only at the first whitespace keeps
# every item a (key, value) pair, so dict() cannot fail on a three-token entry.
env = dict(tuple(str(ws).split(None, 1)) for ws in pkg_resources.working_set)

In [5]:
# Read the v0.5 train/validation CSV splits from LATEST_DATA_PATH.
# NOTE(review): LATEST_DATA_PATH and SECTORS are not defined in the visible cells;
# presumably they come from the star imports of deep.constants / deep.utils — confirm.
train_dataset = pd.read_csv(LATEST_DATA_PATH / "data_v0.5_train.csv")
val_dataset = pd.read_csv(LATEST_DATA_PATH / "data_v0.5_val.csv")
##
# Parse each "sectors" cell with literal_eval, turning the stored string into the
# Python literal it spells (presumably a list of sector names — confirm).
train_dataset["sectors"] = train_dataset["sectors"].apply(literal_eval)
val_dataset["sectors"] = val_dataset["sectors"].apply(literal_eval)
# Map every entry of SECTORS to its position in that sequence.
class_to_id = {class_: i for i, class_ in enumerate(SECTORS)}

In [6]:
# Pretrained checkpoint identifier; used below to load the tokenizer and passed
# to SectorsTransformer.
model_name = 'sentence-transformers/paraphrase-mpnet-base-v2'

In [7]:
# Tokenizer loaded for the checkpoint named in model_name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Wrap each dataframe in the project's SectorsDataset.
# NOTE(review): 200 is presumably the maximum token length per example — confirm
# against SectorsDataset's signature.
training_set = SectorsDataset(train_dataset, class_to_id, tokenizer, 200)
val_set = SectorsDataset(val_dataset, class_to_id, tokenizer, 200)

In [8]:
# Both loaders use batches of 16 with num_workers=0; only the training loader
# shuffles its samples.
train_params = dict(batch_size=16, shuffle=True, num_workers=0)
val_params = dict(batch_size=16, shuffle=False, num_workers=0)

training_loader = DataLoader(dataset=training_set, **train_params)
val_loader = DataLoader(dataset=val_set, **val_params)
# Record which dataset object backs the training loader.
logging.info(training_loader.dataset)

In [9]:
# Display the dataset object backing the training loader.
training_loader.dataset

<data.SectorsDataset at 0x7f9be57143d0>

In [10]:
# Lightning trainer for a short two-epoch run with no GPUs requested (gpus=0).
# The commented-out arguments are logger/callback hooks and debugging limits
# that are currently switched off.
trainer = pl.Trainer(
#     logger=logger,
#     callbacks=[early_stopping_callback, checkpoint_callback],
    gpus=0,
    max_epochs=2,
    # overfit_batches=1,
    # limit_predict_batches=2,
    # limit_test_batches=2,
    # fast_dev_run=True,
    # limit_train_batches=1,
    # limit_val_batches=1,
    # limit_test_batches: Union[int, float] = 1.0,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


In [11]:
# A SectorsDataset built with None in place of a dataframe; it is handed to the
# model as its third positional argument.
empty_dataset = SectorsDataset(None, class_to_id, tokenizer, 200)
# NOTE(review): gpus=1, precision=16 and the deepspeed plugin are passed here, while
# the pl.Trainer in this notebook is created with gpus=0 — confirm which of these
# settings SectorsTransformer actually uses.
model = SectorsTransformer(
    model_name,
    len(class_to_id),
    empty_dataset,
    training_loader,
    gpus=1,
    precision=16,
    plugin="deepspeed_stage_3_offload",
    accumulate_grad_batches=1,
    max_epochs=2,
)

In [12]:
# Train on training_loader and validate on val_loader. The recorded output for
# this cell ends with a KeyboardInterrupt during the validation sanity check.
trainer.fit(model, training_loader, val_loader)


  | Name           | Type  | Params
-----------------------------------------
0 | model          | Model | 109 M 
1 | f1_score_train | F1    | 0     
-----------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.980   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]



KeyboardInterrupt: 