Using kernel `conda_pytorch_latest_p36`

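We take inspiration from the paper [Entailment as Few-Shot Learner](https://arxiv.org/pdf/2104.14690.pdf): each excerpt is paired with a sentence describing a candidate label, and the model predicts whether that sentence holds for the excerpt.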

In [1]:
# !pip install cloudpathlib
# !pip install s3fs
# !pip install transformers
# !pip install pytorch-lightning

# Import

In [2]:
import sys
# Extend the module search path three directories up; presumably this is what
# lets the `deep` package imported further down resolve — confirm.
sys.path.append('../../../')

In [3]:
import ast
import json
import os
import random
from datetime import datetime
from pathlib import Path

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
import sagemaker
from sagemaker import get_execution_role
import boto3
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm
from langdetect import detect

In [5]:
from deep.constants import *
from deep.utils import *

In [6]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules before
# each execution (per the IPython docs), so edits to imported project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Data

In [7]:
def preprocessing(df):
    """Parse the stringified tag columns of a raw data frame into Python lists.

    The `sectors`, `pillars` and `subpillars` columns are expected to hold the
    textual form of a Python literal (e.g. ``"['a', 'b']"``). Each cell is
    parsed with `ast.literal_eval`, which only accepts literals and therefore
    cannot execute code embedded in the data (unlike `eval`).

    Duplicate entries in `pillars` are removed while keeping first-seen order,
    so the result is identical from run to run (a `set` round-trip is not,
    because string hashing is randomised per process).

    Args:
        df: DataFrame containing the `sectors`, `pillars` and `subpillars`
            columns as strings.

    Returns:
        A new DataFrame with the three columns parsed; the input is not
        modified.

    Raises:
        ValueError, SyntaxError: if a cell is not a valid Python literal.
    """
    df = df.copy()
    for tag_column in ('sectors', 'pillars', 'subpillars'):
        df[tag_column] = df[tag_column].apply(ast.literal_eval)
    # dict.fromkeys dedupes while preserving insertion order.
    df['pillars'] = df['pillars'].apply(lambda tags: list(dict.fromkeys(tags)))
    return df

In [9]:
# Target: which tag column is modelled and the label set it draws from.
column, classes = 'subpillars', SUBPILLARS

# Source text column, plus the names of the two columns derived for the
# entailment formulation (premise + hypothesis text, and its 0/1 label).
text_column = 'excerpt'
entail_text_column = 'entail_excerpt'
label_column = 'entail_labels'

In [11]:
# Load and parse the three splits of data version v0.4.4 in one pass.
train, val, test = (
    preprocessing(pd.read_csv(LATEST_DATA_PATH / split_file, index_col=0))
    for split_file in (
        'data_v0.4.4_train.csv',
        'data_v0.4.4_val.csv',
        'data_v0.4.4_test.csv',
    )
)

In [13]:
def process_entailment(df, train, column, classes, only_en=True, drop_empty=True):
    """Expand every excerpt into one entailment example per candidate class.

    For each remaining row and each entry of `classes`, one output row is
    produced whose text is the excerpt followed by
    ``' This sentence is about <name>.'`` and whose label is 1 when that class
    is present in the row's `column` value, else 0. ``<name>`` is the part of
    the class string after the first ``"->"``; a class without ``"->"`` is
    used as-is instead of raising IndexError.

    Reads the module-level names `text_column`, `entail_text_column` and
    `label_column` for the input/output column names.

    Args:
        df: DataFrame with `entry_id`, the text column, `column`, and
            (depending on the flags) `language` and `pillars`.
        train: truthy for the training split; sets `is_valid` to the negation.
        column: name of the column holding each row's list of tags.
        classes: re-iterable collection of class strings.
        only_en: keep only rows whose `language` equals ``'en'``.
        drop_empty: drop rows whose `pillars` list is empty.

    Returns:
        A new DataFrame with columns `entry_id`, the text column, the
        entailment text column, the label column and `is_valid`, holding
        ``len(classes)`` rows per kept input row. The input is not modified.
    """
    if only_en:
        df = df[df['language'] == 'en']
    if drop_empty:
        # NOTE(review): this filters on `pillars` even when `column` is a
        # different field — confirm that is intended.
        df = df[df.pillars.apply(len) > 0]

    # Copy after filtering so the column assignments below write to an
    # independent frame rather than to a slice of the caller's data.
    df = df.copy()

    def _short_name(label):
        parts = label.split("->")
        return parts[1] if len(parts) > 1 else parts[0]

    # The hypothesis suffixes depend only on `classes`; build them once.
    hypotheses = [f' This sentence is about {_short_name(y)}.' for y in classes]

    df[entail_text_column] = [
        [x + hypothesis for hypothesis in hypotheses]
        for x in df[text_column]
    ]
    df[label_column] = [[1 if z in y else 0 for z in classes] for y in df[column]]

    # Both list columns have one element per class, so exploding them
    # column-wise keeps texts and labels aligned row by row.
    df = df.set_index(
        ['entry_id', text_column]
    )[[entail_text_column, label_column]].apply(pd.Series.explode).reset_index()

    df['is_valid'] = not train

    return df
    
    

In [14]:
# Build the entailment examples for both splits; the test split is the one
# flagged `is_valid=True` in the combined frame.
train_df = process_entailment(df=train, train=True, column=column, classes=classes)
test_df = process_entailment(df=test, train=False, column=column, classes=classes)

split_frames = [train_df, test_df]
df = pd.concat(split_frames)

In [17]:
# # tmp = pd.DataFrame(train_df.pillars.explode().value_counts())
# sampling_rate = (float(tmp.max()) / tmp).to_dict()['pillars']

In [18]:
# dfs = []
# for c, rate in sampling_rate.items():
#     sampled = train_df[train_df.pillars.apply(lambda x: c in x)].sample(frac=rate, replace=True)
#     dfs.append(sampled)

In [19]:
# t_df = pd.concat(dfs)
# df = pd.concat([t_df, test_df])
# df = df.reset_index(drop=True)

## Sagemaker Prep

### Session

In [20]:
# IAM role name handed to the estimator further down.
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'

# SageMaker session whose default bucket is the project's dev bucket.
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

### Bucket upload

In [21]:
# Set to True for a quick run on a small subset of the data.
sample = False

if sample:
    # A fixed seed makes the subset reproducible across runs, and capping at
    # len(df) avoids the ValueError pandas raises when asked for more rows
    # than exist (sampling is without replacement).
    df = df.sample(n=min(100, len(df)), random_state=42)

# The timestamped job name doubles as the S3 prefix for this run's inputs.
job_name = f"pytorch-{formatted_time()}-fastai-entailsubpil-english"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

s3_data = str(input_path / 'df.pickle')

# NOTE(review): protocol=4 is presumably pinned so that the py36 training
# container can unpickle the frame — confirm before changing.
df.to_pickle(s3_data, protocol=4)

### Estimator Definition

In [22]:
# Instance types considered for training.
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge']

In [23]:
# Echo the S3 prefix under which this run's pickled data frame was written.
input_path

S3Path('s3://sagemaker-deep-experiments-dev/training/input_data/pytorch-2021-06-09-10-29-28-880-fastai-entailsubpil-english')

In [24]:
from sagemaker.pytorch import PyTorch

# Settings forwarded to the training entry point as hyperparameters.
hyperparameters = {
    'epochs': 3,
    'train_batch_size': 64,
    'learning_rate': 0.02,
    # names of the text / label columns inside the uploaded data frame
    'text_col': entail_text_column,
    'label_col': label_column,
    'multi_category': 0,
}

# Locations: training script directory, model artefact destination, and the
# prefix where the packaged source is uploaded (same prefix as the input data).
training_source_dir = str(SCRIPTS_TRAINING_PATH / 'stefano/multiclass-fastai')
model_output_path = str(DEV_BUCKET / 'models/')
code_upload_path = str(input_path)

estimator = PyTorch(
    entry_point='train.py',
    source_dir=training_source_dir,
    output_path=model_output_path,
    code_location=code_upload_path,
    role=role,
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    framework_version='1.8',
    py_version='py36',
    hyperparameters=hyperparameters,
    job_name=job_name,
)

In [25]:
# Both channels point at the same S3 prefix: the uploaded frame carries the
# train/validation split in its `is_valid` column.
fit_arguments = dict.fromkeys(('train', 'test'), str(input_path))

In [26]:
# Start the training job under the explicit `job_name`, feeding it the input
# channels defined above. With wait=False the SageMaker SDK call returns
# without blocking on job completion, so later cells may run while the job is
# still in progress.
estimator.fit(fit_arguments, job_name=job_name, wait=False)

In [None]:
from sagemaker import TrainingJobAnalytics

# Fetch the metrics captured for the most recently launched training job as a
# pandas DataFrame. Note that this rebinds `df`, which held the training data
# in earlier cells.
latest_job_name = estimator.latest_training_job.name
job_analytics = TrainingJobAnalytics(training_job_name=latest_job_name)
df = job_analytics.dataframe()
df.head(10)

In [None]:
# Display the whole of `df`. When the cells are run in order this is the
# training-metrics frame assigned in the preceding cell, not the training data.
df