Using kernel `conda_pytorch_latest_p36`

We take inspiration from [this paper](https://arxiv.org/pdf/2104.14690.pdf)

In [1]:
# !pip install cloudpathlib
# !pip install s3fs
# !pip install transformers\
# !pip install pytorch-lightning

# Import

In [2]:
import sys
sys.path.append('../../../')

In [3]:
from pathlib import Path
import os
import random
import json
from datetime import datetime

In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
import sagemaker
from sagemaker import get_execution_role
import boto3
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm
from langdetect import detect

In [5]:
from deep.constants import *
from deep.utils import *

In [6]:
%load_ext autoreload
%autoreload 2

## Data

In [17]:
final_columns = ['entry_id', 'lead_id', 'project_id', 'project_title',
       'analysis_framework_id', 'excerpt', 'dropped_excerpt', 'created_by_id',
       'modified_by_id', 'verified', 'verification_last_changed_by_id', 'language',
       'sectors', 'pillars', 'subpillars']

In [18]:
def preprocessing(df):
    df = df.copy()
    df['language'] = [detect(x) for x in tqdm(df.excerpt)]
    df = df[final_columns]
        
    return df
        
    df['sectors'] = df['sectors'].apply(eval)    
    df['pillars'] = df['pillars'].apply(eval)
    df['pillars'] = df['pillars'].apply(lambda x: list(set(x)))
    df['subpillars'] = df['subpillars'].apply(eval)
    return df

In [8]:
column = 'subpillars'
classes=SUBPILLARS
text_column = 'excerpt'
label_column = 'merged'

In [19]:
train = preprocessing(pd.read_csv(LATEST_DATA_PATH / 'data_v0.4.3_train.csv'))
val = preprocessing(pd.read_csv(LATEST_DATA_PATH / 'data_v0.4.3_val.csv'))
test = preprocessing(pd.read_csv(LATEST_DATA_PATH / 'data_v0.4.3_test.csv'))

100%|██████████| 90653/90653 [05:06<00:00, 295.93it/s]
100%|██████████| 10073/10073 [00:34<00:00, 291.50it/s]
100%|██████████| 11192/11192 [00:39<00:00, 284.22it/s]


In [22]:
new_data_path = LATEST_DATA_PATH.parent / ''

PosixPath('/Users/stefano/Dropbox/Work/DFS/Code/deep-experiments/notebooks/models/stefano/../../../data/frameworks_data/data_v0.4.3')

In [20]:
train

Unnamed: 0,entry_id,lead_id,project_id,project_title,analysis_framework_id,excerpt,dropped_excerpt,created_by_id,modified_by_id,verified,verification_last_changed_by_id,language,sectors,pillars,subpillars
0,163664,35315,2028,IMMAP/DFS Syria,1306,Market monitoring by the World Food Programme ...,,2232,2232,False,,en,['Food Security'],['Impact'],['Impact->Impact On Systems And Services']
1,162812,37820,2098,IMMAP/DFS Bangladesh,1306,Quarantine Facilities: ninety-three shelters i...,,657,2233,False,,en,['Health'],['Capacities & Response'],['Capacities & Response->International Response']
2,164560,39796,2098,IMMAP/DFS Bangladesh,1306,"Within dimensions, markets are broadly operati...",,1152,1152,False,,en,['Cross'],['Impact'],['Impact->Impact On Systems And Services']
3,157496,38706,2098,IMMAP/DFS Bangladesh,1306,Frontline aid workers face a heightened risk o...,,2233,2233,False,,en,"['Health', 'Logistics']","['Impact', 'People At Risk', 'Impact', 'Impact']","['Impact->Driver/Aggravating Factors', 'People..."
4,170866,37673,1142,IFRC Turkey,699,[COVID] TRC is currently using its different c...,,2233,2233,False,,en,['Health'],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90648,282949,51241,2170,IMMAP/DFS Nigeria,1306,"[16th Mar 2021,North east Nigeria]The governme...",,2230,26,True,26.0,en,['Cross'],['Impact'],['Impact->Impact On Systems And Services']
90649,283375,51241,2170,IMMAP/DFS Nigeria,1306,"[16th Mar 2021,North east Nigeria] Impact on s...",,2230,1152,True,1152.0,en,"['Health', 'Education', 'Protection', 'Livelih...","['Impact', 'Humanitarian Conditions']","['Impact->Driver/Aggravating Factors', 'Humani..."
90650,268927,49888,2331,GIMAC Somalia,1465,A reported 14 per cent of women aged 15-49 had...,,2741,2272,True,488.0,en,['Protection'],[],[]
90651,268842,49945,2311,IMMAP/DFS Colombia,1306,La alternancia no es solo plantear cuáles niño...,La alternancia no es solo plantear cuáles niño...,2374,2374,True,1403.0,es,['Education'],['Impact'],['Impact->Impact On Systems And Services']


In [9]:
def process_multiclass(df, train, column='pillars', classes=PILLARS, only_english=True):
    relevant_train = df.copy()
    relevant_train = relevant_train[relevant_train[column].apply(len) > 0]
    relevant_train[label_column] = relevant_train.pillars + relevant_train.subpillars
    relevant_train[label_column] = relevant_train[label_column].apply(lambda x: ';'.join(x))
    
    relevant_train['is_valid'] = False if train else True
    
    if not only_english:
        relevant_train['language'] = [detect(x) for x in tqdm(relevant_train.excerpt)]
        relevant_train = relevant_train[relevant_train['language'] == 'en']
        
    return relevant_train
    
    

In [10]:
train_df = process_multiclass(train, True, column=column, classes=classes, only_english=False)
test_df = process_multiclass(test, False, column=column, classes=classes, only_english=False)
df = pd.concat([train_df, test_df])

100%|██████████| 49319/49319 [02:57<00:00, 277.17it/s]
100%|██████████| 6516/6516 [00:23<00:00, 275.32it/s]


In [None]:
tmp = pd.DataFrame(train_df.pillars.explode().value_counts())
sampling_rate = (float(tmp.max()) / tmp).to_dict()['pillars']

In [None]:
dfs = []
for c, rate in sampling_rate.items():
    sampled = train_df[train_df.pillars.apply(lambda x: c in x)].sample(frac=rate, replace=True)
    dfs.append(sampled)

In [None]:
t_df = pd.concat(dfs)
df = pd.concat([t_df, test_df])
df = df.reset_index(drop=True)

## Sagemaker Prep

### Session

In [11]:
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'

### Bucket upload

In [12]:
sample = False

if sample:
    df = df.sample(100)
    
job_name = f"pytorch-{formatted_time()}-fastai-5ep-english"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name

s3_data = str(input_path / 'df.pickle')


df.to_pickle(s3_data, protocol=4)

### Estimator Definition

In [13]:
instances = [
    'ml.p2.xlarge',
    'ml.p3.2xlarge'
]

In [14]:
input_path

S3Path('s3://sagemaker-deep-experiments-dev/training/input_data/pytorch-2021-06-07-11-32-34-795-fastai-5ep-english')

In [15]:
from sagemaker.pytorch import PyTorch

hyperparameters={
    'epochs': 3,
    'train_batch_size': 64,
    'learning_rate': 0.02,
    'text_col': text_column,
    'label_col': label_column,
}

estimator = PyTorch(
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/multiclass-fastai'),
    output_path=str(DEV_BUCKET / 'models/'),
    code_location=str(input_path),
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    role=role,
    framework_version='1.8',
    py_version='py36',
    hyperparameters = hyperparameters,
    job_name=job_name,
#     train_instance_count=2,
#     train_instance_type="ml.c4.xlarge",
)

In [16]:
fit_arguments = {
    'train': str(input_path),
    'test': str(input_path)
}

In [17]:
estimator.fit(fit_arguments, job_name=job_name, wait=False)

In [None]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
df = TrainingJobAnalytics(training_job_name=estimator.latest_training_job.name).dataframe()
df.head(10)

In [None]:
df