Using kernel `conda_pytorch_latest_p36`

In [1]:
# !pip install cloudpathlib
# !pip install s3fs
# !pip install transformers
# !pip install pytorch-lightning

# Import

In [1]:
import sys
sys.path.append('../../../')

In [2]:
from pathlib import Path
import os
import random
import json
from datetime import datetime

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report
import torch
import sagemaker
from sagemaker import get_execution_role
from sagemaker.pytorch import PyTorch
import boto3
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader,Dataset
from tqdm import tqdm
from langdetect import detect

In [4]:
from deep.constants import *
from deep.utils import *

In [5]:
%load_ext autoreload
%autoreload 2

## Data

In [6]:
def preprocessing(df):
    """Parse the stringified label columns of a raw split into Python lists.

    The ``sectors``, ``pillars`` and ``subpillars`` columns are read from CSV
    as strings holding list literals; each is parsed back into a list, and
    duplicate entries in ``pillars`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string-valued ``sectors``, ``pillars`` and ``subpillars``
        columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the three columns converted to lists. The input
        frame is left untouched.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    # Local import so the function stays self-contained in the notebook.
    import ast

    df = df.copy()
    # literal_eval only accepts Python literals, so arbitrary expressions
    # stored in the CSV are rejected instead of being executed as with eval().
    for label_col in ('sectors', 'pillars', 'subpillars'):
        df[label_col] = df[label_col].apply(ast.literal_eval)
    # De-duplicate while keeping first-seen order: a set would make the label
    # order (and any string later built from it) depend on string hashing.
    df['pillars'] = df['pillars'].apply(lambda labels: list(dict.fromkeys(labels)))
    return df

In [7]:
# Read the three v0.4.4 splits and parse their label columns.
train, val, test = (
    preprocessing(pd.read_csv(LATEST_DATA_PATH / filename))
    for filename in (
        'data_v0.4.4_train.csv',
        'data_v0.4.4_val.csv',
        'data_v0.4.4_test.csv',
    )
)

## Sagemaker Class

In [8]:
class SageMakerRun:
    """Configure and launch one SageMaker PyTorch training job.

    Each instance derives a job name from ``formatted_time()`` and
    ``job_end_name``, an S3 input prefix for that job under ``DEV_BUCKET``,
    the hyperparameters handed to the estimator, and the channel mapping
    passed to ``estimator.fit``.
    """

    # Execution role used when the caller does not pass ``role``.
    DEFAULT_ROLE = 'AmazonSageMaker-ExecutionRole-20210519T102514'

    def __init__(
            self,
            job_end_name,
            epochs,
            train_batch_size,
            learning_rate,
            text_col,
            label_col,
            instance_type='ml.p3.2xlarge',
            role=None,
        ):
        """Store the job configuration.

        Parameters
        ----------
        job_end_name : str
            Suffix appended to the generated job name.
        epochs, train_batch_size, learning_rate
            Forwarded unchanged as estimator hyperparameters.
        text_col, label_col : str
            Column names forwarded as the ``text_col`` / ``label_col``
            hyperparameters.
        instance_type : str
            SageMaker instance type for the training job.
        role : str, optional
            Execution role for the estimator. Defaults to ``DEFAULT_ROLE``
            when omitted, which matches the previously hard-coded value.
        """
        # NOTE(review): the session is stored but is not passed to the
        # estimator in get_estimator() -- confirm whether that is intended.
        self.sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)
        self.role = self.DEFAULT_ROLE if role is None else role
        self.instance_type = instance_type

        self.job_name = f"pytorch-{formatted_time()}-{job_end_name}"
        self.input_path = DEV_BUCKET / 'training' / 'input_data' / self.job_name

        self.hyperparameters = {
            'epochs': epochs,
            'train_batch_size': train_batch_size,
            'learning_rate': learning_rate,
            'text_col': text_col,
            'label_col': label_col,
        }
        # Both channels point at the same prefix: a single pickle is uploaded
        # there, and the frames built in this notebook carry an `is_valid`
        # column to tell training rows from validation rows.
        self.fit_arguments = {
            'train': str(self.input_path),
            'test': str(self.input_path),
        }

    def upload_bucket(self, df):
        """Write ``df`` as a protocol-4 pickle to ``<input_path>/df.pickle``."""
        s3_data = str(self.input_path / 'df.pickle')
        df.to_pickle(s3_data, protocol=4)

    def get_estimator(self):
        """Build the PyTorch estimator for this job and return it."""
        estimator = PyTorch(
            entry_point='train.py',
            source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/multiclass-fastai'),
            output_path=str(DEV_BUCKET / 'models/'),
            code_location=str(self.input_path),
            instance_type=self.instance_type,
            instance_count=1,
            role=self.role,
            framework_version='1.8',
            py_version='py36',
            hyperparameters=self.hyperparameters,
            # NOTE(review): `job_name` is also given to fit(); confirm the
            # estimator constructor actually consumes this keyword.
            job_name=self.job_name,
        )
        return estimator

    def run(self, df):
        """Upload ``df`` to S3, then start the training job.

        ``fit`` is called with ``wait=False``, so this method does not wait
        for the job to finish.
        """
        self.upload_bucket(df)
        estimator = self.get_estimator()
        estimator.fit(self.fit_arguments, job_name=self.job_name, wait=False)


## Pillars classification

In [9]:
# Pillars task: which column holds the text, which holds the labels,
# and the label vocabulary.
text_column = 'excerpt'
column = label_column = 'pillars'
classes = PILLARS

In [10]:
def process_multiclass(df, train, column, classes, only_english=True):
    """Prepare one split for multi-label training on ``column``.

    Rows whose ``column`` list is empty are dropped, the remaining label
    lists are joined into ``;``-separated strings, and an ``is_valid`` flag
    is added (``False`` for the training split, ``True`` otherwise).

    Parameters
    ----------
    df : pandas.DataFrame
        Split with a list-valued ``column`` and a ``language`` column.
    train : bool
        Whether ``df`` is the training split.
    column : str
        Name of the label column to filter on and to join.
    classes
        Currently unused; kept so existing calls keep working.
    only_english : bool
        When true, keep only rows whose ``language`` equals ``'en'``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # Filter first, then copy, so the column assignments below write to an
    # independent frame rather than to a slice of the input.
    labelled = df[df[column].apply(len) > 0].copy()
    # Join the column named by the `column` argument: the previous code read
    # the notebook-level `label_column` global here, which only worked while
    # both happened to name the same column.
    labelled[column] = labelled[column].apply(';'.join)

    labelled['is_valid'] = not train

    if only_english:
        labelled = labelled[labelled['language'] == 'en']

    return labelled
    
    

In [11]:
# Build the English-only pillar frames for both splits, then stack them;
# the `is_valid` column keeps track of which split each row came from.
train_df, val_df = (
    process_multiclass(split, is_train, column=column, classes=classes, only_english=True)
    for split, is_train in ((train, True), (val, False))
)
df = pd.concat([train_df, val_df])

In [12]:
# (rows, columns) of the concatenated train + validation frame.
df.shape

(29669, 17)

In [13]:
# (rows, columns) of the validation part alone.
val_df.shape

(2803, 17)

### Job

In [68]:
# Job configuration for the English pillars classifier.
sagemaker_pillars = SageMakerRun(
    job_end_name='pillars-en',
    text_col=text_column,
    label_col=label_column,
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    instance_type='ml.p3.2xlarge',
)

In [69]:
# Upload `df` to the job's S3 input prefix and start the training job.
sagemaker_pillars.run(df)

## Subpillar runs

In [164]:
# Subpillar labels removed from SUBPILLARS before the per-pillar runs.
subpillars_to_exclude = [
    'Capacities & Response->Response Gaps', 'People At Risk->Number Of People At Risk',
]

In [165]:
# SUBPILLARS minus the excluded labels, original order kept.
SUBPILLARS_FILTERED = [
    subpillar
    for subpillar in SUBPILLARS
    if subpillar not in subpillars_to_exclude
]

In [166]:
# Map each pillar to the retained subpillars whose name starts with it.
pillar_to_subpillars = dict(
    (pillar_name, [name for name in SUBPILLARS_FILTERED if name.startswith(pillar_name)])
    for pillar_name in PILLARS
)

In [167]:
def process_pillar_subpillars(df, train, subpillars, label_col='subpillars', only_english=True):
    """Prepare one split for training on a single pillar's subpillars.

    Labels in ``label_col`` are restricted to ``subpillars`` and joined into
    ``;``-separated strings. Empty rows are dropped at a different point
    depending on the split:

    * training split: rows are dropped when nothing is left *after* the
      restriction, so every kept row has at least one of ``subpillars``;
    * other splits: rows are dropped only when they had no labels at all
      *before* the restriction, so kept rows may end up with an empty label
      string.

    An ``is_valid`` column is added (``False`` for training, ``True``
    otherwise), and with ``only_english`` only rows whose ``language`` is
    ``'en'`` are returned. The input frame is not modified.
    """
    def drop_unlabelled(frame):
        # Keep rows whose label list is non-empty.
        return frame[frame[label_col].apply(len) > 0]

    prepared = df.copy()

    if not train:
        prepared = drop_unlabelled(prepared)

    prepared[label_col] = prepared[label_col].apply(
        lambda labels: [label for label in labels if label in subpillars]
    )

    if train:
        prepared = drop_unlabelled(prepared)
    prepared[label_col] = prepared[label_col].apply(';'.join)

    prepared['is_valid'] = not train

    if not only_english:
        return prepared
    return prepared[prepared['language'] == 'en']
    
    

### Humanitarian Conditions

In [168]:
# Label column passed as `label_col` to every subpillar job below.
subpillar_label_col = 'subpillars'

In [169]:
# Train/validation frames restricted to the 'Humanitarian Conditions' subpillars.
hum_condition_train_df, hum_condition_val_df = (
    process_pillar_subpillars(split, is_train, pillar_to_subpillars['Humanitarian Conditions'])
    for split, is_train in ((train, True), (val, False))
)
hum_condition_df = pd.concat([hum_condition_train_df, hum_condition_val_df])

In [170]:
# Job configuration for the 'Humanitarian Conditions' subpillar classifier.
sagemaker_hum_cond = SageMakerRun(
    job_end_name='hum-condition-en',
    text_col=text_column,
    label_col=subpillar_label_col,
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    instance_type='ml.g4dn.xlarge',
)

In [171]:
# Upload the combined frame to the job's S3 input prefix and start the job.
sagemaker_hum_cond.run(hum_condition_df)

### Capacities & Response

In [172]:
# Train/validation frames restricted to the 'Capacities & Response' subpillars.
cap_response_train_df, cap_response_val_df = (
    process_pillar_subpillars(split, is_train, pillar_to_subpillars['Capacities & Response'])
    for split, is_train in ((train, True), (val, False))
)
cap_response_df = pd.concat([cap_response_train_df, cap_response_val_df])

In [173]:
# Configure and start the 'Capacities & Response' subpillar job.
sagemaker_cap_response = SageMakerRun(
    job_end_name='cap-response-en',
    text_col=text_column,
    label_col=subpillar_label_col,
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    instance_type='ml.g4dn.xlarge',
)
sagemaker_cap_response.run(cap_response_df)

### Impact

In [174]:
# Train/validation frames restricted to the 'Impact' subpillars.
impact_train_df, impact_val_df = (
    process_pillar_subpillars(split, is_train, pillar_to_subpillars['Impact'])
    for split, is_train in ((train, True), (val, False))
)
impact_df = pd.concat([impact_train_df, impact_val_df])

In [175]:
# Configure and start the 'Impact' subpillar job.
sagemaker_impact = SageMakerRun(
    job_end_name='impact-en',
    text_col=text_column,
    label_col=subpillar_label_col,
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    instance_type='ml.p3.2xlarge',
)
sagemaker_impact.run(impact_df)

### Priority Interventions

In [176]:
# Train/validation frames restricted to the 'Priority Interventions' subpillars.
priority_int_train_df, priority_int_val_df = (
    process_pillar_subpillars(split, is_train, pillar_to_subpillars['Priority Interventions'])
    for split, is_train in ((train, True), (val, False))
)
priority_int_df = pd.concat([priority_int_train_df, priority_int_val_df])

In [177]:
# Configure and start the 'Priority Interventions' subpillar job.
sagemaker_priority_int = SageMakerRun(
    job_end_name='priority-int-en',
    text_col=text_column,
    label_col=subpillar_label_col,
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    instance_type='ml.p3.2xlarge',
)
sagemaker_priority_int.run(priority_int_df)

### People At Risk

In [178]:
# Train/validation frames restricted to the 'People At Risk' subpillars.
people_risk_train_df, people_risk_val_df = (
    process_pillar_subpillars(split, is_train, pillar_to_subpillars['People At Risk'])
    for split, is_train in ((train, True), (val, False))
)
people_risk_df = pd.concat([people_risk_train_df, people_risk_val_df])

In [179]:
# Configure and start the 'People At Risk' subpillar job.
sagemaker_people_risk = SageMakerRun(
    job_end_name='people-risk-en',
    text_col=text_column,
    label_col=subpillar_label_col,
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    instance_type='ml.p2.xlarge',
)
sagemaker_people_risk.run(people_risk_df)

### Priority Needs

In [180]:
# Train/validation frames restricted to the 'Priority Needs' subpillars.
priority_needs_train_df, priority_needs_val_df = (
    process_pillar_subpillars(split, is_train, pillar_to_subpillars['Priority Needs'])
    for split, is_train in ((train, True), (val, False))
)
priority_needs_df = pd.concat([priority_needs_train_df, priority_needs_val_df])

In [182]:
# Configure and start the 'Priority Needs' subpillar job.
sagemaker_priority_needs = SageMakerRun(
    job_end_name='priority-needs-en',
    text_col=text_column,
    label_col=subpillar_label_col,
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    instance_type='ml.p3.2xlarge',
)
sagemaker_priority_needs.run(priority_needs_df)

In [125]:
# Show the pillar names (these are the keys of `pillar_to_subpillars`).
PILLARS

['Humanitarian Conditions',
 'Capacities & Response',
 'Impact',
 'Priority Interventions',
 'People At Risk',
 'Priority Needs']

## Sagemaker Prep

### Session

In [11]:
# Execution role and session for the manually configured job below.
role = 'AmazonSageMaker-ExecutionRole-20210519T102514'
sess = sagemaker.Session(default_bucket=DEV_BUCKET.name)

### Bucket upload

In [12]:
# Flip `sample` to True to run on 100 randomly drawn rows instead of all data.
sample = False
if sample:
    df = df.sample(n=100)

# Each job gets its own S3 input prefix, keyed by the generated job name.
job_name = f"pytorch-{formatted_time()}-fastai-5ep-english"
input_path = DEV_BUCKET / 'training' / 'input_data' / job_name
s3_data = str(input_path / 'df.pickle')

# Write the frame to that prefix as a protocol-4 pickle.
df.to_pickle(s3_data, protocol=4)

### Estimator Definition

In [13]:
# Instance types used across the jobs in this notebook (reference list).
instances = ['ml.p2.xlarge', 'ml.p3.2xlarge', 'ml.g4dn.xlarge']

In [14]:
# Show the S3 prefix the pickle was uploaded to.
input_path

S3Path('s3://sagemaker-deep-experiments-dev/training/input_data/pytorch-2021-06-07-11-32-34-795-fastai-5ep-english')

In [15]:

# Hyperparameters handed to the estimator for the pillars task.
hyperparameters = dict(
    epochs=3,
    train_batch_size=64,
    learning_rate=0.02,
    text_col=text_column,
    label_col=label_column,
)

# PyTorch estimator running `train.py` from the multiclass-fastai scripts;
# code is staged under the job's input prefix and output goes to `models/`.
estimator = PyTorch(
    role=role,
    entry_point='train.py',
    source_dir=str(SCRIPTS_TRAINING_PATH / 'stefano/multiclass-fastai'),
    framework_version='1.8',
    py_version='py36',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    hyperparameters=hyperparameters,
    output_path=str(DEV_BUCKET / 'models/'),
    code_location=str(input_path),
    job_name=job_name,
)

In [16]:
# Both channels read from the same S3 prefix.
fit_arguments = {channel: str(input_path) for channel in ('train', 'test')}

In [17]:
# Start the training job; wait=False, so this cell does not wait for it to finish.
estimator.fit(fit_arguments, job_name=job_name, wait=False)

In [None]:
from sagemaker import TrainingJobAnalytics

# Captured metrics can be accessed as a Pandas dataframe
# NOTE(review): this rebinds `df`, the name earlier cells use for the training
# data that gets uploaded; re-running those cells after this one would pick up
# the metrics frame instead.
df = TrainingJobAnalytics(training_job_name=estimator.latest_training_job.name).dataframe()
df.head(10)

In [None]:
# Display the whole frame currently bound to `df`
# (presumably the metrics frame from the previous cell -- this cell has no recorded run).
df