Using kernel `conda_pytorch_latest_p36`

In [1]:
# !pip install fastai

In [2]:
# Sector display names (11 entries).
# NOTE(review): presumably position i names the integer sector id i used by the
# per-sector processing further down — confirm against the data preparation step.
# NOTE(review): 'Agricolture' looks like a misspelling of 'Agriculture'; kept
# verbatim in case something matches on this exact string.
classes = [
    'Agricolture', 'Cross', 'Education', 'Food Security',
    'Health', 'Livelihoods', 'Logistics', 'Nutrition',
    'Protection', 'Shelter', 'Wash',
]

In [3]:
import torch

In [4]:
import ast
import os
import pickle
import random
from pathlib import Path

In [5]:
from fastai.text.all import *

In [6]:
# Directory the train/test sentence CSVs are read from in the next cell.
data = Path('data_prep/final_data/en/')
# Raw entries table. NOTE(review): not referenced by any other cell visible in
# this notebook — confirm it is still needed before keeping the load.
raw = pd.read_csv('data_prep/data/entries_raw.csv')

In [7]:
# Load the sentence-level train and test splits from the `data` directory.
# process_for_sector() below reads the `is_relevant` and `sector_ids` columns of
# these frames, and the dataloaders read `sentence_text`.
train = pd.read_csv(data / 'sentences_en_train.csv')
test = pd.read_csv(data / 'sentences_en_test.csv')

In [8]:
def process_for_sector(df, sector, train, seed=None):
    """Build a shuffled one-vs-rest frame for a single sector.

    Steps:
      1. keep only rows with ``is_relevant == 1``;
      2. parse each ``sector_ids`` cell from its string form into a Python
         container and drop rows whose container is empty;
      3. overwrite ``sector_ids`` with a binary label: 1 when ``sector`` is
         contained in the row's ids, 0 otherwise;
      4. add an ``is_valid`` column (``False`` for training rows, ``True``
         otherwise) and shuffle the rows.

    Args:
        df: DataFrame with at least ``is_relevant`` and ``sector_ids`` columns.
            Assumes every ``sector_ids`` cell is the string form of a Python
            literal such as ``"[0, 3]"`` — TODO confirm against the CSV export.
        sector: Sector id to treat as the positive class.
        train: Truthy when ``df`` is the training split; controls ``is_valid``.
        seed: Optional ``random_state`` for the shuffle. ``None`` (the default)
            keeps the shuffle unseeded.

    Returns:
        A new DataFrame with a fresh 0..n-1 index; ``df`` itself is not modified.

    Raises:
        ValueError / SyntaxError: if a ``sector_ids`` cell is not a plain
            Python literal.
    """
    # Explicit copy so the column assignment below never writes through a
    # slice of the caller's frame (and does not emit SettingWithCopyWarning).
    relevant = df[df.is_relevant == 1].copy()

    # literal_eval only accepts Python literals, so a crafted CSV cell cannot
    # execute code the way a bare eval() would.
    relevant['sector_ids'] = relevant['sector_ids'].apply(ast.literal_eval)
    relevant = relevant[relevant['sector_ids'].apply(len) > 0]

    # One pass over the parsed ids yields the 1/0 label directly instead of
    # splitting into positive/negative frames and concatenating them again.
    labels = relevant['sector_ids'].apply(lambda ids: sector in ids).astype('int64')
    labelled = relevant.assign(sector_ids=labels, is_valid=not train)

    return labelled.sample(frac=1, random_state=seed).reset_index(drop=True)

# Use fastai

In [None]:
# Build the train/validation frames for one sector as a quick sanity check.
# `train` is a required argument of process_for_sector(); omitting it raised a
# TypeError, so it is passed explicitly, matching the per-sector loop below.
sector = 0
train_df = process_for_sector(train, sector, train=True)
test_df = process_for_sector(test, sector, train=False)

In [None]:
# Display the processed test frame for the sector chosen above.
test_df

In [None]:
# Build the single-sector dataloaders and learner in this cell so it runs on a
# fresh kernel; previously `learn` only existed once the per-sector loop further
# down had been executed. Settings mirror that loop.
dls = TextDataLoaders.from_df(
    pd.concat([train_df, test_df]),
    text_col='sentence_text',
    label_col='sector_ids',
    valid_col='is_valid',
    is_lm=False,
    seq_len=72,
    bs=64,
)
learn = text_classifier_learner(
    dls,
    AWD_LSTM,
    drop_mult=0.5,
    metrics=[accuracy, Recall(), Precision(), F1Score()],
)
learn.fine_tune(1, 1e-2)

In [None]:
# Inspect the second entry of the recorder's metrics list.
# NOTE(review): which metric sits at index 1 depends on the `metrics=` order used
# when `learn` was created — confirm it is the one intended here.
learn.recorder.metrics[1]

In [None]:
# Collect predictions from the fitted learner.
# NOTE(review): called without arguments this presumably scores the validation
# split and returns (predictions, targets) — confirm against the fastai docs.
pred = learn.get_preds()

In [None]:
# Output directory for the per-sector prediction pickles written by the loop below.
# NOTE(review): absolute, machine-specific path — it will not exist on other
# hosts; consider making it configurable.
base = Path('/home/ec2-user/SageMaker/experiments-dfs/models/fastai')

In [None]:
# Fail fast: make sure the output directory exists before any training starts,
# otherwise the first pickle write would fail only after a full fine-tune.
base.mkdir(parents=True, exist_ok=True)

# Train one binary text classifier per sector id and pickle its predictions.
for sector in range(11):
    print(f'Processing sector {sector}')
    train_df = process_for_sector(train, sector, train=True)
    test_df = process_for_sector(test, sector, train=False)
    # Rows built from `test` carry is_valid=True, so `valid_col` below turns
    # them into the validation split.
    df = pd.concat([train_df, test_df])
    print(train_df.sector_ids.describe())
    print(test_df.sector_ids.describe())
    # Tokenize the sentences and build the dataloaders for a classifier
    # (not for a language model: is_lm is False).
    dls = TextDataLoaders.from_df(
        df,
        text_col='sentence_text',
        label_col='sector_ids',
        valid_col='is_valid',
        is_lm=False,    # classification dataloaders, not language-model ones
        seq_len=72,     # sequence length handed to the dataloaders
        bs=64,          # batch size
    )
    learn = text_classifier_learner(
        dls,
        AWD_LSTM,
        drop_mult=0.5,
        metrics=[accuracy, Recall(), Precision(), F1Score()]
    )
    learn.fine_tune(4, 1e-2)
    pred = learn.get_preds()
    # One pickle per sector, named after the sector id.
    with open(base / f'{sector}.pickle', 'wb') as f:
        pickle.dump(pred, f)