Using kernel `conda_pytorch_latest_p36`

# Import

In [1]:
# !pip install transformers
# !pip install datasets

In [2]:
import sys
sys.path.append('../../../')

In [3]:
from pathlib import Path
import os
import random
import pickle

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from sklearn.metrics import classification_report, precision_recall_fscore_support, precision_score, f1_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer
import torch

In [5]:
from deep.constants import *

# Data

In [12]:
def preprocessing(df, list_columns=('sectors', 'pillars', 'subpillars')):
    """Parse the stringified list columns of a raw split into Python objects.

    The CSV files store each tag column as the textual repr of a Python
    literal (e.g. "['Health', 'Food']"). This converts those strings back
    into real objects without modifying the caller's frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame as returned by ``pd.read_csv``; must contain every column
        named in ``list_columns``.
    list_columns : iterable of str, optional
        Columns whose cells hold Python-literal strings. Defaults to the
        three tag columns parsed by the original implementation.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns parsed.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a valid Python literal.
    """
    # Local import keeps this cell self-contained on a fresh kernel.
    import ast

    df = df.copy()
    for column in list_columns:
        # ast.literal_eval only accepts literals, so a crafted CSV cell cannot
        # execute code the way the built-in eval() would.
        df[column] = df[column].apply(ast.literal_eval)
    return df

In [13]:
# Load the three v0.4.2 splits and parse their stringified list columns.
train, val, test = (
    preprocessing(pd.read_csv(LATEST_DATA_PATH / csv_name))
    for csv_name in (
        'data_v0.4.2_train.csv',
        'data_v0.4.2_val.csv',
        'data_v0.4.2_test.csv',
    )
)

def process(df, mlb=None):
    """Build the model-ready frame: excerpt text plus multi-hot sector labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Preprocessed split with an ``excerpt`` column and a ``sectors``
        column holding a list of sector names per row.
    mlb : sklearn.preprocessing.MultiLabelBinarizer, optional
        An already-fitted binarizer. When given, it is only used to
        ``transform`` the sectors, so every split shares the same label
        vocabulary and column order. When omitted, a new binarizer is fitted
        on ``df`` itself (the original behaviour).

    Returns
    -------
    pandas.DataFrame
        Two columns: ``texts`` (the excerpt) and ``labels`` (one multi-hot
        array per row).
    """
    df = df.copy()

    if mlb is None:
        # Legacy path: the vocabulary is derived from this frame alone, so the
        # label columns are only comparable with other frames that happen to
        # contain exactly the same set of sectors.
        mlb = MultiLabelBinarizer()
        labels = mlb.fit_transform(list(df['sectors']))
    else:
        labels = mlb.transform(list(df['sectors']))
    df['labels'] = list(labels)

    df = df[['excerpt', 'labels']]
    df = df.rename(columns={'excerpt': 'texts'})

    return df


# Fit the sector vocabulary once, on the training split, and reuse it for
# validation and test. Fitting a separate binarizer per split would let the
# label vector width/order differ whenever a split is missing a sector.
sector_mlb = MultiLabelBinarizer().fit(list(train['sectors']))

train_df = process(train, mlb=sector_mlb)
val_df = process(val, mlb=sector_mlb)
test_df = process(test, mlb=sector_mlb)

In [14]:
# Display the processed training frame: `texts` plus the multi-hot `labels` built from `sectors`.
train_df

Unnamed: 0,texts,labels
0,Dispatch and Referral Unit (DRU): IOM continue...,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
1,Market monitoring by the World Food Programme ...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
2,The RIRT coordination system has been formaliz...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
3,"Within dimensions, markets are broadly operati...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
4,A total of 269 awareness sessions and sensitiz...,"[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0]"
...,...,...
90648,"[16th Mar 2021,North east Nigeria]The governme...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
90649,"[16th Mar 2021,North east Nigeria] Impact on s...","[0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0]"
90650,A reported 14 per cent of women aged 15-49 had...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
90651,La alternancia no es solo plantear cuáles niño...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [68]:
def get_freq_matrix(df):
    """Count pillar/subpillar co-occurrences in ``df``.

    Each row's ``pillars`` list and ``subpillars`` list are exploded in turn,
    so a row contributes one count to every (pillar, subpillar) combination
    it carries. The count is taken over the ``sectors`` column.

    Returns a DataFrame indexed by subpillar with one integer column per
    pillar; combinations that never occur are 0.
    """
    exploded = df.explode('pillars')
    exploded = exploded.explode('subpillars')

    pair_counts = exploded.groupby(['pillars', 'subpillars']).count()
    pair_counts = pair_counts[['sectors']]

    # Move the pillar level into the columns, then drop the 'sectors' level.
    matrix = pair_counts.unstack(level=0)['sectors']
    return matrix.fillna(0).astype(int)

In [69]:
# Pillar x subpillar count matrices for the train / validation / test splits.
freq, freq_val, freq_test = (
    get_freq_matrix(split_df) for split_df in (train, val, test)
)

In [82]:
# Training rows tagged with exactly one pillar.
t = train.loc[train['pillars'].map(len) == 1]

In [115]:
# Location of the pillar / sub-pillar matching workbook. The default is the
# original author's local path; set PILLARS_MATCHING_XLSX to point at the file
# on any other machine instead of editing this cell.
MATCHING_XLSX_PATH = Path(os.environ.get(
    'PILLARS_MATCHING_XLSX',
    '/Users/stefano/Downloads/Pillars and Subpillars Matching.xlsx',
))

# Build the frame in one chain so no column is assigned onto a slice of the
# raw sheet (which triggers SettingWithCopyWarning).
matching = (
    pd.read_excel(MATCHING_XLSX_PATH, sheet_name='Matching - duplicate')
    [['Final Pillar Name', 'Final Sub-pillar Name']]
    .rename(columns={
        'Final Pillar Name': 'pillars',
        'Final Sub-pillar Name': 'subpillars',
    })
    .assign(ones=1)  # one unit per sheet row, summed below
)

# Normalise pillar spellings found in the sheet. These are literal
# substitutions, so regex matching is switched off explicitly.
matching['pillars'] = (
    matching['pillars']
    .str.replace('Humanitatian conditions', 'Humanitarian Conditions', regex=False)
    .str.replace('impact', 'Impact', regex=False)
)

# Rows: sub-pillars, columns: pillars, values: number of sheet rows for the
# pair (NaN where the pair never occurs).
matching_freq = matching.groupby(['pillars', 'subpillars']).sum().unstack(level=0)['ones']

In [126]:
# Pillar names used below to select columns of `matching_freq`.
# NOTE(review): presumably `left` = pillars kept and `removed` = pillars dropped
# from the taxonomy — confirm.
left = ['Impact','People at risk','Priority needs']
removed = ['Capacities & Response','Humanitarian Conditions','Priority interventions']

In [130]:
# Sub-pillars with at least one match under any of the `removed` pillars.
matching_freq.loc[:, removed].dropna(axis='index', how='all').index.tolist()

['Coping mechanisms',
 'Expressed by humanitarian staff',
 'Expressed by population',
 'International response',
 'Living standards',
 'National response',
 'Number of People in Need',
 'Number of people reached',
 'Physical and mental well being',
 'Response gaps']

In [124]:
# Matching counts for the `left` pillars, dropping sub-pillars with no match in any of them.
matching_freq.loc[:, left].dropna(axis='index', how='all')

pillars,Impact,People at risk,Priority needs
subpillars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Driver/aggravating factors,10.0,,
Expressed by humanitarian staff,,,3.0
Expressed by population,,,5.0
Impact on people,1.0,,
Impact on people or impact on services,13.0,,
Impact on services,14.0,,
Impact on systems and services,3.0,,
Number of people affected,4.0,,
Number of people at risk,,2.0,
Risk and vulnerabilities,,12.0,


In [86]:
# Pillar x subpillar counts restricted to `t` (training rows with exactly one pillar).
get_freq_matrix(t)

pillars,Impact,People At Risk,Priority Needs
subpillars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Coping Mechanisms,0,1724,0
Driver/Aggravating Factors,3705,0,0
Expressed By Humanitarian Staff,0,0,1699
Expressed By Population,0,0,737
Impact On People,3217,0,0
Impact On People Or Impact On Services,854,0,0
Impact On Services,1617,0,0
Impact On Systems And Services,4157,0,0
International Response,3302,0,0
Living Standards,0,10724,0


In [81]:
# Pillar x subpillar co-occurrence counts for the training split.
freq

pillars,Impact,People At Risk,Priority Needs
subpillars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Coping Mechanisms,936,3036,119
Driver/Aggravating Factors,9790,5914,345
Expressed By Humanitarian Staff,313,417,2267
Expressed By Population,143,383,1164
Impact On People,5192,1975,151
Impact On People Or Impact On Services,2243,1160,32
Impact On Services,4147,1761,116
Impact On Systems And Services,5786,1433,108
International Response,4083,367,28
Living Standards,6056,19309,1409


In [71]:
# Pillar x subpillar co-occurrence counts for the validation split.
freq_val

pillars,Impact,People At Risk,Priority Needs
subpillars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Coping Mechanisms,96,466,20
Driver/Aggravating Factors,1072,698,61
Expressed By Humanitarian Staff,29,64,256
Expressed By Population,14,54,116
Impact On People,577,243,11
Impact On People Or Impact On Services,172,51,15
Impact On Services,323,125,35
Impact On Systems And Services,667,179,9
International Response,458,28,5
Living Standards,614,2168,153


In [72]:
# Pillar x subpillar co-occurrence counts for the test split.
freq_test

pillars,Impact,People At Risk,Priority Needs
subpillars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Coping Mechanisms,94,399,12
Driver/Aggravating Factors,1190,743,75
Expressed By Humanitarian Staff,34,49,252
Expressed By Population,34,81,140
Impact On People,642,281,21
Impact On People Or Impact On Services,287,74,17
Impact On Services,480,104,31
Impact On Systems And Services,727,224,11
International Response,497,25,5
Living Standards,672,2427,195


In [57]:
# Sum each pillar column of the training matrix: total count per pillar across all subpillars.
freq.sum(axis=0)

pillars
Impact            51833
People At Risk    60781
Priority Needs     9551
dtype: int64