Using kernel `conda_pytorch_latest_p36`

# Import

In [1]:
# !pip install transformers
# !pip install datasets

In [2]:
import sys
sys.path.append('../../../')

In [3]:
import ast
import os
import pickle
import random
from pathlib import Path

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from sklearn.metrics import classification_report, precision_recall_fscore_support, precision_score, f1_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer
import torch

In [5]:
from deep.constants import *

# Data

In [32]:
def preprocessing(df):
    """Parse the stringified list columns of a raw data split.

    The ``sectors``, ``pillars`` and ``subpillars`` columns are read from CSV as
    text holding Python list literals. Each cell is converted back into a real
    list, and repeated pillar names within a row are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string-valued ``sectors``, ``pillars`` and ``subpillars``
        columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the three columns hold lists.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a plain Python literal.
    """
    df = df.copy()
    # ast.literal_eval only accepts literals, so unlike eval() it cannot execute
    # code embedded in a data file.
    for column in ('sectors', 'pillars', 'subpillars'):
        df[column] = df[column].apply(ast.literal_eval)
    # De-duplicate while keeping first-seen order; list(set(...)) would produce an
    # ordering that can change from one interpreter run to the next.
    df['pillars'] = df['pillars'].apply(lambda names: list(dict.fromkeys(names)))
    return df

In [34]:
# Read the three v0.4.3 splits from LATEST_DATA_PATH and parse their list columns.
train, val, test = (
    preprocessing(pd.read_csv(LATEST_DATA_PATH / csv_name))
    for csv_name in (
        'data_v0.4.3_train.csv',
        'data_v0.4.3_val.csv',
        'data_v0.4.3_test.csv',
    )
)

def process(df, mlb=None):
    """Build the model-ready frame (``texts``, ``labels``) for one data split.

    Parameters
    ----------
    df : pandas.DataFrame
        Split with an ``excerpt`` column and a ``sectors`` column of lists.
        It is not modified.
    mlb : MultiLabelBinarizer, optional
        An already fitted binarizer. When given, it is only used to transform
        ``df`` so that every split shares the same label columns. When omitted,
        a new binarizer is fitted on ``df`` itself (the previous behaviour).

    Returns
    -------
    pandas.DataFrame
        Two columns: ``texts`` (the excerpts) and ``labels`` (one multi-hot
        vector per row).
    """
    df = df.copy()

    sector_lists = list(df['sectors'])
    if mlb is None:
        # Legacy path: the label columns are derived from this frame alone.
        mlb = MultiLabelBinarizer()
        labels = mlb.fit_transform(sector_lists)
    else:
        labels = mlb.transform(sector_lists)
    df['labels'] = list(labels)

    df = df[['excerpt', 'labels']]
    df = df.rename(columns={'excerpt': 'texts'})

    return df


# Fit the binarizer once on the training sectors and reuse it for every split.
# Fitting per split would give val/test a different number or order of label
# columns whenever a sector is absent from that split.
sector_binarizer = MultiLabelBinarizer().fit(list(train['sectors']))

train_df = process(train, mlb=sector_binarizer)
val_df = process(val, mlb=sector_binarizer)
test_df = process(test, mlb=sector_binarizer)

In [44]:
# Fit a binarizer on the training sectors; `mlb` is inspected in the next cell.
train_sector_lists = list(train['sectors'])
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(train_sector_lists)

In [45]:
# Sector class names learned by the binarizer fitted on the training split.
mlb.classes_

array(['Agriculture', 'Cross', 'Education', 'Food Security', 'Health',
       'Livelihoods', 'Logistics', 'Nutrition', 'Protection', 'Shelter',
       'WASH'], dtype=object)

In [35]:
# Display the processed training frame (`texts` and `labels` columns).
train_df

Unnamed: 0,texts,labels
0,Market monitoring by the World Food Programme ...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
1,Quarantine Facilities: ninety-three shelters i...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
2,"Within dimensions, markets are broadly operati...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
3,Frontline aid workers face a heightened risk o...,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0]"
4,[COVID] TRC is currently using its different c...,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
...,...,...
90648,"[16th Mar 2021,North east Nigeria]The governme...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
90649,"[16th Mar 2021,North east Nigeria] Impact on s...","[0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0]"
90650,A reported 14 per cent of women aged 15-49 had...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"
90651,La alternancia no es solo plantear cuáles niño...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]"


In [9]:
def get_freq_matrix(df):
    """Count (pillar, sub-pillar) co-occurrences across the rows of ``df``.

    Both list columns are exploded, so a row contributes one count to every
    combination of its pillars and sub-pillars. The count for a pair is the
    number of non-null ``sectors`` values among its exploded rows.

    Returns a DataFrame indexed by sub-pillar with one integer column per
    pillar; combinations that never occur are 0.
    """
    exploded = df.explode('pillars')
    exploded = exploded.explode('subpillars')
    pair_counts = exploded.groupby(['pillars', 'subpillars']).count()
    pair_counts = pair_counts[['sectors']]
    # Move the pillar level into the columns, then keep the `sectors` counts.
    wide = pair_counts.unstack(level=0)
    return wide['sectors'].fillna(0).astype(int)

In [10]:
# Pillar x sub-pillar count matrix for each of the three splits.
freq, freq_val, freq_test = (
    get_freq_matrix(split) for split in (train, val, test)
)

In [14]:
# Training rows tagged with exactly one pillar.
# Note: the name `t` is reused for a different subset in another cell.
t = train.loc[train['pillars'].map(len) == 1]

In [11]:
# Pillar x sub-pillar count matrix for the training split.
freq

pillars,Capacities & Response,Humanitarian Conditions,Impact,People At Risk,Priority Interventions,Priority Needs
subpillars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capacities & Response->International Response,5708,335,285,54,18,19
Capacities & Response->National Response,7627,1429,673,133,22,73
Capacities & Response->Number Of People Reached,2258,129,75,9,8,5
Capacities & Response->Response Gaps,67,38,26,7,0,0
Humanitarian Conditions->Coping Mechanisms,69,4132,990,473,11,76
Humanitarian Conditions->Living Standards,1188,19903,5941,2281,84,351
Humanitarian Conditions->Number Of People In Need,111,2982,621,325,12,75
Humanitarian Conditions->Physical And Mental Well Being,563,11952,3061,1404,27,122
Impact->Driver/Aggravating Factors,396,4792,12138,1376,46,133
Impact->Impact On People,78,1684,7788,526,51,105


In [19]:
# Training rows that carry no pillar at all.
# Note: this rebinds `t`, which another cell uses for the single-pillar subset.
t = train.loc[train['pillars'].map(len) == 0]

In [21]:
# Inspect the rows currently bound to `t`.
t

Unnamed: 0,entry_id,lead_id,project_id,project_title,analysis_framework_id,excerpt,dropped_excerpt,created_by_id,modified_by_id,verified,verification_last_changed_by_id,sectors,pillars,subpillars
4,170866,37673,1142,IFRC Turkey,699,[COVID] TRC is currently using its different c...,,2233,2233,False,,[Health],[],[]
10,163644,32462,1183,UNHCR Colombia,829,In the framework of the anti-xenophobia campai...,,2374,2374,False,,[Protection],[],[]
11,163645,32462,1183,UNHCR Colombia,829,Information on helplines was disseminated to 2...,,2374,2374,False,,[Cross],[],[]
22,166788,39480,1142,IFRC Turkey,699,The Turkish Health Ministry on Monday reported...,,2231,2231,False,,[Health],[],[]
26,167578,40234,1142,IFRC Turkey,699,"With the 67 new fatalities, the death toll fro...","With the 67 new fatalities, the death toll fro...",26,26,False,,[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90644,286746,50860,2466,GIMAC Sudan,1465,Strengthen poster distribution and RCCE effort...,,2741,2743,True,2743.0,[Cross],[],[]
90645,248726,48228,2332,GIMAC Chad,1465,Recommandations : Promouvoir de l’éducation d’...,Promouvoir de l’éducation d’urgence dans les s...,2720,2446,True,488.0,[Education],[],[]
90646,275475,50599,2334,GIMAC Cameroon,1465,During the reporting period [1 to 15 November ...,,2854,2720,True,488.0,[],[],[]
90650,268927,49888,2331,GIMAC Somalia,1465,A reported 14 per cent of women aged 15-49 had...,,2741,2272,True,488.0,[Protection],[],[]


In [17]:
# Location of the pillar / sub-pillar matching workbook. The original absolute
# path is kept as the fallback so existing runs behave the same; set the
# PILLARS_MATCHING_XLSX environment variable to run the notebook elsewhere.
MATCHING_XLSX_PATH = os.environ.get(
    'PILLARS_MATCHING_XLSX',
    '/Users/stefano/Downloads/Pillars and Subpillars Matching.xlsx',
)

# Keep only the two final-name columns, give them the names used in the data
# splits, and add a constant column to sum over. Building the frame in one chain
# avoids assigning into a slice of the loaded sheet.
matching = (
    pd.read_excel(MATCHING_XLSX_PATH, sheet_name='Matching - duplicate')
    .loc[:, ['Final Pillar Name', 'Final Sub-pillar Name']]
    .rename(columns={'Final Pillar Name': 'pillars', 'Final Sub-pillar Name': 'subpillars'})
    .assign(ones=1)
)

# Repair the misspelled / lower-cased pillar names. regex=False makes these plain
# substring replacements regardless of the pandas default for `regex`.
matching['pillars'] = (
    matching['pillars']
    .str.replace('Humanitatian conditions', 'Humanitarian Conditions', regex=False)
    .str.replace('impact', 'Impact', regex=False)
)

# Number of workbook rows per (pillar, sub-pillar) pair, with pillars as columns.
matching_freq = matching.groupby(['pillars', 'subpillars']).sum().unstack(level=0)['ones']

In [15]:
# Count matrix restricted to training rows tagged with exactly one pillar.
# The subset is built inline instead of being read from `t`: another cell rebinds
# `t` to the zero-pillar rows, so the result would otherwise depend on which
# cell happened to run last.
get_freq_matrix(train[train.pillars.apply(len) == 1])

pillars,Capacities & Response,Humanitarian Conditions,Impact,People At Risk,Priority Interventions,Priority Needs
subpillars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Capacities & Response->International Response,1983,0,0,0,0,0
Capacities & Response->National Response,2345,0,0,0,0,0
Capacities & Response->Number Of People Reached,775,0,0,0,0,0
Capacities & Response->Response Gaps,3,0,0,0,0,0
Humanitarian Conditions->Coping Mechanisms,0,720,0,0,0,0
Humanitarian Conditions->Living Standards,0,7603,0,0,0,0
Humanitarian Conditions->Number Of People In Need,0,640,0,0,0,0
Humanitarian Conditions->Physical And Mental Well Being,0,4463,0,0,0,0
Impact->Driver/Aggravating Factors,0,0,1790,0,0,0
Impact->Impact On People,0,0,1778,0,0,0


In [18]:
# Display the pillar x sub-pillar counts built from the matching workbook.
matching_freq

pillars,Capacities & Response,Humanitarian Conditions,Impact,People at risk,Priority interventions,Priority needs
subpillars,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Coping mechanisms,,9.0,,,,
Driver/aggravating factors,,,10.0,,,
Expressed by humanitarian staff,,,,,2.0,3.0
Expressed by population,,,,,2.0,5.0
Impact on people,,,1.0,,,
Impact on people or impact on services,,,13.0,,,
Impact on services,,,14.0,,,
Impact on systems and services,,,3.0,,,
International response,10.0,,,,,
Living standards,,10.0,,,,


In [None]:
# Two groups of `matching_freq` pillar column labels, used to select columns below.
# NOTE(review): the names suggest pillars that are kept vs. dropped - confirm intent.
left = ['Impact','People at risk','Priority needs']
removed = ['Capacities & Response','Humanitarian Conditions','Priority interventions']

In [None]:
# Sub-pillar labels with at least one non-null count under a `removed` pillar column.
list(matching_freq[removed].dropna(axis=0, how='all').index)

In [None]:
# Rows of `matching_freq` with at least one non-null count under a `left` pillar column.
matching_freq[left].dropna(axis=0, how='all')

In [None]:
# Count matrix for whichever subset `t` currently holds.
# NOTE(review): `t` is assigned in two different cells (one-pillar rows and
# zero-pillar rows), so this result depends on execution order - confirm which is meant.
get_freq_matrix(t)

In [None]:
# Count matrix for the training split.
freq

In [None]:
# Count matrix for the validation split.
freq_val

In [None]:
# Count matrix for the test split.
freq_test

In [None]:
# Per-pillar totals: sum of the training counts down each pillar column.
freq.sum(axis=0)