Using kernel `conda_pytorch_latest_p36`

# Imports

In [1]:
# !pip install transformers
# !pip install datasets

In [2]:
import sys
# Add the directory three levels up to the module search path.
# NOTE(review): presumably this is what makes the `deep` package importable
# in the cells below — confirm against the repository layout.
sys.path.append('../../../')

In [3]:
from pathlib import Path
import os
import random
import pickle

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from sklearn.metrics import classification_report, precision_recall_fscore_support, precision_score, f1_score, recall_score
from sklearn.preprocessing import MultiLabelBinarizer
import torch

In [5]:
from deep.constants import *

# Data

In [6]:
def preprocessing(df):
    """Parse the stringified label columns of a raw split into Python lists.

    The 'sectors', 'pillars' and 'subpillars' columns are expected to hold the
    text form of Python list literals (e.g. "['Health', 'Logistics']").

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 'sectors', 'pillars' and 'subpillars' columns.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the three columns hold real lists and the
        'pillars' lists contain no duplicates.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a plain Python literal.
    """
    # literal_eval only accepts literals, so a crafted CSV cell cannot execute
    # code the way the builtin eval() would. Imported locally so this cell is
    # self-contained.
    from ast import literal_eval

    df = df.copy()
    for column in ('sectors', 'pillars', 'subpillars'):
        df[column] = df[column].apply(literal_eval)

    # dict.fromkeys drops repeated pillars while keeping first-seen order,
    # which (unlike list(set(...))) gives the same ordering on every run.
    df['pillars'] = df['pillars'].apply(lambda labels: list(dict.fromkeys(labels)))
    return df

In [7]:
# Read the v0.4.3 train / val / test CSVs and parse their label columns.
train, val, test = (
    preprocessing(pd.read_csv(LATEST_DATA_PATH / csv_name))
    for csv_name in (
        'data_v0.4.3_train.csv',
        'data_v0.4.3_val.csv',
        'data_v0.4.3_test.csv',
    )
)

def process(df, column, classes):
    """Build a two-column (texts, labels) frame for one label column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'excerpt' column and the list-valued ``column``.
        It is not modified.
    column : str
        Name of the column holding each row's list of labels.
    classes : sequence
        Label vocabulary handed to ``MultiLabelBinarizer(classes=...)``.

    Returns
    -------
    pandas.DataFrame
        Columns 'texts' (the original 'excerpt') and 'labels' (one row of the
        binarizer's output per record).
    """
    binarizer = MultiLabelBinarizer(classes=classes)
    encoded = binarizer.fit_transform(list(df[column]))

    out = df.copy()
    # list(...) splits the 2-D result into one array per row so that it is
    # stored as a single column.
    out['labels'] = list(encoded)

    return out[['excerpt', 'labels']].rename(columns={'excerpt': 'texts'})

# Sector-level (texts, labels) frames for each split, in train / val / test order.
train_df, val_df, test_df = (
    process(split_df, 'sectors', classes=SECTORS)
    for split_df in (train, val, test)
)

## Visualize

In [9]:
# Display the pillar label names.
# NOTE(review): presumably supplied by the star import from deep.constants — confirm.
PILLARS

['Humanitarian Conditions',
 'Capacities & Response',
 'Impact',
 'Priority Interventions',
 'People At Risk',
 'Priority Needs']

In [14]:
# Display the subpillar label names (formatted as 'Pillar->Subpillar').
# NOTE(review): presumably supplied by the star import from deep.constants — confirm.
SUBPILLARS

['Capacities & Response->International Response',
 'Capacities & Response->National Response',
 'Capacities & Response->Number Of People Reached',
 'Capacities & Response->Response Gaps',
 'Humanitarian Conditions->Coping Mechanisms',
 'Humanitarian Conditions->Living Standards',
 'Humanitarian Conditions->Number Of People In Need',
 'Humanitarian Conditions->Physical And Mental Well Being',
 'Impact->Driver/Aggravating Factors',
 'Impact->Impact On People',
 'Impact->Impact On People Or Impact On Services',
 'Impact->Impact On Services',
 'Impact->Impact On Systems And Services',
 'Impact->Number Of People Affected',
 'People At Risk->Number Of People At Risk',
 'People At Risk->Risk And Vulnerabilities',
 'Priority Interventions->Expressed By Humanitarian Staff',
 'Priority Interventions->Expressed By Population',
 'Priority Needs->Expressed By Humanitarian Staff',
 'Priority Needs->Expressed By Population']

In [None]:
# Manual spot-check: walk through the training rows from position 100 onward,
# printing each excerpt followed by its pillars, subpillars and sectors.
df = train.iloc[100:]
for x, y, z, q in zip(df.excerpt, df.pillars, df.subpillars, df.sectors):
    print(x)
    print(y)
    print(z)
    print(q)
    # Wait for the user after every row; entering 's' stops the walk, any
    # other input moves on to the next row.
    # NOTE(review): input() blocks, so this cell stalls a non-interactive
    # "Restart & Run All".
    a = input()
    if a == 's':
        break

The reporting month of June, saw movement of 8,581 persons of which 6,680 were arrivals and 1,901 were departures. Preventive measures against the COVID-19 pandemic is a top priority particularly focusing on the highly congested camps to promote physical and safe distancing, and to help improve on camp planning and decongesting the camps. This will also reduce the impact of natural or man-made disasters such as fire outbreaks and floods.
[]
[]
[]

Under the ‘SDC-BRAC social cohesion fund for Cox’s Bazar district (COVID-19)’, HCMP provided BDT 1,500 to each of the selected 7,047 people from the host community. A total of 316 persons with disabilities were also brought under this support. The emergency aid aims to help low-income families who suffered most due to the pandemic.
['Capacities & Response']
['Capacities & Response->Number Of People Reached']
['Livelihoods']

With no sign of the curve flattening in the near future, further support is required to continue this battle against CO


Over 24,150 caregivers were sensitized with messages on COVID-19 by UNICEF VCMs in 20 LGAs in Borno State. Messages focus on voluntary sample testing, timely reporting of cases and de-stigmatization.
['Capacities & Response']
['Capacities & Response->Number Of People Reached']
['Health']

In Borno State, MMC has 74.8% of total confirmed cases while Jere LGA has 17.5%. A total of 219 contacts are being closely monitored in Borno State and 1,857 Households with 11,442 persons were reached. In Yobe State, case fatality rate amongst confirmed cases is 11.9% and positivity rate of 13.7% of the 490 samples tested. Case to contact ratio is 1:10 and percentage of LGAs that reported at least one confirmed case in the state is 76.1%. The percentage of confirmed cases amongst healthcare workers is 20%. In Adamawa State, a total of 17 contacts are being monitored. 217 out of 1,633 samples tested are positive, giving a positivity rate of 13.3%. Also, 48.4% of confirmed cases are known contacts of 

In [8]:
# Display the parsed training split (label columns are lists after preprocessing).
train

Unnamed: 0,entry_id,lead_id,project_id,project_title,analysis_framework_id,excerpt,dropped_excerpt,created_by_id,modified_by_id,verified,verification_last_changed_by_id,sectors,pillars,subpillars
0,163664,35315,2028,IMMAP/DFS Syria,1306,Market monitoring by the World Food Programme ...,,2232,2232,False,,[Food Security],[Impact],[Impact->Impact On Systems And Services]
1,162812,37820,2098,IMMAP/DFS Bangladesh,1306,Quarantine Facilities: ninety-three shelters i...,,657,2233,False,,[Health],[Capacities & Response],[Capacities & Response->International Response]
2,164560,39796,2098,IMMAP/DFS Bangladesh,1306,"Within dimensions, markets are broadly operati...",,1152,1152,False,,[Cross],[Impact],[Impact->Impact On Systems And Services]
3,157496,38706,2098,IMMAP/DFS Bangladesh,1306,Frontline aid workers face a heightened risk o...,,2233,2233,False,,"[Health, Logistics]","[Impact, People At Risk]","[Impact->Driver/Aggravating Factors, People At..."
4,170866,37673,1142,IFRC Turkey,699,[COVID] TRC is currently using its different c...,,2233,2233,False,,[Health],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90648,282949,51241,2170,IMMAP/DFS Nigeria,1306,"[16th Mar 2021,North east Nigeria]The governme...",,2230,26,True,26.0,[Cross],[Impact],[Impact->Impact On Systems And Services]
90649,283375,51241,2170,IMMAP/DFS Nigeria,1306,"[16th Mar 2021,North east Nigeria] Impact on s...",,2230,1152,True,1152.0,"[Health, Education, Protection, Livelihoods]","[Humanitarian Conditions, Impact]","[Impact->Driver/Aggravating Factors, Humanitar..."
90650,268927,49888,2331,GIMAC Somalia,1465,A reported 14 per cent of women aged 15-49 had...,,2741,2272,True,488.0,[Protection],[],[]
90651,268842,49945,2311,IMMAP/DFS Colombia,1306,La alternancia no es solo plantear cuáles niño...,La alternancia no es solo plantear cuáles niño...,2374,2374,True,1403.0,[Education],[Impact],[Impact->Impact On Systems And Services]


## Pillar / subpillar frequency matrices

In [None]:
def get_freq_matrix(df):
    """Count (pillar, subpillar) pairings and lay them out as a matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with list-valued 'pillars' and 'subpillars' columns and a
        'sectors' column.

    Returns
    -------
    pandas.DataFrame
        Integer matrix indexed by subpillar with one column per pillar; pairs
        that never occur are 0.

    Notes
    -----
    The two explodes are applied one after the other, so within a row every
    pillar is paired with every subpillar. Rows whose pillar or subpillar list
    is empty explode to NaN and are left out by the groupby.
    """
    pairs = df.explode('pillars').explode('subpillars')
    # Non-null 'sectors' entries per (pillar, subpillar) group.
    pair_counts = pairs.groupby(['pillars', 'subpillars'])['sectors'].count()
    # Move the pillar level to the columns, then fill absent pairs with 0.
    matrix = pair_counts.unstack(level=0)
    return matrix.fillna(0).astype(int)

In [None]:
# One pillar x subpillar frequency matrix per split, in train / val / test order.
freq, freq_val, freq_test = (
    get_freq_matrix(split_df) for split_df in (train, val, test)
)

In [None]:
# Training rows tagged with exactly one pillar.
t = train[train.pillars.apply(len) == 1]

In [None]:
# Display the training-split frequency matrix.
freq

In [None]:
# Training rows with no pillar at all.
# NOTE(review): this rebinds `t`, replacing the single-pillar subset assigned
# in an earlier cell; later uses of `t` depend on which cell ran last.
t = train[train.pillars.apply(len)==0]

In [None]:
# Display the current `t` subset of the training split.
t

In [None]:
# Load the pillar / sub-pillar matching sheet and turn it into a 0/1-style
# table with sub-pillars as rows and pillars as columns.
# NOTE(review): this is an absolute path into one user's Downloads folder, so
# the cell only runs on that machine — move the file next to the data and
# build the path from a shared constant.
matching = pd.read_excel('/Users/stefano/Downloads/Pillars and Subpillars Matching.xlsx', sheet_name='Matching - duplicate')
# .copy() makes the two-column selection an independent frame, so the column
# writes below do not trigger pandas' chained-assignment warning.
matching = matching[['Final Pillar Name', 'Final Sub-pillar Name']].copy()
matching.columns = ['pillars', 'subpillars']
# Marker column: summing it per (pillar, subpillar) group counts the pairings.
matching['ones'] = 1
# Normalise spelling/capitalisation found in the sheet. regex=False pins plain
# substring replacement, since the default for `regex` differs between pandas
# versions.
matching['pillars'] = matching['pillars'].str.replace('Humanitatian conditions', 'Humanitarian Conditions', regex=False)
matching['pillars'] = matching['pillars'].str.replace('impact', 'Impact', regex=False)
# Pillars become columns; pairings absent from the sheet stay NaN.
matching_freq = matching.groupby(['pillars', 'subpillars']).sum().unstack(level=0)['ones']

In [None]:
# Frequency matrix for the current `t` subset.
# NOTE(review): `t` is assigned in two different cells; the result depends on
# which of them ran last.
get_freq_matrix(t)

In [None]:
# Display the sub-pillar x pillar table built from the matching spreadsheet.
matching_freq

In [None]:
# Two groups of pillar column names used to slice matching_freq below.
# NOTE(review): 'People at risk', 'Priority needs' and 'Priority interventions'
# are capitalised differently from the PILLARS constant — presumably they
# follow the spreadsheet's spelling; confirm, otherwise the column lookups fail.
left = ['Impact','People at risk','Priority needs']
removed = ['Capacities & Response','Humanitarian Conditions','Priority interventions']

In [None]:
# Sub-pillars that have an entry under at least one of the `removed` pillar
# columns (rows that are NaN in all of those columns are dropped).
list(matching_freq[removed].dropna(axis=0, how='all').index)

In [None]:
# Part of the matching table restricted to the `left` pillar columns, keeping
# only sub-pillars with an entry in at least one of them.
matching_freq[left].dropna(axis=0, how='all')

In [None]:
# Frequency matrix for the current `t` subset (same call as in an earlier cell).
get_freq_matrix(t)

In [None]:
# Display the training-split frequency matrix again for comparison.
freq

In [None]:
# Display the validation-split frequency matrix.
freq_val

In [None]:
# Display the test-split frequency matrix.
freq_test

In [None]:
# Column totals of the training frequency matrix: one summed count per pillar.
freq.sum(axis=0)