In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

In [2]:
# Load the cleaned food-crises table; every later cell derives its frames from it.
df_tabular = pd.read_csv(filepath_or_buffer="data/food_crises_cleaned.csv")

In [3]:
# Keep only rows with a known IPC label, restricted to the target and the
# 14 predictor columns. A single .loc selection avoids chained indexing.
# not using 'ha' because of missing values
labelled_rows = df_tabular.loc[
    df_tabular['ipc'].notna(),
    ['ipc', 'ndvi_mean', 'ndvi_anom', 'rain_mean',
     'rain_anom', 'et_mean', 'et_anom', 'count_violence', 'sum_fatalities',
     'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct'],
]

# Separate the IPC-5 observations (they keep their original row labels) from
# the modelling frame, which is re-indexed from 0. Building new frames instead
# of dropping in place avoids SettingWithCopyWarning on a filtered slice.
is_ipc5 = labelled_rows['ipc'] == 5
ipc5 = labelled_rows[is_ipc5].copy()
df_model = labelled_rows[~is_ipc5].reset_index(drop=True)


In [4]:
# Inspect the IPC-5 observations that were separated from df_model.
ipc5

Unnamed: 0,ipc,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct
7547,5.0,0.423214,100.5192,0.082133,-0.003616,0.522037,0.201752,3,32,20.40088,1637.891,41.07024,97790.0,36786.08,86.84211


In [5]:
# Feature matrix: every predictor column of df_model (the target 'ipc' is excluded).
X = df_model.loc[
    :,
    ['ndvi_mean', 'ndvi_anom', 'rain_mean',
     'rain_anom', 'et_mean', 'et_anom', 'count_violence', 'sum_fatalities',
     'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct'],
]
# Target vector: the IPC phase label.
y = df_model.loc[:, 'ipc']

In [6]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions equal in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# X_train, X_val, y_train, y_val = train_test_split(X_to_split, y_to_split, test_size=0.25, random_state=42, stratify=y_to_split)

In [8]:
# Check that no IPC-5 labels remain in the target (those rows were moved to ipc5).
y[y==5]

Series([], Name: ipc, dtype: float64)

In [9]:
# Add the held-out IPC-5 observations to the training split only, so the model
# sees that class during fitting.
# NOTE: X_train / y_train are rebuilt from themselves, so re-running this cell
# appends the same rows again.
X_train = pd.concat(
    [
        X_train,
        ipc5.loc[
            :,
            ['ndvi_mean', 'ndvi_anom', 'rain_mean',
             'rain_anom', 'et_mean', 'et_anom', 'count_violence', 'sum_fatalities',
             'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct'],
        ],
    ]
)
y_train = pd.concat([y_train, ipc5['ipc']])

In [10]:
# X_train.shape, X_val.shape, X_test.shape

In [11]:
# y_train.shape, y_val.shape, y_test.shape

In [12]:
def classification_evaluation(y_test, y_pred, avg="weighted"):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    y_test : true labels.
    y_pred : predicted labels.
    avg    : averaging mode forwarded to precision/recall/F1 (default "weighted").

    Nothing is returned; all results are written to stdout. Precision, recall
    and F1 are computed with ``zero_division=0``.
    """
    shared_kwargs = dict(average=avg, zero_division=0)

    prec = precision_score(y_test, y_pred, **shared_kwargs)
    rec = recall_score(y_test, y_pred, **shared_kwargs)
    f1 = f1_score(y_test, y_pred, **shared_kwargs)
    acc = accuracy_score(y_test, y_pred)

    # Report order is fixed: accuracy first, then the averaged scores.
    report = (
        ('Accuracy', acc),
        ('Precision', prec),
        ('Recall', rec),
        ('f1-score', f1),
    )
    for label, value in report:
        print(f'{label}: {round(value,3)}')
    print(f'Confusion matrix: {confusion_matrix(y_test, y_pred)}')
    

def grid_train_pred_eval(model, grid, X_train, X_test, y_train, y_test, cv=5, best_est=True):
    '''Grid-search a model, print train/test metrics, optionally return the best estimator.

    Parameters
    ----------
    model : estimator handed to GridSearchCV.
    grid : parameter grid handed to GridSearchCV.
    X_train, y_train : data the grid search is fitted on.
    X_test, y_test : data used only for the final evaluation printout.
    cv : forwarded to GridSearchCV as ``cv`` (default 5).
    best_est : when truthy, the best estimator is returned so it can be saved
        in a variable.

    Returns
    -------
    ``grid_search.best_estimator_`` when ``best_est`` is truthy, otherwise ``None``.
    '''
    grid_search = GridSearchCV(model, grid, cv=cv).fit(X_train, y_train)

    # Predictions come from the fitted search object on both splits. The
    # training metrics are therefore computed on data the model was fitted on.
    y_pred_train = grid_search.predict(X_train)
    y_pred_test = grid_search.predict(X_test)

    print('Metrics on training data:')
    classification_evaluation(y_train, y_pred_train)
    print('\n')

    print('Metrics on testing data:')
    classification_evaluation(y_test, y_pred_test)

    print(f'Best hyperparameters found: {grid_search.best_params_}')

    # Truthiness test instead of `== True`; the None return is now explicit.
    if best_est:
        return grid_search.best_estimator_
    return None

# Imputation


In [13]:
# Search over the number of trees only; every other forest setting is fixed.
grid = {'n_estimators': [50, 250, 350]}

forest_clf = grid_train_pred_eval(
    model=RandomForestClassifier(bootstrap=True, criterion='gini', random_state=42),
    grid=grid,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Show the selected estimator as the cell output.
forest_clf



Metrics on training data:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
f1-score: 1.0
Confusion matrix: [[696   0   0   0   0]
 [  0 886   0   0   0]
 [  0   0 703   0   0]
 [  0   0   0 147   0]
 [  0   0   0   0   1]]


Metrics on testing data:
Accuracy: 0.677
Precision: 0.672
Recall: 0.677
f1-score: 0.663
Confusion matrix: [[126  48   0   0]
 [ 27 168  27   0]
 [  1  56 116   3]
 [  0   1  34   2]]
Best hyperparameters found: {'n_estimators': 250}


# Interpolation


In [148]:
# Rows with a known IPC label, now also carrying 'district' and 'year_month'
# so that per-district lag/lead features can be built. The explicit .copy()
# makes df_model_2 independent of df_tabular, so the column assignment below
# does not write into a slice.
# not using 'ha' because of missing values
df_model_2 = df_tabular.loc[
    df_tabular['ipc'].notna(),
    ['district', 'ipc', 'year_month', 'ndvi_mean', 'ndvi_anom', 'rain_mean',
     'rain_anom', 'et_mean', 'et_anom', 'count_violence', 'sum_fatalities',
     'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct'],
].copy()

# Rebuild each year_month string as 'YYYY-MM-01' (first 4 characters = year,
# everything after the 5th character = month) and parse it as a timestamp.
# The conversion reads df_model_2's own column, so only the selected rows are
# touched and no index alignment with df_tabular is needed.
df_model_2['year_month'] = pd.to_datetime(
    df_model_2['year_month'].apply(lambda x: x[:4] + '-' + x[5:] + '-01')
)

In [149]:
# add the 1st and 2nd previous and the next ipc
# add the number of months away from the 1st and 2nd previous and the next ipc
# split into X and y
# add ipc5_2 to train

In [150]:
# Work on an independent copy so the new columns are not written into a slice.
df_model_2 = df_model_2.copy()

# Lag/lead IPC values per district. shift() works on row position, so the
# shifts are computed on a view sorted by district and date; assigning the
# result back aligns on the index and leaves df_model_2's row order unchanged.
# NOTE(review): 'next_ipc' is a later observation of the target — fine for
# interpolation, not usable for forecasting.
ipc_by_district = (
    df_model_2.sort_values(['district', 'year_month'])
    .groupby('district')['ipc']
)
df_model_2["prev_ipc"] = ipc_by_district.shift(1)
df_model_2["2prev_ipc"] = ipc_by_district.shift(2)
df_model_2["next_ipc"] = ipc_by_district.shift(-1)


def calculate_time_difference(dataframe, column):
    """Return the absolute difference between 'year_month' and `column`, per row.

    dataframe : frame holding a 'year_month' column and `column`.
    column    : name of the column to compare against 'year_month'.

    Returns a Series indexed like `dataframe`. The subtraction is vectorised
    instead of a row-wise apply, so an empty frame also yields an (empty)
    Series rather than an empty DataFrame.
    """
    return (dataframe['year_month'] - dataframe[column]).abs()
    
# df_model_2["prev_date"] = df_model_2.groupby('district')['year_month'].shift(1)
# df_model_2["next_date"] = df_model_2.groupby('district')['year_month'].shift(-1)

# Drop three months from the interpolation set in a single pass
# (presumably the rounds whose lag/lead IPC values are missing — TODO confirm).
months_to_exclude = pd.to_datetime(['2009-07-01', '2009-10-01', '2020-02-01'])
df_model_2 = df_model_2[~df_model_2.year_month.isin(months_to_exclude)]

# df_model_2["next_date"] = calculate_time_difference(df_model_2, 'next_date')
# df_model_2["prev_date"] = calculate_time_difference(df_model_2, 'prev_date')

In [151]:
# Separate the IPC-5 observations (they keep their original row labels) from
# the modelling frame, which is re-indexed from 0. Building new frames instead
# of dropping in place avoids SettingWithCopyWarning on the filtered
# df_model_2, and the mask is computed once.
is_ipc5_2 = df_model_2['ipc'] == 5
ipc5_2 = df_model_2[is_ipc5_2].copy()
df_model_2 = df_model_2[~is_ipc5_2].reset_index(drop=True)

In [152]:
# Inspect the IPC-5 observations separated from df_model_2, including their lag/lead columns.
ipc5_2

Unnamed: 0,district,ipc,year_month,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct,prev_ipc,2prev_ipc,next_ipc
7547,Leer,5.0,2017-02-01,0.423214,100.5192,0.082133,-0.003616,0.522037,0.201752,3,32,20.40088,1637.891,41.07024,97790.0,36786.08,86.84211,4.0,4.0,3.0


In [168]:
# Feature matrix for the interpolation model: the original predictors plus the
# neighbouring IPC observations (previous, second previous, next).
X_2 = df_model_2.loc[
    :,
    ['ndvi_mean', 'ndvi_anom', 'rain_mean',
     'rain_anom', 'et_mean', 'et_anom', 'count_violence', 'sum_fatalities',
     'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct',
     'prev_ipc', '2prev_ipc', 'next_ipc'],
]
# Target vector: the IPC phase label.
y_2 = df_model_2.loc[:, 'ipc']

In [169]:
# Same split settings as the first model: 20% test, stratified on the label,
# fixed seed.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2,
    y_2,
    test_size=0.2,
    stratify=y_2,
    random_state=42,
)

In [170]:
# Check that no IPC-5 labels remain in the target (those rows were moved to ipc5_2).
y_2[y_2==5]

Series([], Name: ipc, dtype: float64)

In [171]:
# Add the held-out IPC-5 observations to the training split only.
# NOTE: X_train_2 / y_train_2 are rebuilt from themselves, so re-running this
# cell appends the same rows again.
X_train_2 = pd.concat(
    [
        X_train_2,
        ipc5_2.loc[
            :,
            ['ndvi_mean', 'ndvi_anom', 'rain_mean',
             'rain_anom', 'et_mean', 'et_anom', 'count_violence', 'sum_fatalities',
             'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct',
             'prev_ipc', '2prev_ipc', 'next_ipc'],
        ],
    ]
)
y_train_2 = pd.concat([y_train_2, ipc5_2['ipc']])

In [173]:
# Single-value grid: only n_estimators=250 is evaluated here.
grid = {'n_estimators': [250]}

forest_clf_2 = grid_train_pred_eval(
    model=RandomForestClassifier(bootstrap=True, criterion='gini', random_state=42),
    grid=grid,
    X_train=X_train_2,
    X_test=X_test_2,
    y_train=y_train_2,
    y_test=y_test_2,
)

# Show the selected estimator as the cell output.
forest_clf_2



Metrics on training data:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
f1-score: 1.0
Confusion matrix: [[632   0   0   0   0]
 [  0 836   0   0   0]
 [  0   0 645   0   0]
 [  0   0   0 132   0]
 [  0   0   0   0   1]]


Metrics on testing data:
Accuracy: 0.762
Precision: 0.744
Recall: 0.762
f1-score: 0.743
Confusion matrix: [[128  29   1   0]
 [ 19 171  20   0]
 [  0  31 128   2]
 [  0   0  32   1]]
Best hyperparameters found: {'n_estimators': 250}


In [174]:
# Display the interpolation training features (the IPC-5 rows were appended after the split).
X_train_2

Unnamed: 0,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct,prev_ipc,2prev_ipc,next_ipc
2355,0.269428,96.39127,0.055701,-0.000907,0.289682,-0.145043,0,0,2.160001,11866.220,30.162330,63801.0,20155.890,87.21831,2.0,2.0,3.0
1940,0.292380,92.27671,0.052588,-0.005515,0.115323,0.033895,0,0,53.189380,4428.075,10.101880,162946.0,5726.711,82.94444,3.0,4.0,4.0
689,0.303265,81.12310,8.001044,-3.758007,10.158230,1.573021,0,0,1.308612,5758.278,56.159650,192657.0,13915.230,73.75362,2.0,2.0,2.0
1331,0.670094,110.24350,13.339970,-1.595913,26.790160,9.190472,1,4,73.675010,4883.326,4.964257,389333.0,17832.140,86.46552,4.0,4.0,3.0
1629,0.266948,84.42636,4.623626,-2.259570,1.830199,-0.812974,0,0,1.887479,4844.418,4.428182,108004.0,3591.286,89.58182,1.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1549,0.238967,80.72475,5.522245,-2.894917,0.282223,-1.803714,0,0,1.063881,4440.182,18.189620,132915.0,11936.300,89.38462,1.0,3.0,2.0
1562,0.647819,95.63124,26.185120,-4.974002,20.864980,-2.795710,0,0,2.096466,4440.182,18.189620,132961.0,11936.300,89.38462,2.0,2.0,2.0
943,0.693635,106.22770,28.544720,-0.195854,24.294190,2.267097,0,0,1.999169,9237.169,15.375390,12906.0,9649.806,65.10092,1.0,1.0,1.0
1802,0.679276,107.88390,14.921400,2.323688,23.074250,5.546668,1,0,1.123413,13496.320,27.965010,167586.0,25183.090,90.86875,1.0,1.0,1.0
