In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

In [2]:
# Load the cleaned food-crises table. The path is relative, so it is resolved
# against the directory the notebook kernel was started in.
df_tabular = pd.read_csv("data/food_crises_cleaned.csv")

In [3]:
# Target ('ipc') followed by the predictors of the baseline model.
# 'ha' is not used because of missing values.
model_columns = ['ipc', 'ndvi_mean', 'ndvi_anom', 'rain_mean',
                 'rain_anom', 'et_mean', 'et_anom', 'count_violence', 'sum_fatalities',
                 'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct']

# Keep only rows with an observed IPC value. The explicit copy detaches the
# result from df_tabular, so nothing below goes through pandas'
# chained-assignment (SettingWithCopy) path.
labelled = df_tabular.loc[df_tabular['ipc'].notna(), model_columns].copy()

# Rows with ipc == 5 are moved into `ipc5` (original index labels kept) and
# removed from the modelling frame.
# NOTE(review): presumably because the class is too rare for a stratified
# split -- confirm.
is_ipc5 = labelled['ipc'] == 5
ipc5 = labelled[is_ipc5]
df_model = labelled[~is_ipc5].reset_index(drop=True)


In [4]:
# Display the rows held in `ipc5`.
ipc5

Unnamed: 0,ipc,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct
7547,5.0,0.423214,100.5192,0.082133,-0.003616,0.522037,0.201752,3,32,20.40088,1637.891,41.07024,97790.0,36786.08,86.84211


In [5]:
# Predictor matrix and target vector for the baseline model.
predictor_columns = [
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
]
X = df_model[predictor_columns]
y = df_model['ipc']

In [6]:
# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [7]:
# X_train, X_val, y_train, y_val = train_test_split(X_to_split, y_to_split, test_size=0.25, random_state=42, stratify=y_to_split)

In [8]:
# Show the entries of the target that are equal to 5.
y[y==5]

Series([], Name: ipc, dtype: float64)

In [9]:
# Append the rows held in `ipc5` to the training split only (the test split
# is left as produced by train_test_split).
ipc5_predictors = ipc5[[
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
]]
X_train = pd.concat([X_train, ipc5_predictors])
y_train = pd.concat([y_train, ipc5['ipc']])

In [10]:
# X_train.shape, X_val.shape, X_test.shape

In [11]:
# y_train.shape, y_val.shape, y_test.shape

In [12]:
def classification_evaluation(y_test, y_pred, avg="weighted"):
    '''Print accuracy, precision, recall, F1 and the confusion matrix of a prediction.

    Parameters
    ----------
    y_test : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels for the same samples.
    avg : str, default "weighted"
        Forwarded as ``average`` to the precision, recall and F1 scorers.

    Returns
    -------
    None
        All results are printed; the scalar scores are rounded to 3 decimals.
    '''
    # Shared scorer options. zero_division=0 makes an undefined score count
    # as 0 without raising a warning.
    averaging = {'average': avg, 'zero_division': 0}

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **averaging)
    rec = recall_score(y_test, y_pred, **averaging)
    f1 = f1_score(y_test, y_pred, **averaging)
    matrix = confusion_matrix(y_test, y_pred)

    print(f'Accuracy: {round(acc,3)}')
    print(f'Precision: {round(prec,3)}')
    print(f'Recall: {round(rec,3)}')
    print(f'f1-score: {round(f1,3)}')
    print(f'Confusion matrix: \n {matrix}')
    

def grid_train_pred_eval(model, grid, X_train, X_test, y_train, y_test, cv=5, best_est=True):
    '''Run a grid search, print train/test metrics, optionally return the best estimator.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    grid : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    X_train, y_train
        Data the grid search is fitted on.
    X_test, y_test
        Held-out data, used only for the reported test metrics.
    cv : default 5
        Passed through as the ``cv`` argument of ``GridSearchCV``.
    best_est : bool, default True
        When truthy, the best estimator found is returned so it can be saved
        in a variable; otherwise nothing is returned.

    Returns
    -------
    estimator or None
        ``grid_search.best_estimator_`` if ``best_est`` is truthy, else None.
    '''
    grid_search = GridSearchCV(model, grid, cv=cv)
    grid_search.fit(X_train, y_train)

    y_pred_train = grid_search.predict(X_train)
    y_pred_test = grid_search.predict(X_test)

    print('Metrics on training data:')
    classification_evaluation(y_train, y_pred_train)
    print('\n')

    print('Metrics on testing data:')
    classification_evaluation(y_test, y_pred_test)

    print(f'Best hyperparameters found: {grid_search.best_params_}')

    # Truthiness check instead of `== True`; the None return is spelled out
    # so both exits of the function are visible.
    if best_est:
        return grid_search.best_estimator_
    return None

# Imputation


In [13]:
# grid={'n_estimators':[50,250,350]}

# forest_clf = grid_train_pred_eval(
#     RandomForestClassifier(bootstrap = True, criterion = 'gini', random_state=42), 
#     grid, X_train, X_test, y_train, y_test)

# forest_clf

# Interpolation


In [14]:
# Rows with an observed IPC value, keeping the district and year_month
# identifiers next to the predictors.
# 'ha' is not used because of missing values.
# year_month is left as the raw string here; create_ipc_features converts it
# to datetime.
# The explicit copy matters: create_ipc_features assigns columns on the frame
# it receives, which on an un-copied slice of df_tabular triggers pandas'
# SettingWithCopy path.
df_model_2 = df_tabular.loc[
    df_tabular['ipc'].notna(),
    ['district', 'ipc', 'year_month', 'ndvi_mean', 'ndvi_anom', 'rain_mean',
     'rain_anom', 'et_mean', 'et_anom', 'count_violence', 'sum_fatalities',
     'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct'],
].copy()

In [15]:
def create_ipc_features(dataframe):
    '''Return a copy of `dataframe` with a parsed date and lag/lead IPC columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain the columns 'district', 'ipc' and 'year_month'.
        'year_month' is either already datetime, or a string whose first four
        characters are the year and whose characters from position 5 on are
        the month (position 4 is a separator that gets replaced by '-').

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) where
        - 'year_month' is a datetime set to the first day of the month,
        - 'prev_ipc' / '2prev_ipc' are the 'ipc' values one / two rows earlier
          within the same district,
        - 'next_ipc' is the 'ipc' value one row later within the same district.

    Notes
    -----
    The shifts are positional within each district, so rows are assumed to be
    in chronological order per district -- NOTE(review): confirm for the
    input data; no sorting is done here.
    '''
    # Work on a copy: callers usually pass a slice of a larger frame, and
    # assigning columns on that slice would go through pandas'
    # chained-assignment path.
    features = dataframe.copy()

    year_month = features['year_month']
    # Only parse raw strings. Skipping already-parsed columns makes the
    # function safe to call again on its own output.
    if not pd.api.types.is_datetime64_any_dtype(year_month):
        year_month = pd.to_datetime(year_month.str[:4] + '-' + year_month.str[5:] + '-01')
    features['year_month'] = year_month

    ipc_by_district = features.groupby('district')['ipc']
    features['prev_ipc'] = ipc_by_district.shift(1)
    features['2prev_ipc'] = ipc_by_district.shift(2)
    features['next_ipc'] = ipc_by_district.shift(-1)

    return features

In [16]:
df_model_2 = create_ipc_features(df_model_2)

# Remove three observation dates.
# NOTE(review): presumably the first two and the last IPC rounds, whose
# prev/2prev/next values are undefined -- confirm against the data.
excluded_dates = pd.to_datetime(['2009-07-01', '2009-10-01', '2020-02-01'])
df_model_2 = df_model_2[~df_model_2['year_month'].isin(excluded_dates)]

# Move the ipc == 5 rows into `ipc5_2` (index labels kept) and renumber the
# remaining rows from 0. Boolean masks are used instead of an in-place drop
# on a filtered frame, which would go through pandas' SettingWithCopy path.
is_ipc5_2 = df_model_2['ipc'] == 5
ipc5_2 = df_model_2[is_ipc5_2]
df_model_2 = df_model_2[~is_ipc5_2].reset_index(drop=True)

In [17]:
# Display the rows held in `ipc5_2`.
ipc5_2

Unnamed: 0,district,ipc,year_month,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct,prev_ipc,2prev_ipc,next_ipc
7547,Leer,5.0,2017-02-01,0.423214,100.5192,0.082133,-0.003616,0.522037,0.201752,3,32,20.40088,1637.891,41.07024,97790.0,36786.08,86.84211,4.0,4.0,3.0


In [18]:
# Predictors for the interpolation model: the baseline predictors plus the
# neighbouring IPC values (prev_ipc, 2prev_ipc, next_ipc).
predictor_columns_2 = [
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
    'prev_ipc', '2prev_ipc', 'next_ipc',
]
X_2 = df_model_2[predictor_columns_2]
y_2 = df_model_2['ipc']

In [19]:
# 80/20 split, stratified on the target, with a fixed seed.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    X_2,
    y_2,
    test_size=0.2,
    random_state=42,
    stratify=y_2,
)

In [20]:
# Show the entries of the target that are equal to 5.
y_2[y_2==5]

Series([], Name: ipc, dtype: float64)

In [21]:
# Append the rows held in `ipc5_2` to the training split only.
ipc5_2_predictors = ipc5_2[[
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
    'prev_ipc', '2prev_ipc', 'next_ipc',
]]
X_train_2 = pd.concat([X_train_2, ipc5_2_predictors])
y_train_2 = pd.concat([y_train_2, ipc5_2['ipc']])

In [125]:
# Grid with a single candidate value for n_estimators.
grid = {'n_estimators': [250]}

base_forest = RandomForestClassifier(bootstrap=True, criterion='gini', random_state=42)
forest_clf_2 = grid_train_pred_eval(
    base_forest,
    grid,
    X_train_2,
    X_test_2,
    y_train_2,
    y_test_2,
    best_est=True,
)

forest_clf_2



Metrics on training data:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
f1-score: 1.0
Confusion matrix: 
 [[632   0   0   0   0]
 [  0 836   0   0   0]
 [  0   0 645   0   0]
 [  0   0   0 132   0]
 [  0   0   0   0   1]]


Metrics on testing data:
Accuracy: 0.762
Precision: 0.744
Recall: 0.762
f1-score: 0.743
Confusion matrix: 
 [[128  29   1   0]
 [ 19 171  20   0]
 [  0  31 128   2]
 [  0   0  32   1]]
Best hyperparameters found: {'n_estimators': 250}


# Predictions

In [150]:
# Read the cleaned table again, rebinding df_tabular to a fresh frame that
# still contains the rows without an IPC value.
df_tabular = pd.read_csv("data/food_crises_cleaned.csv")

In [151]:
# Narrow the table to the identifiers, the target and the predictors.
kept_columns = [
    'district', 'ipc', 'year_month',
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
]
df_tabular = df_tabular[kept_columns]

In [152]:
# Rows with an observed IPC value. Copied explicitly because
# create_ipc_features assigns columns on the frame it is given, which on an
# un-copied slice of df_tabular goes through pandas' SettingWithCopy path.
observed_rows = df_tabular.loc[
    df_tabular['ipc'].notna(),
    ['district', 'ipc', 'year_month',
     'ndvi_mean', 'ndvi_anom', 'rain_mean', 'rain_anom', 'et_mean', 'et_anom', 'count_violence',
     'sum_fatalities', 'food_price_idx', 'area', 'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct'],
].copy()

# Adds parsed year_month plus prev_ipc / 2prev_ipc / next_ipc.
df_pred = create_ipc_features(observed_rows)

# Parse year_month on the full table the same way create_ipc_features does
# (characters 0-3 -> year, from position 5 -> month, day fixed to 01), so
# both frames carry identical datetime keys.
df_tabular['year_month'] = pd.to_datetime(
    df_tabular['year_month'].apply(lambda ym: ym[:4] + '-' + ym[5:] + '-01')
)

In [153]:
# Remove, in place, every row dated before 2009-07-01.
too_early_labels = df_tabular[df_tabular['year_month'] < '2009-07-01'].index
df_tabular.drop(index=too_early_labels, inplace=True)


In [154]:
# Outer-merge on all shared columns: this attaches prev_ipc / 2prev_ipc /
# next_ipc to the rows that have an observed IPC value and leaves them NaN on
# the other rows.
df_tabular = df_tabular.merge(df_pred, how='outer')

# Spread the neighbouring IPC values onto the rows in between observations:
#   next_ipc  is carried forward from the preceding observed row,
#   prev_ipc / 2prev_ipc are carried backward from the following observed row.
# The fills are done per district so that values never cross from one district
# into another, and are assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment and
# does not reliably update the frame.
# Rows are assumed to be in chronological order within each district --
# NOTE(review): confirm the merge keeps that order.
df_tabular['next_ipc'] = df_tabular.groupby('district')['next_ipc'].ffill()
df_tabular['prev_ipc'] = df_tabular.groupby('district')['prev_ipc'].bfill()
df_tabular['2prev_ipc'] = df_tabular.groupby('district')['2prev_ipc'].bfill()

# Keep months after 2009-10-01 (which also excludes 2009-07-01) and drop
# 2020-02-01. The index is deliberately not reset: later cells align the
# predictions on these labels.
in_window = (df_tabular['year_month'] > '2009-10-01') & (df_tabular['year_month'] != '2020-02-01')
df_tabular = df_tabular[in_window]

df_tabular.loc[
    df_tabular['district'] == 'Bor',
    ['district', 'year_month', 'ipc', 'prev_ipc', '2prev_ipc', 'next_ipc'],
].loc[:30]

Unnamed: 0,district,year_month,ipc,prev_ipc,2prev_ipc,next_ipc
4,Bor,2009-11-01,,2.0,2.0,1.0
5,Bor,2009-12-01,,2.0,2.0,1.0
6,Bor,2010-01-01,1.0,2.0,2.0,2.0
7,Bor,2010-02-01,,1.0,2.0,2.0
8,Bor,2010-03-01,,1.0,2.0,2.0
9,Bor,2010-04-01,2.0,1.0,2.0,2.0
10,Bor,2010-05-01,,2.0,1.0,2.0
11,Bor,2010-06-01,,2.0,1.0,2.0
12,Bor,2010-07-01,2.0,2.0,1.0,2.0
13,Bor,2010-08-01,,2.0,2.0,2.0


In [155]:
# Model inputs for the rows whose IPC value is missing. Note that this
# rebinds df_pred, which earlier held the observed rows.
interpolation_inputs = [
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
    'prev_ipc', '2prev_ipc', 'next_ipc',
]
df_pred = df_tabular.loc[df_tabular['ipc'].isna(), interpolation_inputs]

In [156]:
# Display the model inputs selected for prediction.
df_pred

Unnamed: 0,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct,prev_ipc,2prev_ipc,next_ipc
4,0.472371,85.85184,2.314890,-4.467705,7.499793,-5.234680,1,11,1.197838,14008.3300,7.961984,256618.0,11393.760,90.8503,2.0,2.0,1.0
5,0.309300,74.78490,0.930444,0.148704,2.397086,-1.174480,3,22,1.205884,14008.3300,7.961984,256618.0,11393.760,90.8503,2.0,2.0,1.0
7,0.220465,85.87476,1.406630,0.521490,0.584436,-0.002676,0,0,1.262956,14008.3300,7.961984,265263.0,11393.760,90.8503,1.0,2.0,2.0
8,0.218581,86.85130,3.486286,-0.695863,0.709874,-0.883881,0,0,1.293821,14008.3300,7.961984,265263.0,11393.760,90.8503,1.0,2.0,2.0
10,0.338209,79.44981,23.363730,4.377874,10.138690,-1.150318,1,1,1.262869,14008.3300,7.961984,265263.0,11393.760,90.8503,2.0,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9977,0.723373,104.08310,28.682790,1.078028,33.263010,7.267976,0,0,77.543980,757.7855,13.068690,102228.0,7145.909,85.5000,3.0,3.0,3.0
9978,0.732776,105.04350,21.907970,2.402068,30.342330,5.189004,0,0,80.662170,757.7855,13.068690,102228.0,7145.909,85.5000,3.0,3.0,3.0
9980,0.556748,116.75300,0.645686,-0.091687,13.372410,8.745254,0,0,87.728720,757.7855,13.068690,102228.0,7145.909,85.5000,3.0,3.0,4.0
9981,0.400063,117.55830,0.046705,-0.000951,3.089540,1.698060,0,0,93.566750,757.7855,13.068690,102228.0,7145.909,85.5000,3.0,3.0,4.0


In [157]:
# Predict a label for every row of df_pred with the estimator stored in
# forest_clf_2.
preds = forest_clf_2.predict(df_pred)

In [158]:
# Index labels of the rows whose ipc is still missing.
df_tabular[df_tabular.ipc.isna()].index

Int64Index([   4,    5,    7,    8,   10,   11,   13,   14,   16,   17,
            ...
            9970, 9972, 9973, 9974, 9976, 9977, 9978, 9980, 9981, 9982],
           dtype='int64', length=6786)

In [166]:
# Wrap the predictions in a one-column frame labelled with the index of the
# rows whose ipc is missing, so they can be aligned with df_tabular.
missing_ipc_labels = df_tabular.loc[df_tabular['ipc'].isna()].index
preds = pd.DataFrame(preds).set_index(keys=missing_ipc_labels)
preds.columns = ['pred']

In [181]:
# Fill the missing ipc values with the predictions, matched on index label.
# The result is assigned back explicitly: fillna(..., inplace=True) on a
# column selection is chained assignment and may leave df_tabular unchanged
# (it never updates the frame under pandas Copy-on-Write).
df_tabular['ipc'] = df_tabular['ipc'].fillna(preds['pred'])

In [187]:
# Display the table after the ipc column has been filled.
df_tabular

Unnamed: 0,district,ipc,year_month,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct,prev_ipc,2prev_ipc,next_ipc
4,Bor,1.0,2009-11-01,0.472371,85.85184,2.314890,-4.467705,7.499793,-5.234680,1,11,1.197838,14008.3300,7.961984,256618.0,11393.760,90.8503,2.0,2.0,1.0
5,Bor,1.0,2009-12-01,0.309300,74.78490,0.930444,0.148704,2.397086,-1.174480,3,22,1.205884,14008.3300,7.961984,256618.0,11393.760,90.8503,2.0,2.0,1.0
6,Bor,1.0,2010-01-01,0.253501,83.79816,0.293134,-0.068916,0.167976,-0.454393,0,0,1.238529,14008.3300,7.961984,265263.0,11393.760,90.8503,2.0,2.0,2.0
7,Bor,1.0,2010-02-01,0.220465,85.87476,1.406630,0.521490,0.584436,-0.002676,0,0,1.262956,14008.3300,7.961984,265263.0,11393.760,90.8503,1.0,2.0,2.0
8,Bor,1.0,2010-03-01,0.218581,86.85130,3.486286,-0.695863,0.709874,-0.883881,0,0,1.293821,14008.3300,7.961984,265263.0,11393.760,90.8503,1.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9978,Malakal,3.0,2019-09-01,0.732776,105.04350,21.907970,2.402068,30.342330,5.189004,0,0,80.662170,757.7855,13.068690,102228.0,7145.909,85.5000,3.0,3.0,3.0
9979,Malakal,3.0,2019-10-01,0.681990,113.37050,14.913800,2.580618,22.853360,6.053181,1,0,82.872570,757.7855,13.068690,102228.0,7145.909,85.5000,3.0,3.0,4.0
9980,Malakal,3.0,2019-11-01,0.556748,116.75300,0.645686,-0.091687,13.372410,8.745254,0,0,87.728720,757.7855,13.068690,102228.0,7145.909,85.5000,3.0,3.0,4.0
9981,Malakal,3.0,2019-12-01,0.400063,117.55830,0.046705,-0.000951,3.089540,1.698060,0,0,93.566750,757.7855,13.068690,102228.0,7145.909,85.5000,3.0,3.0,4.0


In [188]:
# Write the table as CSV into the working directory. The file name has no
# extension, and with to_csv's default settings the row index is written too.
# NOTE(review): confirm whether downstream readers expect this exact name
# before adding '.csv'.
df_tabular.to_csv('food_crises_interpol')

# Plot IPC progression