In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

import matplotlib.pyplot as plt

In [2]:
# Load the tabular dataset (district, date, IPC and covariate columns) from CSV.
df_tabular = pd.read_csv("geodata/results/food_crises_with_counties_regions.csv")

In [3]:
# Show the column names of the loaded table.
df_tabular.columns

Index(['country', 'district_code', 'district', 'centx', 'centy', 'county',
       'region', 'year_month', 'year', 'month', 'ipc', 'ha', 'ndvi_mean',
       'ndvi_anom', 'rain_mean', 'rain_anom', 'et_mean', 'et_anom',
       'count_violence', 'sum_fatalities', 'food_price_idx', 'area',
       'cropland_pct', 'pop', 'ruggedness_mean', 'pasture_pct', 'date'],
      dtype='object')

In [4]:
# Keep only the rows that have an IPC label, restricted to the label plus the
# covariates used for modelling.
# 'ha' is not used because of missing values.
df_model = df_tabular[df_tabular.ipc.notna()][[
    'ipc',
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
]]

# Set the IPC == 5 rows aside (they are appended to the training split in a
# later cell) and continue with the remaining rows on a fresh 0..n-1 index.
# Presumably needed because a class this small cannot be stratified — confirm.
ipc5 = df_model[df_model['ipc'] == 5]
df_model = df_model[df_model['ipc'] != 5].reset_index(drop=True)


In [5]:
# Display the IPC == 5 rows that were set aside.
ipc5

Unnamed: 0,ipc,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct
7547,5.0,0.423214,100.5192,0.082133,-0.003616,0.522037,0.201752,3,32,20.40088,1637.891,41.07024,97790.0,36786.08,86.84211


In [6]:
# Feature matrix: the 14 covariate columns of df_model (everything but the label).
X = df_model.loc[:, [
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
]]

# Target: the IPC label.
y = df_model.loc[:, 'ipc']

In [7]:
# 80/20 train/test split, stratified on the IPC label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
# X_train, X_val, y_train, y_val = train_test_split(X_to_split, y_to_split, test_size=0.25, random_state=42, stratify=y_to_split)

In [9]:
# Sanity check: y should contain no IPC == 5 labels after they were set aside.
y[y==5]

Series([], Name: ipc, dtype: float64)

In [10]:
# Add the set-aside IPC == 5 rows to the training split only (features and
# label separately), so the test split never contains that class.
# NOTE: not idempotent — re-running this cell appends the rows again.
X_train = pd.concat([
    X_train,
    ipc5.loc[:, [
        'ndvi_mean', 'ndvi_anom',
        'rain_mean', 'rain_anom',
        'et_mean', 'et_anom',
        'count_violence', 'sum_fatalities',
        'food_price_idx', 'area', 'cropland_pct', 'pop',
        'ruggedness_mean', 'pasture_pct',
    ]],
])
y_train = pd.concat([y_train, ipc5['ipc']])

In [11]:
# X_train.shape, X_val.shape, X_test.shape

In [12]:
# y_train.shape, y_val.shape, y_test.shape

In [13]:
def classification_evaluation(y_test, y_pred, avg="weighted"):
    """Print accuracy, precision, recall, F1 and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, compared against ``y_test``.
    avg : str, default "weighted"
        Forwarded as ``average`` to the precision, recall and F1 scorers.

    Returns
    -------
    None
        All results are printed; nothing is returned.
    """
    # Shared scorer options; zero_division=0 scores classes without
    # predictions as 0.
    averaged = dict(average=avg, zero_division=0)

    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, **averaged),
        'Recall': recall_score(y_test, y_pred, **averaged),
        'f1-score': f1_score(y_test, y_pred, **averaged),
    }

    for label, value in scores.items():
        print(f'{label}: {round(value,3)}')
    print(f'Confusion matrix: \n {confusion_matrix(y_test, y_pred)}')
    

def grid_train_pred_eval(model, grid, X_train, X_test, y_train, y_test, cv=5, best_est=True):
    '''Grid-search ``model``, print train/test metrics and optionally return the winner.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``GridSearchCV``.
    grid : dict
        Hyperparameter grid handed to ``GridSearchCV``.
    X_train, y_train :
        Data the grid search is fitted on.
    X_test, y_test :
        Held-out data, used only for the reported test metrics.
    cv : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.
    best_est : bool, default True
        When truthy, return the best estimator so it can be saved in a variable.

    Returns
    -------
    estimator or None
        ``grid_search.best_estimator_`` if ``best_est`` is truthy, else ``None``.
    '''
    grid_search = GridSearchCV(model, grid, cv=cv).fit(X_train, y_train)

    # Predict with the fitted search object on both splits before reporting.
    y_pred_train = grid_search.predict(X_train)
    y_pred_test = grid_search.predict(X_test)

    print('Metrics on training data:')
    classification_evaluation(y_train, y_pred_train)
    print('\n')

    print('Metrics on testing data:')
    classification_evaluation(y_test, y_pred_test)

    print(f'Best hyperparameters found: {grid_search.best_params_}')

    # Plain truthiness test instead of `== True`; the None branch is explicit.
    if best_est:
        return grid_search.best_estimator_
    return None

# Imputation


In [14]:
# grid={'n_estimators':[50,250,350]}

# forest_clf = grid_train_pred_eval(
#     RandomForestClassifier(bootstrap = True, criterion = 'gini', random_state=42), 
#     grid, X_train, X_test, y_train, y_test)

# forest_clf

# Interpolation


In [15]:
# Rows with an IPC label, this time also keeping 'district' and 'year_month',
# which create_ipc_features needs to build the per-district lag/lead columns.
# 'ha' is not used because of missing values.
# year_month is still the raw string here; create_ipc_features converts it.
df_model_2 = df_tabular[df_tabular.ipc.notna()][[
    'district', 'ipc', 'year_month',
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
]]

In [1]:
def create_ipc_features(dataframe):
    """Return a copy of ``dataframe`` with a parsed date and lag/lead IPC columns.

    The input frame is left untouched; callers must use the returned frame.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs the columns ``district``, ``year_month`` and ``ipc``.
        ``year_month`` is either a string with a 4-character year, one
        separator character and the month (e.g. ``"2009_07"``), or already
        a datetime column. Rows are assumed to be in chronological order
        within each district — the shifts below are positional.

    Returns
    -------
    pandas.DataFrame
        Copy of the input where ``year_month`` is a datetime (first day of
        the month) and three columns are added per district:
        ``prev_ipc`` (previous row's ipc), ``2prev_ipc`` (two rows back) and
        ``next_ipc`` (following row's ipc). Missing neighbours are NaN.
    """
    # Work on a copy: callers pass filtered slices of a larger frame, and
    # mutating those in place triggers SettingWithCopy problems.
    frame = dataframe.copy()

    # Convert only when needed, so re-running on an already converted frame works.
    if not pd.api.types.is_datetime64_any_dtype(frame['year_month']):
        raw = frame['year_month']
        # Characters 0-3 are the year, character 4 is a separator, the rest is the month.
        frame['year_month'] = pd.to_datetime(raw.str[:4] + '-' + raw.str[5:] + '-01')

    ipc_by_district = frame.groupby('district')['ipc']
    frame["prev_ipc"] = ipc_by_district.shift(1)
    frame["2prev_ipc"] = ipc_by_district.shift(2)
    frame["next_ipc"] = ipc_by_district.shift(-1)

    return frame

In [17]:
# Add the parsed date and the prev/2prev/next IPC columns.
df_model_2 = create_ipc_features(df_model_2)

# Exclude three specific months from the training data.
# Presumably these are the first two and the last labelled months, where the
# lag/lead features are undefined — confirm against the data.
df_model_2 = df_model_2[
    ~df_model_2.year_month.isin(pd.to_datetime(['2009-07-01', '2009-10-01', '2020-02-01']))
]

# Set the IPC == 5 rows aside (appended to the training split in a later cell)
# and renumber the remaining rows from 0.
ipc5_2 = df_model_2[df_model_2['ipc'] == 5]
df_model_2 = df_model_2[df_model_2['ipc'] != 5].reset_index(drop=True)

In [18]:
# Display the set-aside IPC == 5 rows, now including the lag/lead IPC columns.
ipc5_2

Unnamed: 0,district,ipc,year_month,ndvi_mean,ndvi_anom,rain_mean,rain_anom,et_mean,et_anom,count_violence,sum_fatalities,food_price_idx,area,cropland_pct,pop,ruggedness_mean,pasture_pct,prev_ipc,2prev_ipc,next_ipc
7547,Leer,5.0,2017-02-01,0.423214,100.5192,0.082133,-0.003616,0.522037,0.201752,3,32,20.40088,1637.891,41.07024,97790.0,36786.08,86.84211,4.0,4.0,3.0


In [19]:
# Feature matrix: the 14 covariates plus the three neighbouring-IPC columns
# produced by create_ipc_features.
X_2 = df_model_2.loc[:, [
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
    'prev_ipc', '2prev_ipc', 'next_ipc',
]]

# Target: the IPC label.
y_2 = df_model_2.loc[:, 'ipc']

In [20]:
# 80/20 train/test split, stratified on the IPC label, with a fixed seed.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=42, stratify=y_2)

In [21]:
# Sanity check: y_2 should contain no IPC == 5 labels after they were set aside.
y_2[y_2==5]

Series([], Name: ipc, dtype: float64)

In [22]:
# Add the set-aside IPC == 5 rows to the training split only (features and
# label separately).
# NOTE: not idempotent — re-running this cell appends the rows again.
X_train_2 = pd.concat([
    X_train_2,
    ipc5_2.loc[:, [
        'ndvi_mean', 'ndvi_anom',
        'rain_mean', 'rain_anom',
        'et_mean', 'et_anom',
        'count_violence', 'sum_fatalities',
        'food_price_idx', 'area', 'cropland_pct', 'pop',
        'ruggedness_mean', 'pasture_pct',
        'prev_ipc', '2prev_ipc', 'next_ipc',
    ]],
])
y_train_2 = pd.concat([y_train_2, ipc5_2['ipc']])

In [23]:
# Single-point grid: only n_estimators=250 is evaluated, so the grid search
# effectively just cross-validates and fits that one configuration.
grid = {'n_estimators': [250]}

# Fit, print train/test metrics and keep the best estimator for the
# prediction section.
forest_clf_2 = grid_train_pred_eval(
    RandomForestClassifier(bootstrap=True, criterion='gini', random_state=42),
    grid,
    X_train_2, X_test_2,
    y_train_2, y_test_2,
    best_est=True,
)

forest_clf_2



Metrics on training data:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
f1-score: 1.0
Confusion matrix: 
 [[621   0   0   0   0]
 [  0 832   0   0   0]
 [  0   0 635   0   0]
 [  0   0   0 128   0]
 [  0   0   0   0   1]]


Metrics on testing data:
Accuracy: 0.76
Precision: 0.725
Recall: 0.76
f1-score: 0.74
Confusion matrix: 
 [[123  33   0   0]
 [ 17 170  21   0]
 [  0  29 129   1]
 [  0   0  32   0]]
Best hyperparameters found: {'n_estimators': 250}


# Predictions

In [24]:
# Re-read the source CSV so the prediction section starts from a freshly loaded table.
df_tabular = pd.read_csv("geodata/results/food_crises_with_counties_regions.csv")

In [25]:
# Narrow the table to identifiers, the IPC label, the date and the 14 covariates.
df_tabular = df_tabular[[
    'district', 'county', 'region', 'ipc', 'year_month',
    'ndvi_mean', 'ndvi_anom',
    'rain_mean', 'rain_anom',
    'et_mean', 'et_anom',
    'count_violence', 'sum_fatalities',
    'food_price_idx', 'area', 'cropland_pct', 'pop',
    'ruggedness_mean', 'pasture_pct',
]]

In [26]:
# Labelled rows only. `.copy()` gives create_ipc_features an independent
# frame, so adding columns never happens on a slice of df_tabular.
df_pred = df_tabular.loc[
    df_tabular.ipc.notna(),
    [
        'district', 'ipc', 'year_month',
        'ndvi_mean', 'ndvi_anom',
        'rain_mean', 'rain_anom',
        'et_mean', 'et_anom',
        'count_violence', 'sum_fatalities',
        'food_price_idx', 'area', 'cropland_pct', 'pop',
        'ruggedness_mean', 'pasture_pct',
    ],
].copy()

# Parse the date and add prev/2prev/next IPC per district.
df_pred = create_ipc_features(df_pred)

# Apply the same year_month -> first-of-month datetime conversion to the full
# table so both frames can be merged on identical keys. `assign` builds a new
# frame instead of writing into the column-subset df_tabular.
# NOTE: this cell expects year_month to still be the raw string; reload
# df_tabular before re-running it.
df_tabular = df_tabular.assign(
    year_month=pd.to_datetime(
        df_tabular.year_month.str[:4] + '-' + df_tabular.year_month.str[5:] + '-01'
    )
)

# Drop every month before 2009-07 (original index labels are kept).
df_tabular = df_tabular[~(df_tabular.year_month < '2009-07-01')]


In [27]:
# Spot-check one district: IPC is present only for some months, NaN for the rest.
df_tabular[df_tabular.district=='Malakal'][['district', 'year_month','ipc']]

Unnamed: 0,district,year_month,ipc
12038,Malakal,2009-07-01,2.0
12039,Malakal,2009-08-01,
12040,Malakal,2009-09-01,
12041,Malakal,2009-10-01,1.0
12042,Malakal,2009-11-01,
...,...,...,...
12161,Malakal,2019-10-01,3.0
12162,Malakal,2019-11-01,
12163,Malakal,2019-12-01,
12164,Malakal,2020-01-01,


In [28]:
# Spot-check the lag/lead columns for the same district (labelled months only).
df_pred[df_pred.district=='Malakal'][['district', 'year_month','ipc' ,'prev_ipc','2prev_ipc','next_ipc']]

Unnamed: 0,district,year_month,ipc,prev_ipc,2prev_ipc,next_ipc
12038,Malakal,2009-07-01,2.0,,,1.0
12041,Malakal,2009-10-01,1.0,2.0,,1.0
12044,Malakal,2010-01-01,1.0,1.0,2.0,1.0
12047,Malakal,2010-04-01,1.0,1.0,1.0,1.0
12050,Malakal,2010-07-01,1.0,1.0,1.0,2.0
12053,Malakal,2010-10-01,2.0,1.0,1.0,2.0
12056,Malakal,2011-01-01,2.0,2.0,1.0,2.0
12059,Malakal,2011-04-01,2.0,2.0,2.0,2.0
12062,Malakal,2011-07-01,2.0,2.0,2.0,1.0
12065,Malakal,2011-10-01,1.0,2.0,2.0,1.0


In [29]:
# Attach prev/2prev/next IPC (known only for labelled months) to the full
# monthly table. The merge uses all columns the two frames share.
df_tabular = df_tabular.merge(df_pred, how='outer')

# The fills below are positional, so make the order explicit: the row order
# of an outer merge is not guaranteed to stay chronological (recent pandas
# versions sort the join keys).
df_tabular = df_tabular.sort_values(['district', 'year_month'], ignore_index=True)

# Propagate the neighbouring labelled values onto the unlabelled months:
#   next_ipc  -> forward fill  (carry the upcoming label forward)
#   prev_ipc  -> backward fill (carry the previous label back)
#   2prev_ipc -> backward fill
# Filling per district keeps values from leaking across district boundaries.
# Plain assignment replaces the chained `df[col].fillna(..., inplace=True)`,
# which does not update the frame under pandas Copy-on-Write.
# NOTE: months with no labelled neighbour inside their own district now stay
# NaN instead of borrowing a value from the adjacent district.
df_tabular['next_ipc'] = df_tabular.groupby('district')['next_ipc'].ffill()
df_tabular['prev_ipc'] = df_tabular.groupby('district')['prev_ipc'].bfill()
df_tabular['2prev_ipc'] = df_tabular.groupby('district')['2prev_ipc'].bfill()

# Keep only the months after 2009-10 and drop 2020-02.
df_tabular = df_tabular[df_tabular.year_month > '2009-10-01']
df_tabular = df_tabular[df_tabular.year_month != '2020-02-01']

In [30]:
# Spot-check the filled lag/lead columns for one district (index labels 9780-9800).
df_tabular[df_tabular.district=='Malakal'][['district', 'year_month', 'ipc', 'prev_ipc', '2prev_ipc', 'next_ipc']].loc[9780:9800]

Unnamed: 0,district,year_month,ipc,prev_ipc,2prev_ipc,next_ipc
9780,Malakal,2013-11-01,,1.0,2.0,2.0
9781,Malakal,2013-12-01,,1.0,2.0,2.0
9782,Malakal,2014-01-01,2.0,1.0,2.0,3.0
9783,Malakal,2014-02-01,,2.0,1.0,3.0
9784,Malakal,2014-03-01,,2.0,1.0,3.0
9785,Malakal,2014-04-01,3.0,2.0,1.0,4.0
9786,Malakal,2014-05-01,,3.0,2.0,4.0
9787,Malakal,2014-06-01,,3.0,2.0,4.0
9788,Malakal,2014-07-01,4.0,3.0,2.0,2.0
9789,Malakal,2014-08-01,,4.0,3.0,2.0


In [31]:
# Model input for the months without an IPC label: the same 17 feature
# columns, in the same order, that the classifier was trained on.
# NOTE: this reuses the name df_pred for a different frame than before.
df_pred = df_tabular.loc[
    df_tabular.ipc.isna(),
    [
        'ndvi_mean', 'ndvi_anom',
        'rain_mean', 'rain_anom',
        'et_mean', 'et_anom',
        'count_violence', 'sum_fatalities',
        'food_price_idx', 'area', 'cropland_pct', 'pop',
        'ruggedness_mean', 'pasture_pct',
        'prev_ipc', '2prev_ipc', 'next_ipc',
    ],
]

In [32]:
# Predict an IPC label for every row in df_pred (the months without a label).
preds = forest_clf_2.predict(df_pred)

In [33]:
# Index labels of the rows whose IPC is missing; the predictions are aligned to these.
df_tabular[df_tabular.ipc.isna()].index

Int64Index([   4,    5,    7,    8,   10,   11,   13,   14,   16,   17,
            ...
            9842, 9844, 9845, 9846, 9848, 9849, 9850, 9852, 9853, 9854],
           dtype='int64', length=6699)

In [34]:
# Wrap the predictions in a one-column frame ('pred') that carries the index
# labels of the rows with missing IPC, so they can be aligned with df_tabular.
preds = pd.DataFrame(
    preds,
    index=df_tabular[df_tabular.ipc.isna()].index,
    columns=['pred'],
)

In [35]:
# Fill the missing IPC labels with the model predictions, aligned on the index.
# Built with `assign` rather than `df['ipc'].fillna(..., inplace=True)`: the
# chained in-place form acts on a temporary column object and does not update
# the frame under pandas Copy-on-Write.
df_tabular = df_tabular.assign(ipc=df_tabular['ipc'].fillna(preds['pred']))

In [36]:
# Same spot-check as before the fill: the IPC column should no longer contain NaN here.
df_tabular[df_tabular.district=='Malakal'][['district', 'year_month', 'ipc', 'prev_ipc', '2prev_ipc', 'next_ipc']].loc[9780:9800]

Unnamed: 0,district,year_month,ipc,prev_ipc,2prev_ipc,next_ipc
9780,Malakal,2013-11-01,2.0,1.0,2.0,2.0
9781,Malakal,2013-12-01,1.0,1.0,2.0,2.0
9782,Malakal,2014-01-01,2.0,1.0,2.0,3.0
9783,Malakal,2014-02-01,2.0,2.0,1.0,3.0
9784,Malakal,2014-03-01,2.0,2.0,1.0,3.0
9785,Malakal,2014-04-01,3.0,2.0,1.0,4.0
9786,Malakal,2014-05-01,3.0,3.0,2.0,4.0
9787,Malakal,2014-06-01,3.0,3.0,2.0,4.0
9788,Malakal,2014-07-01,4.0,3.0,2.0,2.0
9789,Malakal,2014-08-01,3.0,4.0,3.0,2.0


In [38]:
# Persist the table with observed and predicted IPC values.
# NOTE(review): the target path has no file extension — confirm that downstream
# readers expect exactly this name before changing it.
df_tabular.to_csv('data/food_crises_interpol')

# Plot IPC progression