# Communal Conflict model in Ethiopia 
## Ethiopia, admin2, monthly data
### 3. XGBoost Classifier 

In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold


In [2]:
#os.chdir('/home/ubuntu/darpa/models/conflict_probability')

In [3]:
# Download the communal-conflict dataset; the first CSV column becomes the index.
url = 'https://data.kimetrica.com/dataset/4dbc3cc7-9474-49f2-bfd4-231e78401caa/resource/7423b71d-ce8c-437c-9fe6-2d9ba58d6155/download/dataset_communal_cnflict_model.csv'
df = pd.read_csv(url, index_col=0)
# Show the available column names as a plain list.
df.columns.tolist()

['admin1',
 'admin2',
 'year',
 'month',
 'fatalities',
 'cc_event_count',
 'actor_state',
 'actor_rebel_groups',
 'actor_political_militias',
 'actor_identity_militias',
 'actor_civilians',
 'actor_others',
 'cc_onset',
 'admin0',
 'inflation_all',
 'inflation_food',
 'inflation_non_food',
 'rainfall',
 'rainfall_lag',
 'temperature mean',
 'btotl',
 'maize_ETB_KG',
 'teff_ETB_KG',
 'wheat_ETB_KG',
 'mean ndvi',
 'ndvi_lag',
 'cluster']

In [4]:
# Peek at the last five rows of the loaded frame.
df.tail(n=5)

Unnamed: 0,admin1,admin2,year,month,fatalities,cc_event_count,actor_state,actor_rebel_groups,actor_political_militias,actor_identity_militias,...,rainfall,rainfall_lag,temperature mean,btotl,maize_ETB_KG,teff_ETB_KG,wheat_ETB_KG,mean ndvi,ndvi_lag,cluster
15890,Tigray,Western Tigray,2020,8,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1722534.0,0.0,0.0,0.0,0.0,0.0,Tigray_Western Tigray
15891,Tigray,Western Tigray,2020,9,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1722534.0,0.0,0.0,0.0,0.0,0.0,Tigray_Western Tigray
15892,Tigray,Western Tigray,2020,10,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1722534.0,0.0,0.0,0.0,0.0,0.0,Tigray_Western Tigray
15893,Tigray,Western Tigray,2020,11,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1722534.0,0.0,0.0,0.0,0.0,0.0,Tigray_Western Tigray
15894,Tigray,Western Tigray,2020,12,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1722534.0,0.0,0.0,0.0,0.0,0.0,Tigray_Western Tigray


In [5]:
# Cast year to int so the comparisons below are integer comparisons,
# then keep only rows with 2006 < year < 2019.
df['year'] = df['year'].astype(int)
df = df[(df['year'] > 2006) & (df['year'] < 2019)]
df.shape

(7975, 27)

In [6]:
# Split by year: rows up to and including 2013 are used for training,
# rows from 2014 onwards are held out for testing (year is an int column).
train = df[df['year'] <= 2013]
test = df[df['year'] >= 2014]


In [7]:
# Column names available in the training split.
train.columns.tolist()

['admin1',
 'admin2',
 'year',
 'month',
 'fatalities',
 'cc_event_count',
 'actor_state',
 'actor_rebel_groups',
 'actor_political_militias',
 'actor_identity_militias',
 'actor_civilians',
 'actor_others',
 'cc_onset',
 'admin0',
 'inflation_all',
 'inflation_food',
 'inflation_non_food',
 'rainfall',
 'rainfall_lag',
 'temperature mean',
 'btotl',
 'maize_ETB_KG',
 'teff_ETB_KG',
 'wheat_ETB_KG',
 'mean ndvi',
 'ndvi_lag',
 'cluster']

In [8]:
# Feature matrix for training. Column order is kept exactly as listed.
# Note: 'actor_identity_militias' exists in the frame but is not selected here.
X_train = train.loc[:, [
    # actor_* columns
    'actor_state',
    'actor_rebel_groups',
    'actor_political_militias',
    'actor_civilians',
    'actor_others',
    # remaining covariates
    'rainfall_lag',
    'temperature mean',
    'btotl',
    'inflation_all',
    'maize_ETB_KG',
    'teff_ETB_KG',
    'wheat_ETB_KG',
    'ndvi_lag',
]]

In [9]:
# Training target: the cc_onset column of the training split.
y_train = train.loc[:, 'cc_onset']

In [10]:
# Feature matrix for the held-out split; same columns, same order as X_train.
X_test = test.loc[:, [
    # actor_* columns
    'actor_state',
    'actor_rebel_groups',
    'actor_political_militias',
    'actor_civilians',
    'actor_others',
    # remaining covariates
    'rainfall_lag',
    'temperature mean',
    'btotl',
    'inflation_all',
    'maize_ETB_KG',
    'teff_ETB_KG',
    'wheat_ETB_KG',
    'ndvi_lag',
]]

In [11]:
# Held-out target: the cc_onset column of the test split.
y_test = test.loc[:, 'cc_onset']

In [12]:
# Baseline classifier using the library's default hyperparameters.
model1 = xgb.XGBClassifier()

# Second configuration; it is only constructed in this cell, not fitted.
model2 = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.5,
)

# Fit the baseline on the training split.
train_model1 = model1.fit(X_train, y_train)


In [13]:
# Predict on the held-out split with the default-parameter model.
# (classification_report is imported once, in the notebook's import cell,
# rather than re-imported here where it is not used.)
pred1 = train_model1.predict(X_test)

In [14]:
# Per-class precision / recall / F1 for the default-parameter model.
# classification_report comes from the notebook's import cell; the duplicate
# in-cell import was removed.
print(classification_report(y_test, pred1))

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99      3138
         1.0       1.00      0.43      0.60       162

    accuracy                           0.97      3300
   macro avg       0.99      0.71      0.79      3300
weighted avg       0.97      0.97      0.97      3300



# Hyperparameter tuning of XGBoost

In [17]:
# Starting point for hyperparameter tuning: a hand-set configuration.
# No search is run in this cell; it only constructs the estimator.
model3 = xgb.XGBClassifier(
    objective='binary:logistic',
    # boosting schedule
    n_estimators=1000,
    learning_rate=0.1,
    # tree shape / split constraints
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    # row and column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # class weighting, threads, seed
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)

In [18]:
# Fit the hand-set configuration, predict the held-out split, report accuracy in percent.
train_model3 = model3.fit(X_train, y_train)
pred3 = train_model3.predict(X_test)
print("Accuracy for model 3: %.2f" % (100 * accuracy_score(y_test, pred3)))


Accuracy for model 3: 96.09


In [19]:
# Grid-search max_depth and min_child_weight with 5-fold CV, ranked by ROC AUC.
param_test = {
    'max_depth': [4, 5, 6],
    'min_child_weight': [4, 5, 6],
}
# The former `iid=False` argument is omitted: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises TypeError), and 0.22+ already averages
# fold scores without weighting, which is what iid=False requested.
gsearch = GridSearchCV(
    estimator=xgb.XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=5, min_child_weight=2,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27,
    ),
    param_grid=param_test,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

train_model4 = gsearch.fit(X_train, y_train)
# Show what the search selected so values reused in later cells are traceable.
print("Best parameters for model 4:", gsearch.best_params_)

# predict() on a fitted GridSearchCV uses the refitted best estimator.
pred4 = train_model4.predict(X_test)
# NOTE(review): the recorded run printed 95.09, which equals 3138/3300 -- the
# class-0 share of the test set in the earlier classification report. Accuracy
# alone cannot distinguish this model from one that never predicts class 1;
# confirm with per-class recall.
print("Accuracy for model 4: %.2f" % (accuracy_score(y_test, pred4) * 100))



Accuracy for model 4: 95.09


In [20]:
# Second search: wider min_child_weight range at max_depth=4, 5-fold CV, ROC AUC.
param_test2b = {
    'min_child_weight': [6, 8, 10, 12],
}
# The former `iid=False` argument is omitted: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises TypeError), and 0.22+ already averages
# fold scores without weighting, which is what iid=False requested.
gsearch2b = GridSearchCV(
    estimator=xgb.XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=4, min_child_weight=2,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27,
    ),
    param_grid=param_test2b,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

train_model5 = gsearch2b.fit(X_train, y_train)
# Show what the search selected so values reused in later cells are traceable.
print("Best parameters for model 5:", gsearch2b.best_params_)

# predict() on a fitted GridSearchCV uses the refitted best estimator.
pred5 = train_model5.predict(X_test)
print("Accuracy for model 5: %.2f" % (accuracy_score(y_test, pred5) * 100))



Accuracy for model 5: 95.09


In [21]:
# Tune gamma over 0.0, 0.1, ..., 0.4 at max_depth=4 / min_child_weight=6.
param_test3 = {
    'gamma': [i / 10.0 for i in range(0, 5)],
}
# The former `iid=False` argument is omitted: it was removed from GridSearchCV in
# scikit-learn 0.24 (passing it raises TypeError), and 0.22+ already averages
# fold scores without weighting, which is what iid=False requested.
gsearch3 = GridSearchCV(
    estimator=xgb.XGBClassifier(
        learning_rate=0.1, n_estimators=140, max_depth=4, min_child_weight=6,
        gamma=0, subsample=0.8, colsample_bytree=0.8,
        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5,
)

train_model6 = gsearch3.fit(X_train, y_train)
# Show what the search selected so values reused in later cells are traceable.
print("Best parameters for model 6:", gsearch3.best_params_)

# predict() on a fitted GridSearchCV uses the refitted best estimator.
pred6 = train_model6.predict(X_test)
print("Accuracy for model 6: %.2f" % (accuracy_score(y_test, pred6) * 100))



Accuracy for model 6: 95.09


In [22]:
# Hand-set configuration reusing max_depth=4 / min_child_weight=6 / gamma=0.
# NOTE(review): learning_rate=0.7 combined with 1000 trees is unusually
# aggressive -- confirm it is intended rather than a typo for a smaller rate.
xgb2 = xgb.XGBClassifier(
    objective='binary:logistic',
    # boosting schedule
    n_estimators=1000,
    learning_rate=0.7,
    # tree shape / split constraints
    max_depth=4,
    min_child_weight=6,
    gamma=0,
    # row and column subsampling
    subsample=0.8,
    colsample_bytree=0.8,
    # class weighting, threads, seed
    scale_pos_weight=1,
    nthread=4,
    seed=27,
)

train_model7 = xgb2.fit(X_train, y_train)
pred7 = train_model7.predict(X_test)
# NOTE(review): the recorded 95.09 equals 3138/3300, the class-0 share of the
# test set in the earlier classification report; check per-class recall before
# reading this as model quality.
print("Accuracy for model 7: %.2f" % (100 * accuracy_score(y_test, pred7)))

Accuracy for model 7: 95.09


In [23]:
# Quick random-forest comparison on the same features and split.
# random_state is fixed so the bootstrap / feature sampling, and therefore the
# printed accuracy, is reproducible across reruns; 27 matches the seed used for
# the XGBoost models in this notebook. RandomForestClassifier is imported in
# the notebook's import cell.
rfc = RandomForestClassifier(random_state=27)
rfc_model = rfc.fit(X_train, y_train)
pred8 = rfc_model.predict(X_test)
print("Accuracy for Random Forest Model: %.2f" % (accuracy_score(y_test, pred8) * 100))

Accuracy for Random Forest Model: 97.18
