In [209]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier


In [206]:
def upsampler(X_train, y_train, target = 'coup', ratio = 1.0):
    '''
    Args: X_train and y_train
    Optional: what is the target
    Returns: y_train, and X_train with the target rows sampled with replacement to equal 
    the number of non-target rows (makes X_train much bigger)
    '''
    y_train = pd.Series(y_train)
    
    X = pd.concat([X_train, y_train], axis=1) 
    no_coup = X[X[target]==0]
    coup = X[X[target]==1]
    coups_upsampled = resample(coup,
                          replace=True, # sample with replacement
                          n_samples=int(len(no_coup)*ratio), # match number in majority class
                          random_state=30)
    upsampled = pd.concat([no_coup, coups_upsampled])
    y_up = upsampled[target]
    X_up = upsampled.drop(target, axis = 1)
    return X_up, y_up

def metric_test(model, X_test, y_test):
    '''
    Prints out the accuracy, recall, precision, and f1 score for the 
    fit model when it predicts on the test data
    '''
    preds = model.predict(X_test)
    print('accuracy = ' + str(accuracy_score(y_test, preds)))
    print('recall = ' + str(recall_score(y_test, preds)))
    print('precision = ' + str(precision_score(y_test, preds)))
    print('f1 score = ' + str(f1_score(y_test, preds)))
    
def get_feature_weights(model, feature_labels):
    '''
    returns coefficients for features in a model (intended for logistic regression) 
    args: model, feature_labels
    returns: a sorted series in ascending order of feature weights.
    '''
    d_log_vals = {}
    for idx, feat in enumerate(model.coef_[0]):
        d_log_vals[feature_labels[idx]] = feat  
    s_log_vals = (pd.Series(d_log_vals)).sort_values()
    return s_log_vals

In [186]:
df = pd.read_csv('../data/REIGN_2020_6.csv')

In [187]:
df['precip_sq'] = df['precip']**2

In [188]:
df['democracy'] = (df['government'] == 'Parliamentary Democracy') + (df['government'] == 'Presidential Democracy')

  op=op_str, alt_op=unsupported[op_str]


In [189]:
df['military'] = (df['government'] == 'Indirect Military') + (df['government'] == 'Military') + (df['government'] == 'Military-Personal') + (df['government'] == 'Party-Military') + (df['government'] == 'Party-Personal-Military Hybrid') + + (df['government'] == 'Provisional - Military')

In [190]:
df['personal'] = ((df['government'] == 'Military-Personal') 
+ (df['government'] == 'Monarchy') 
+ (df['government'] == 'Party-Personal') 
+ (df['government'] == 'Party-Personal-Military Hybrid') 
+ (df['government'] == 'Personal Dictatorship') 
+ (df['government'] == 'Presidential Democracy'))

In [191]:
df['provisional'] = ((df['government'] == 'Provisional - Civilian') 
+ (df['government'] == 'Provisional - Military'))

In [192]:
df.columns

Index(['ccode', 'country', 'leader', 'year', 'month', 'elected', 'age', 'male',
       'militarycareer', 'tenure_months', 'government', 'anticipation',
       'ref_ant', 'leg_ant', 'exec_ant', 'irreg_lead_ant', 'election_now',
       'election_recent', 'leg_recent', 'exec_recent', 'lead_recent',
       'ref_recent', 'direct_recent', 'indirect_recent', 'victory_recent',
       'defeat_recent', 'change_recent', 'nochange_recent', 'delayed',
       'lastelection', 'loss', 'irregular', 'prev_conflict', 'pt_suc',
       'pt_attempt', 'precip', 'couprisk', 'pctile_risk', 'precip_sq',
       'democracy', 'military', 'personal', 'provisional'],
      dtype='object')

In [193]:
df['yearcode'] = df['year'] + df['ccode']*10000

In [194]:
df_jan = df[df['month']==1]

In [195]:
coupyears = df.groupby('yearcode')['pt_attempt'].sum()

In [196]:
df_joined = df_jan.join(coupyears, on = 'yearcode', rsuffix = '_year')

In [197]:
df_joined['coup'] = (df_joined['pt_attempt_year'] > 0)

In [198]:
df = df_joined

In [199]:
df.drop(['pt_suc', 'pt_attempt', 'irregular',  'couprisk', 'pctile_risk', 'precip',  'country', 'leader', 'government', 'ccode', 'yearcode', 'pt_attempt_year'], axis = 1, inplace = True)

In [200]:
df_early = df[df['year']<1975]

In [201]:
df_late = df[df['year']>1974]

In [202]:
df_early

Unnamed: 0,year,month,elected,age,male,militarycareer,tenure_months,anticipation,ref_ant,leg_ant,...,delayed,lastelection,loss,prev_conflict,precip_sq,democracy,military,personal,provisional,coup
0,1950.0,1.0,1.0,66.0,1,0.0,58.0,0.0,0.0,0.0,...,0.0,2.639057,5.327876,0.0,0.004769,True,False,True,False,False
12,1951.0,1.0,1.0,67.0,1,0.0,70.0,0.0,0.0,0.0,...,0.0,3.258097,5.384495,0.0,0.041035,True,False,True,False,False
24,1952.0,1.0,1.0,68.0,1,0.0,82.0,0.0,0.0,0.0,...,0.0,3.637586,5.438079,0.0,0.155530,True,False,True,False,False
36,1953.0,1.0,1.0,69.0,1,0.0,94.0,0.0,0.0,0.0,...,0.0,1.098612,1.098612,0.0,0.442663,True,False,True,False,False
37,1953.0,1.0,1.0,63.0,1,1.0,1.0,0.0,0.0,0.0,...,0.0,1.098612,1.098612,0.0,0.442663,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135216,1970.0,1.0,0.0,49.0,1,0.0,97.0,1.0,0.0,1.0,...,0.0,3.583519,4.584968,0.0,0.027228,True,False,False,False,False
135229,1971.0,1.0,0.0,49.0,1,0.0,12.0,0.0,0.0,0.0,...,0.0,2.484907,2.484907,0.0,0.148361,True,False,False,False,False
135241,1972.0,1.0,0.0,50.0,1,0.0,24.0,0.0,0.0,0.0,...,0.0,3.178054,3.178054,0.0,0.499381,True,False,False,False,False
135253,1973.0,1.0,0.0,51.0,1,0.0,36.0,1.0,0.0,1.0,...,0.0,3.583519,3.583519,0.0,0.001855,True,False,False,False,False


In [223]:
elastic = LogisticRegressionCV(
            cv=5, dual=False,
            penalty='elasticnet', 
            scoring='accuracy',
            solver='saga', 
            n_jobs = 2,
            tol=0.001,
            max_iter=200,
            l1_ratios = [0, .3, .5, .7, 1])

In [237]:
extra = ExtraTreesClassifier(n_estimators=100, max_depth = 5)

In [287]:
rf = RandomForestClassifier(n_estimators=100, max_depth = 5)

In [283]:
y = df_early['coup']
X = df_early.drop('coup', axis =1)

In [290]:
X_late = df_late.drop('coup', axis =1)

In [284]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)
X_up, y_up = upsampler(X_train, y_train, ratio = 1)

In [285]:
extra.fit(X_up, y_up,)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=5, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [288]:
rf.fit(X_up, y_up,)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [286]:
metric_test(extra, X_test, y_test)

accuracy = 0.7969401947148818
recall = 0.54
precision = 0.18
f1 score = 0.27


In [289]:
metric_test(rf, X_test, y_test)

accuracy = 0.7329624478442281
recall = 0.6
precision = 0.1485148514851485
f1 score = 0.23809523809523808


In [296]:
(extra.predict_proba(X_late)[:, 0])

array([0.51829363, 0.52179895, 0.6246637 , ..., 0.70206552, 0.71512982,
       0.71944601])

In [294]:
df_late

Unnamed: 0,year,month,elected,age,male,militarycareer,tenure_months,anticipation,ref_ant,leg_ant,...,delayed,lastelection,loss,prev_conflict,precip_sq,democracy,military,personal,provisional,coup
305,1975.0,1.0,0.0,62.0,1,0.0,6.0,0.0,0.0,0.0,...,0.0,3.295837,4.317488,0.0,0.084402,True,False,True,False,False
317,1976.0,1.0,0.0,63.0,1,0.0,18.0,0.0,0.0,0.0,...,0.0,3.663562,4.465908,0.0,0.046742,True,False,True,False,False
329,1977.0,1.0,0.0,64.0,1,0.0,30.0,0.0,0.0,0.0,...,0.0,1.098612,1.098612,0.0,0.690019,True,False,True,False,False
330,1977.0,1.0,1.0,53.0,1,0.0,1.0,0.0,0.0,0.0,...,0.0,1.098612,1.098612,0.0,0.690019,True,False,True,False,False
342,1978.0,1.0,1.0,54.0,1,0.0,13.0,0.0,0.0,0.0,...,0.0,2.708050,2.708050,0.0,0.071481,True,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135779,2016.0,1.0,1.0,71.0,1,0.0,206.0,1.0,0.0,1.0,...,0.0,4.077538,5.817111,0.0,0.153210,True,False,False,False,False
135791,2017.0,1.0,1.0,72.0,1,0.0,218.0,0.0,0.0,0.0,...,0.0,2.397895,5.852202,0.0,0.000665,True,False,False,False,False
135803,2018.0,1.0,1.0,73.0,1,0.0,230.0,0.0,0.0,0.0,...,0.0,3.135494,5.886104,0.0,0.077359,True,False,False,False,False
135815,2019.0,1.0,1.0,74.0,1,0.0,242.0,0.0,0.0,0.0,...,0.0,3.555348,5.918894,0.0,0.044480,True,False,False,False,False


In [228]:
elastic.predict_proba(X_test)

array([[0.45013739, 0.54986261],
       [0.50091818, 0.49908182],
       [0.66869517, 0.33130483],
       ...,
       [0.57576569, 0.42423431],
       [0.42095614, 0.57904386],
       [0.4303246 , 0.5696754 ]])

In [None]:
for i in np.arange(0, 1.1, .1):
    for thresh in [.5]:
        preds = 1*((i*extra.predict_proba(X_test)[:, 0] + (1-i)*elastic.predict_proba(X_test)[:, 0]) > thresh)
        print('====' + str(i) + '====')
        print('accuracy = ' + str(accuracy_score(y_test, preds)))
        print('recall = ' + str(recall_score(y_test, preds)))
        print('precision = ' + str(precision_score(y_test, preds)))
        print('f1 score = ' + str(f1_score(y_test, preds)))

In [266]:
for thresh in [.3, .4, .5, .6, .7]:
    preds = 1*((elastic.predict_proba(X_test)[:, 0] > thresh))
    print('====' + str(thresh) + '====')
    print('accuracy = ' + str(accuracy_score(y_test, preds)))
    print('recall = ' + str(recall_score(y_test, preds)))
    print('precision = ' + str(precision_score(y_test, preds)))
    print('f1 score = ' + str(f1_score(y_test, preds)))       
    

====0.3====
accuracy = 0.06675938803894298
recall = 0.96
precision = 0.06694560669456066
f1 score = 0.12516297262059972
====0.4====
accuracy = 0.1070931849791377
recall = 0.74
precision = 0.05555555555555555
f1 score = 0.10335195530726256
====0.5====
accuracy = 0.4534075104311544
recall = 0.2
precision = 0.027548209366391185
f1 score = 0.048426150121065374
====0.6====
accuracy = 0.7385257301808067
recall = 0.06
precision = 0.020833333333333332
f1 score = 0.03092783505154639
====0.7====
accuracy = 0.8595271210013908
recall = 0.02
precision = 0.018867924528301886
f1 score = 0.019417475728155338


In [249]:
preds

array([0.58822784, 0.42302108, 0.55811316, 0.6177651 , 0.59622337,
       0.37056642, 0.65838743, 0.60549989, 0.52344858, 0.52266719,
       0.54609897, 0.47885354, 0.39415093, 0.66387789, 0.61038372,
       0.51728448, 0.50082935, 0.50648333, 0.45695622, 0.43960515,
       0.4485176 , 0.57094546, 0.48297842, 0.55386795, 0.51110083,
       0.55051138, 0.60802164, 0.69821613, 0.49216329, 0.57753614,
       0.60558562, 0.62696828, 0.49747618, 0.60298944, 0.26293652,
       0.51360975, 0.55922676, 0.52079833, 0.61908133, 0.33800634,
       0.44647829, 0.51585754, 0.42494965, 0.60304268, 0.28833013,
       0.55671943, 0.61773291, 0.75746698, 0.54929119, 0.51042234,
       0.50417581, 0.53557507, 0.6441242 , 0.41138863, 0.51350409,
       0.50411588, 0.60580295, 0.60312757, 0.59958068, 0.62057249,
       0.6046677 , 0.84818657, 0.85123978, 0.62710341, 0.48329362,
       0.54620279, 0.57220461, 0.33032623, 0.52427509, 0.60542685,
       0.50410541, 0.62868475, 0.52506973, 0.57696152, 0.54907

In [244]:
extra.predict_proba(X_test)[:, 0] + elastic.predict_proba(X_test)[:, 0]

719

In [229]:
for i in np.arange(.1, 1, .05):
    print('====')
    print(i)
    print(recall_score(y_test, 1*(elastic.predict_proba(X_test)[:, 1] > i)))
    

====
0.1
1.0
====
0.15000000000000002
0.98
====
0.20000000000000004
0.98
====
0.25000000000000006
0.98
====
0.30000000000000004
0.98
====
0.3500000000000001
0.96
====
0.40000000000000013
0.94
====
0.45000000000000007
0.9
====
0.5000000000000001
0.8
====
0.5500000000000002
0.52
====
0.6000000000000002
0.26
====
0.6500000000000001
0.06
====
0.7000000000000002
0.04
====
0.7500000000000002
0.0
====
0.8000000000000002
0.0
====
0.8500000000000002
0.0
====
0.9000000000000002
0.0
====
0.9500000000000003
0.0


In [297]:
get_feature_weights(elastic, X.columns)

democracy         -0.226589
loss              -0.173644
elected           -0.061259
lead_recent       -0.025088
leg_recent        -0.021930
age               -0.016880
election_recent   -0.016458
direct_recent     -0.015344
leg_ant           -0.013816
nochange_recent   -0.010318
victory_recent    -0.006846
precip_sq         -0.006769
defeat_recent     -0.005468
tenure_months     -0.002745
change_recent     -0.002196
exec_recent       -0.001898
indirect_recent   -0.000298
year               0.000360
election_now       0.001126
month              0.003254
male               0.003342
ref_recent         0.009409
ref_ant            0.012419
delayed            0.020994
irreg_lead_ant     0.035029
provisional        0.035206
anticipation       0.057507
exec_ant           0.060919
prev_conflict      0.084309
personal           0.104095
lastelection       0.121021
military           0.143069
militarycareer     0.189487
dtype: float64