In [9]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import OLS
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [52]:
def upsampler(X_train, y_train, target = 'pt_attempt'):
    '''
    Args: X_train and y_train
    Optional: what is the target
    Returns: y_train, and X_train with the target rows sampled with replacement to equal 
    the number of non-target rows (makes X_train much bigger)
    '''
    y_train = pd.Series(y_train)
    
    X = pd.concat([X_train, y_train], axis=1) 
    no_coup = X[X[target]==0]
    coup = X[X[target]==1]
    coups_upsampled = resample(coup,
                          replace=True, # sample with replacement
                          n_samples=len(no_coup), # match number in majority class
                          random_state=29)
    upsampled = pd.concat([no_coup, coups_upsampled])
    y_up = upsampled[target]
    X_up = upsampled.drop(target, axis = 1)
    return X_up, y_up

def metric_test(model, X_test, y_test):
    '''
    Prints out the accuracy, recall, precision, and f1 score for the 
    fit model when it predicts on the test data
    '''
    preds = model.predict(X_test)
    print('accuracy = ' + str(accuracy_score(y_test, preds)))
    print('recall = ' + str(recall_score(y_test, preds)))
    print('precision = ' + str(precision_score(y_test, preds)))
    print('f1 score = ' + str(f1_score(y_test, preds)))
    
def get_feature_weights(model, feature_labels):
    '''
    returns coefficients for features in a model (intended for logistic regression) 
    args: model, feature_labels
    returns: a sorted series in ascending order of feature weights.
    '''
    d_log_vals = {}
    for idx, feat in enumerate(model.coef_[0]):
        d_log_vals[feature_labels[idx]] = feat  
    s_log_vals = (pd.Series(d_log_vals)).sort_values()
    return s_log_vals

In [26]:
df_max = pd.read_pickle('../data/pickles/df_maximus.pkl')

In [27]:
df_max.shape

(7721, 67)

In [28]:
df = df_max.drop(['month'], axis =1).dropna()

In [29]:
y = df['pt_attempt']
X = df.drop(['pt_attempt','pt_suc'], axis = 1)

In [51]:
X.columns

Index(['year', 'elected', 'age', 'male', 'militarycareer', 'tenure_months',
       'anticipation', 'ref_ant', 'leg_ant', 'exec_ant', 'irreg_lead_ant',
       'election_now', 'election_recent', 'leg_recent', 'exec_recent',
       'lead_recent', 'ref_recent', 'direct_recent', 'indirect_recent',
       'victory_recent', 'defeat_recent', 'change_recent', 'nochange_recent',
       'delayed', 'lastelection', 'loss', 'irregular', 'prev_conflict',
       'precip',
       'Adolescent fertility rate (births per 1,000 women ages 15-19)',
       'Age dependency ratio (% of working-age population)',
       'Birth rate, crude (per 1,000 people)',
       'Death rate, crude (per 1,000 people)',
       'Fertility rate, total (births per woman)',
       'Life expectancy at birth, female (years)',
       'Life expectancy at birth, male (years)',
       'Mortality rate, adult, male (per 1,000 male adults)',
       'Population ages 0-14 (% of total population)',
       'Population growth (annual %)',
     

In [64]:
clf = RandomForestClassifier(max_depth=5, n_estimators=1000)

In [65]:
rfpipe = Pipeline([('scaler', StandardScaler()),('rf', clf)])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)

In [32]:
X_up, y_up = upsampler(X_train, y_train)

In [66]:
rfpipe.fit(X_up, y_up)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('rf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=5,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=1000, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [34]:
metric_test(rfpipe, X_test, y_test)

accuracy = 0.7226435536294691
recall = 0.7023809523809523
precision = 0.10805860805860806
f1 score = 0.1873015873015873


In [25]:
#metric_test(rfpipe, X_test, y_test)

accuracy = 0.7161430119176598
recall = 0.7023809523809523
precision = 0.1057347670250896
f1 score = 0.1838006230529595


In [39]:
ridge_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='l1', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,)

In [40]:
logl1pipe = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled)])

In [41]:
logl1pipe.fit(X_up, y_up)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge_scaled',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='warn',
                                      n_jobs=2, penalty='l1', random_state=None,
                                      refit=True, scoring='recall',
                                      solver='saga', tol=0.0001, verbose=0))],
         verbose=False)

In [42]:
metric_test(logl1pipe, X_test, y_test)

accuracy = 0.7210184182015168
recall = 0.7023809523809523
precision = 0.10746812386156648
f1 score = 0.1864139020537125


In [50]:
len(ridge_scaled.coef_[0]), 

64

In [54]:
get_feature_weights(ridge_scaled, X.columns)

irregular                                                       -0.437488
Life expectancy at birth, female (years)                        -0.386778
upop                                                            -0.341833
cinc                                                            -0.323442
loss                                                            -0.262181
election_recent                                                 -0.261735
year                                                            -0.254180
Monarchy                                                        -0.246250
exports                                                         -0.234292
Oligarchy                                                       -0.230480
Dominant Party                                                  -0.219033
Rural population (% of total population)                        -0.205509
male                                                            -0.181997
precip                                

In [55]:
elastic_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='elasticnet', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,
        l1_ratios = [0, .3, .5, .7, 1])

In [56]:
logl1pipe2 = Pipeline([('scaler', StandardScaler()),('elastic_scaled', elastic_scaled)])

In [57]:
logl1pipe2.fit(X_up, y_up)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('elastic_scaled',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0,
                                      l1_ratios=[0, 0.3, 0.5, 0.7, 1],
                                      max_iter=100, multi_class='warn',
                                      n_jobs=2, penalty='elasticnet',
                                      random_state=None, refit=True,
                                      scoring='recall', solver='saga',
                                      tol=0.0001, verbose=0))],
         verbose=False)

In [59]:
metric_test(logl1pipe, X_test, y_test)

accuracy = 0.7210184182015168
recall = 0.7023809523809523
precision = 0.10746812386156648
f1 score = 0.1864139020537125


In [60]:
metric_test(rfpipe, X_test, y_test)

accuracy = 0.7226435536294691
recall = 0.7023809523809523
precision = 0.10805860805860806
f1 score = 0.1873015873015873


In [76]:
df2 = df_max.drop(['month', 'pec'], axis =1).dropna()
y2 = df2['pt_attempt']
X2 = df2.drop(['pt_attempt','pt_suc'], axis = 1)

In [77]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size= .25, random_state= 40, stratify = y2)

In [78]:
X_up2, y_up2 = upsampler(X_train2, y_train2)

In [79]:
ridge_scaled2 = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='l1', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,)

In [80]:
logl1pipe2 = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled2)])

In [81]:
logl1pipe2.fit(X_up2, y_up2)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge_scaled',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='warn',
                                      n_jobs=2, penalty='l1', random_state=None,
                                      refit=True, scoring='recall',
                                      solver='saga', tol=0.0001, verbose=0))],
         verbose=False)

In [82]:
get_feature_weights(ridge_scaled2, X2.columns)

irregular                                                       -0.430606
upop                                                            -0.404527
Life expectancy at birth, female (years)                        -0.381239
election_recent                                                 -0.269523
loss                                                            -0.268412
year                                                            -0.242800
Monarchy                                                        -0.237570
Oligarchy                                                       -0.226239
Rural population (% of total population)                        -0.217598
Dominant Party                                                  -0.210470
male                                                            -0.175486
exports                                                         -0.106335
precip                                                          -0.096744
Party-Personal                        

In [83]:
metric_test(logl1pipe2, X_test2, y_test2)

accuracy = 0.7215601300108342
recall = 0.7023809523809523
precision = 0.10766423357664233
f1 score = 0.18670886075949364


In [85]:
lasso_tanglers = ['cinc' ,                                                            
'Mortality rate, adult, male (per 1,000 male adults)',              
'Life expectancy at birth, male (years)',                           
'imports',                                                         
'Death rate, crude (per 1,000 people)',                             
'milper',                                                           
'Presidential Democracy',                                           
'irst',                                                             
'Foreign/Occupied',                                                 
'Birth rate, crude (per 1,000 people)',                             
'exec_recent',                                                      
'lead_recent',                                                     
'nochange_recent',                                                  
'anticipation',                                                     
'defeat_recent',                                                    
'election_now',                                                     
'direct_recent',                                                    
'indirect_recent']                                              

In [96]:
df3 = df2.drop(lasso_tanglers, axis =1).dropna()


In [97]:
y3 = df3['pt_attempt']
X3 = df3.drop(['pt_attempt','pt_suc'], axis = 1)

In [98]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size= .25, random_state= 40, stratify = y3)

In [99]:
X_up3, y_up3 = upsampler(X_train3, y_train3)

In [105]:
ridge_scaled3 = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='elasticnet', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,
        l1_ratios = [0, .3, .5, .7, 1])

In [106]:
logl1pipe3 = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled3)])

In [107]:
logl1pipe3.fit(X_up3, y_up3)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge_scaled',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0,
                                      l1_ratios=[0, 0.3, 0.5, 0.7, 1],
                                      max_iter=100, multi_class='warn',
                                      n_jobs=2, penalty='elasticnet',
                                      random_state=None, refit=True,
                                      scoring='recall', solver='saga',
                                      tol=0.0001, verbose=0))],
         verbose=False)

In [108]:
metric_test(logl1pipe3, X_test3, y_test3)

accuracy = 0.7210184182015168
recall = 0.7142857142857143
precision = 0.1088929219600726
f1 score = 0.1889763779527559


In [109]:
get_feature_weights(ridge_scaled3, X3.columns)

irregular                                                       -0.369919
Life expectancy at birth, female (years)                        -0.287438
year                                                            -0.214759
upop                                                            -0.203516
loss                                                            -0.202883
Dominant Party                                                  -0.195510
Monarchy                                                        -0.183336
Oligarchy                                                       -0.151417
male                                                            -0.128644
election_recent                                                 -0.084656
precip                                                          -0.066110
leg_recent                                                      -0.064903
Rural population (% of total population)                        -0.062591
Party-Personal                        

In [110]:
lasso_drops_2 = ['Adolescent fertility rate (births per 1,000 women ages 15-19)',   
'milex',                                                            
'Population growth (annual %)',                                    
'Age dependency ratio (% of working-age population)',               
'elected',                                                          
'change_recent',                                                    
'victory_recent']                                                 

In [111]:
df4 = df3.drop(lasso_drops_2, axis =1).dropna()


In [115]:
y4 = df4['pt_attempt']
X4 = df4.drop(['pt_attempt','pt_suc'], axis = 1)

X_train4, X_test4, y_train4, y_test4 = train_test_split(X4, y4, test_size= .25, random_state= 40, stratify = y4)



In [120]:
X_train4.shape

(5537, 38)

In [121]:
X_test4.shape

(1846, 38)

In [122]:
y_train4.shape

(5537,)

In [123]:
y_test4.shape

(1846,)

In [118]:
X_up4, y_up4 = upsampler(X_train4, y_train4)


In [125]:
X_up4.shape

(10568, 38)

In [126]:
y_up4.shape

(10568,)

In [127]:
ridge_scaled4 = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='elasticnet', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,
        l1_ratios = [0, .3, .5, .7, 1])

logl1pipe4 = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled4)])

In [128]:
logl1pipe4.fit(X_up4, y_up4)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge_scaled',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0,
                                      l1_ratios=[0, 0.3, 0.5, 0.7, 1],
                                      max_iter=100, multi_class='warn',
                                      n_jobs=2, penalty='elasticnet',
                                      random_state=None, refit=True,
                                      scoring='recall', solver='saga',
                                      tol=0.0001, verbose=0))],
         verbose=False)

In [129]:
metric_test(logl1pipe4, X_test4, y_test4)

accuracy = 0.7210184182015168
recall = 0.7142857142857143
precision = 0.1088929219600726
f1 score = 0.1889763779527559


In [130]:
get_feature_weights(ridge_scaled4, X4.columns)

irregular                                      -0.369920
Life expectancy at birth, female (years)       -0.287451
year                                           -0.214754
upop                                           -0.203510
loss                                           -0.202876
Dominant Party                                 -0.195519
Monarchy                                       -0.183342
Oligarchy                                      -0.151418
male                                           -0.128643
election_recent                                -0.084658
precip                                         -0.066109
leg_recent                                     -0.064901
Rural population (% of total population)       -0.062598
Party-Personal                                 -0.060643
exports                                        -0.056344
leg_ant                                        -0.049090
tenure_months                                  -0.045822
Party-Personal-Military Hybrid 