In [111]:
import numpy as np
import pandas as pd
from inv_dict import wb_cow_dict

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import OLS
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [112]:
def get_cc(val):
    if val in wb_cow_dict:
        return wb_cow_dict[val]
    else:
        return 0
    
def get_year(val):
    return int(val)

def add_wd_rows(reign_df, wdi_df, variable_list):
    joint_df = reign_df.copy()
    yearlist = [str(i) for i in np.arange(1960, 2020)]
    for i in variable_list:
        df = wdi_df[wdi_df['Indicator Name'] == i]
        dfx = pd.melt(df, id_vars = ['Country Name'], value_vars=yearlist)
        dfx['ccode'] = dfx['Country Name'].apply(get_cc)
        dfx['year'] = dfx['variable'].apply(get_year)
        dfx['yearcode'] = (dfx['year']) + 10000*dfx['ccode']
        dfx[i] = dfx['value']
        dfx_limited = dfx[[i, 'yearcode']]
        joint_df = joint_df.join(dfx_limited.set_index('yearcode'), on='yearcode', how = 'inner')
    return joint_df

In [113]:
def upsampler(X_train, y_train, target = 'pt_attempt', ratio = 1.0):
    '''
    Args: X_train and y_train
    Optional: what is the target
    Returns: y_train, and X_train with the target rows sampled with replacement to equal 
    the number of non-target rows (makes X_train much bigger)
    '''
    y_train = pd.Series(y_train)
    
    X = pd.concat([X_train, y_train], axis=1) 
    no_coup = X[X[target]==0]
    coup = X[X[target]==1]
    coups_upsampled = resample(coup,
                          replace=True, # sample with replacement
                          n_samples=int(len(no_coup)*ratio), # match number in majority class
                          random_state=29)
    upsampled = pd.concat([no_coup, coups_upsampled])
    y_up = upsampled[target]
    X_up = upsampled.drop(target, axis = 1)
    return X_up, y_up

def metric_test(model, X_test, y_test):
    '''
    Prints out the accuracy, recall, precision, and f1 score for the 
    fit model when it predicts on the test data
    '''
    preds = model.predict(X_test)
    print('accuracy = ' + str(accuracy_score(y_test, preds)))
    print('recall = ' + str(recall_score(y_test, preds)))
    print('precision = ' + str(precision_score(y_test, preds)))
    print('f1 score = ' + str(f1_score(y_test, preds)))
    
def get_feature_weights(model, feature_labels):
    '''
    returns coefficients for features in a model (intended for logistic regression) 
    args: model, feature_labels
    returns: a sorted series in ascending order of feature weights.
    '''
    d_log_vals = {}
    for idx, feat in enumerate(model.coef_[0]):
        d_log_vals[feature_labels[idx]] = feat  
    s_log_vals = (pd.Series(d_log_vals)).sort_values()
    return s_log_vals

In [114]:
variable_list = ['Life expectancy at birth, female (years)', 'GDP growth (annual %)', 'Mineral rents (% of GDP)', 'Oil rents (% of GDP)', 'Trade (% of GDP)', 'Foreign direct investment, net inflows (% of GDP)', 'Natural gas rents (% of GDP)', 'Population ages 0-14 (% of total population)', 'Rural population (% of total population)',  'Population growth (annual %)', 'Arable land (hectares per person)',
 'Merchandise exports (current US$)',
 'Merchandise imports (current US$)',
 'Primary education, duration (years)']

In [115]:
wdi_df = pd.read_pickle('../data/wdi_complete.pkl')

In [116]:
reign_df = pd.read_pickle('../data/year_agg.pkl')
dummies = pd.get_dummies(reign_df['government'])
df_dumb = reign_df.join(dummies)
df_dumb['pt_attempt'] = df_dumb['coupyear']
df_dumb['pt_suc'] = df_dumb['coupsuc']
df = df_dumb.drop(['ccode', 'country', 'leader', 'month', 'government', 'coupyear', 'coupsuc'], axis = 1)

In [117]:
joint_df = add_wd_rows(df, wdi_df, variable_list)

In [118]:
joint_df_thinner = add_wd_rows(df, wdi_df, variable_list)

In [119]:
joint_df_thinner.dropna()

Unnamed: 0,year,elected,age,male,militarycareer,tenure_months,anticipation,ref_ant,leg_ant,exec_ant,irreg_lead_ant,election_now,election_recent,leg_recent,exec_recent,lead_recent,ref_recent,direct_recent,indirect_recent,victory_recent,defeat_recent,change_recent,nochange_recent,delayed,lastelection,loss,irregular,prev_conflict,precip,yearcode,Dominant Party,Foreign/Occupied,Indirect Military,Military,Military-Personal,Monarchy,Oligarchy,Parliamentary Democracy,Party-Military,Party-Personal,Party-Personal-Military Hybrid,Personal Dictatorship,Presidential Democracy,Provisional - Civilian,Provisional - Military,Warlordism,pt_attempt,pt_suc,"Life expectancy at birth, female (years)",GDP growth (annual %),Mineral rents (% of GDP),Oil rents (% of GDP),Trade (% of GDP),"Foreign direct investment, net inflows (% of GDP)",Natural gas rents (% of GDP),Population ages 0-14 (% of total population),Rural population (% of total population),Population growth (annual %),Arable land (hectares per person),Merchandise exports (current US$),Merchandise imports (current US$),"Primary education, duration (years)"
23,1970.0,1.0,57.0,1,0.0,13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.708050,2.708050,7.682943,0.0,-0.409222,21970.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False,74.700,-0.254080,0.172617,0.378599,10.758285,0.113668,0.008250,28.097460,26.398,1.165003,0.920425,4.322500e+10,4.242800e+10,6.0
24,1971.0,1.0,58.0,1,0.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.295837,3.295837,7.688455,0.0,-0.074137,21971.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False,75.000,3.293362,0.097731,0.405338,10.757179,0.066103,0.016760,27.474503,26.387,1.264334,0.905996,4.354900e+10,4.834200e+10,6.0
25,1972.0,1.0,59.0,1,0.0,37.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.663562,3.663562,7.693937,0.0,0.047622,21972.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False,75.100,5.258895,0.090264,0.398848,11.340620,0.099288,0.041659,26.869139,26.377,1.070523,0.893514,4.919900e+10,5.886200e+10,6.0
26,1973.0,1.0,60.0,1,0.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.098612,3.931826,7.699389,0.0,0.176349,21973.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False,75.300,5.645719,0.173750,0.418392,13.079286,0.135403,0.121473,26.262619,26.367,0.954477,0.882690,7.082300e+10,7.357200e+10,6.0
27,1974.0,1.0,61.0,1,0.0,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.708050,4.143135,7.704812,0.0,0.498646,21974.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False,75.900,-0.540547,0.283448,1.548155,16.444986,0.229090,1.041549,25.639885,26.357,0.913660,0.871959,9.943700e+10,1.104780e+11,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11116,2012.0,0.0,60.0,1,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.216606,6.216606,6.216606,0.0,0.274661,9552012.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,False,False,72.202,0.890323,0.000000,0.000000,77.769848,-0.178152,0.000000,37.028529,76.565,-0.799814,0.175205,1.600000e+07,1.990000e+08,6.0
11117,2013.0,0.0,61.0,1,0.0,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.240276,6.240276,6.240276,0.0,0.277648,9552013.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,False,False,72.282,-3.120604,0.000000,0.000000,82.932608,1.391103,0.000000,36.850483,76.618,-0.947661,0.176873,1.700000e+07,1.980000e+08,6.0
11118,2014.0,0.0,62.0,1,0.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.263398,6.263398,6.263398,0.0,0.322612,9552014.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,False,False,72.363,2.072607,0.000000,0.000000,74.240186,2.982906,0.000000,36.635388,76.672,-0.729801,0.178168,1.900000e+07,2.190000e+08,6.0
11119,2015.0,0.0,74.0,1,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.285998,6.285998,6.285998,0.0,-0.143210,9552015.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,False,False,72.449,3.711847,0.000000,0.000000,81.744735,2.881435,0.000000,36.406664,76.725,-0.244786,0.178605,1.800000e+07,2.090000e+08,6.0


In [120]:
sum(joint_df['pt_attempt'])

362

In [121]:
362/5939

0.060953022394342485

In [122]:
joint_df_thinner.columns

Index(['year', 'elected', 'age', 'male', 'militarycareer', 'tenure_months',
       'anticipation', 'ref_ant', 'leg_ant', 'exec_ant', 'irreg_lead_ant',
       'election_now', 'election_recent', 'leg_recent', 'exec_recent',
       'lead_recent', 'ref_recent', 'direct_recent', 'indirect_recent',
       'victory_recent', 'defeat_recent', 'change_recent', 'nochange_recent',
       'delayed', 'lastelection', 'loss', 'irregular', 'prev_conflict',
       'precip', 'yearcode', 'Dominant Party', 'Foreign/Occupied',
       'Indirect Military', 'Military', 'Military-Personal', 'Monarchy',
       'Oligarchy', 'Parliamentary Democracy', 'Party-Military',
       'Party-Personal', 'Party-Personal-Military Hybrid',
       'Personal Dictatorship', 'Presidential Democracy',
       'Provisional - Civilian', 'Provisional - Military', 'Warlordism',
       'pt_attempt', 'pt_suc', 'Life expectancy at birth, female (years)',
       'GDP growth (annual %)', 'Mineral rents (% of GDP)',
       'Oil rents (% of GD

In [123]:
joint_df_x = joint_df_thinner.dropna()

In [155]:
joint_df_x.shape

(5939, 62)

In [124]:
y = joint_df_x ['pt_attempt']
X = joint_df_x .drop(['pt_attempt','pt_suc'], axis = 1)

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)

In [126]:
ridge_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='l1', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,)

In [127]:
X_up, y_up = upsampler(X_train, y_train, ratio = 1)

In [128]:
logl1pipe = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled)])
logl1pipe.fit(X_up, y_up)



Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge_scaled',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0, l1_ratios=None,
                                      max_iter=100, multi_class='warn',
                                      n_jobs=2, penalty='l1', random_state=None,
                                      refit=True, scoring='recall',
                                      solver='saga', tol=0.0001, verbose=0))],
         verbose=False)

In [66]:
metric_test(logl1pipe, X_test, y_test)

accuracy = 0.7504520795660036
recall = 0.6274509803921569
precision = 0.07494145199063232
f1 score = 0.13389121338912136


In [83]:
metric_test(logl1pipe, X_test, y_test)

accuracy = 0.7858585858585858
recall = 0.6304347826086957
precision = 0.08787878787878788
f1 score = 0.15425531914893617


In [84]:
get_feature_weights(ridge_scaled, X.columns)

election_recent                                     -1.386804
GDP growth (annual %)                               -0.914839
irregular                                           -0.689164
Foreign/Occupied                                    -0.595153
year                                                -0.428178
Oil rents (% of GDP)                                -0.325519
Monarchy                                            -0.286168
Oligarchy                                           -0.274196
election_now                                        -0.266318
Warlordism                                          -0.237600
indirect_recent                                     -0.221765
Parliamentary Democracy                             -0.221238
male                                                -0.220464
Indirect Military                                   -0.213672
Dominant Party                                      -0.200298
Trade (% of GDP)                                    -0.190735
loss    

In [129]:
joint_df_2x = joint_df_thinner.drop(['direct_recent', 'Merchandise imports (current US$)', 'Foreign direct investment, net inflows (% of GDP)', 'elected', 'Presidential Democracy'], axis =1)

In [130]:
joint_df_2 = joint_df_2x.dropna()

In [89]:
y = joint_df_2['pt_attempt']
X = joint_df_2.drop(['pt_attempt','pt_suc'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)
ridge_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='l1', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,)
X_up, y_up = upsampler(X_train, y_train, ratio = 1)
logl1pipe = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled)])
logl1pipe.fit(X_up, y_up)
metric_test(logl1pipe, X_test, y_test)
get_feature_weights(ridge_scaled, X.columns)



accuracy = 0.7683653209794837
recall = 0.8478260869565217
precision = 0.10209424083769633
f1 score = 0.1822429906542056


GDP growth (annual %)                          -0.484639
irregular                                      -0.445288
Life expectancy at birth, female (years)       -0.257096
Monarchy                                       -0.224793
Dominant Party                                 -0.215266
year                                           -0.168444
Trade (% of GDP)                               -0.167209
Parliamentary Democracy                        -0.166837
male                                           -0.113961
election_recent                                -0.110919
victory_recent                                 -0.100807
Oligarchy                                      -0.079728
Oil rents (% of GDP)                           -0.054586
Foreign/Occupied                               -0.022874
yearcode                                       -0.014647
nochange_recent                                -0.006398
Party-Personal                                 -0.000402
ref_recent                     

In [131]:
new_drops = ['ref_recent',                                      
'ref_ant'     ,                                   
'Party-Military',                                  
'Party-Personal-Military Hybrid',                  
'Personal Dictatorship'          ,                 
'Provisional - Military'          ,                
'Warlordism'                       ,             
'anticipation'                      ,              
'tenure_months'                      ,             
'militarycareer'                      ,            
'age'                                  ,           
'Natural gas rents (% of GDP)'          ,          
'Rural population (% of total population)',        
'Arable land (hectares per person)'        ,       
'lead_recent'                               ,      
'exec_ant'                                   ,   
'leg_ant'                                     ,    
'leg_recent'                                   , 
'indirect_recent'                               ,  
'election_now'                                   ,
'Merchandise exports (current US$)'              ,
'precip'                                          ,
'defeat_recent'                                   ,
'prev_conflict'                                   ,
'exec_recent'                                     ,
'loss'                                            ,
'delayed'                                         ,
'change_recent']                                  

In [132]:
new_drops

['ref_recent',
 'ref_ant',
 'Party-Military',
 'Party-Personal-Military Hybrid',
 'Personal Dictatorship',
 'Provisional - Military',
 'Warlordism',
 'anticipation',
 'tenure_months',
 'militarycareer',
 'age',
 'Natural gas rents (% of GDP)',
 'Rural population (% of total population)',
 'Arable land (hectares per person)',
 'lead_recent',
 'exec_ant',
 'leg_ant',
 'leg_recent',
 'indirect_recent',
 'election_now',
 'Merchandise exports (current US$)',
 'precip',
 'defeat_recent',
 'prev_conflict',
 'exec_recent',
 'loss',
 'delayed',
 'change_recent']

In [133]:
joint_df_3x = joint_df_thinner.drop(['direct_recent', 'Merchandise imports (current US$)', 'Foreign direct investment, net inflows (% of GDP)', 'elected', 'Parliamentary Democracy', 'Primary education, duration (years)'], axis =1).drop(new_drops, axis =1)

In [134]:
joint_df_3 = joint_df_3x.dropna()

In [135]:
joint_df_3['constant'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [137]:
joint_df_3.columns

Index(['year', 'male', 'irreg_lead_ant', 'election_recent', 'victory_recent',
       'nochange_recent', 'lastelection', 'irregular', 'yearcode',
       'Dominant Party', 'Foreign/Occupied', 'Indirect Military', 'Military',
       'Military-Personal', 'Monarchy', 'Oligarchy', 'Party-Personal',
       'Presidential Democracy', 'Provisional - Civilian', 'pt_attempt',
       'pt_suc', 'Life expectancy at birth, female (years)',
       'GDP growth (annual %)', 'Mineral rents (% of GDP)',
       'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Population growth (annual %)', 'constant'],
      dtype='object')

In [142]:
dfj2020 = pd.read_pickle('../data/updatedus2020.pkl')

In [144]:
dfj2020.shape

(1, 30)

In [148]:
y_dfj2020 = dfj2020['pt_attempt']
X_dfj2020 = dfj2020.drop(['pt_attempt','pt_suc'], axis = 1)

In [150]:
X.columns['irregular'] = 

Index(['year', 'male', 'irreg_lead_ant', 'election_recent', 'victory_recent',
       'nochange_recent', 'lastelection', 'irregular', 'yearcode',
       'Dominant Party', 'Foreign/Occupied', 'Indirect Military', 'Military',
       'Military-Personal', 'Monarchy', 'Oligarchy', 'Party-Personal',
       'Presidential Democracy', 'Provisional - Civilian',
       'Life expectancy at birth, female (years)', 'GDP growth (annual %)',
       'Mineral rents (% of GDP)', 'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Population growth (annual %)', 'constant'],
      dtype='object')

In [149]:
X_dfj2020.columns

Index(['year', 'victory_recent', 'defeat_recent', 'change_recent',
       'nochange_recent', 'delayed', 'lastelection', 'loss', 'irregular',
       'prev_conflict', 'Life expectancy at birth, female (years)',
       'GDP growth (annual %)', 'Mineral rents (% of GDP)',
       'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Population growth (annual %)', 'Dominant Party', 'Foreign/Occupied',
       'Indirect Military', 'Military', 'Military-Personal', 'Monarchy',
       'Oligarchy', 'Party-Personal', 'Presidential Democracy',
       'Provisional - Civilian', 'Constant'],
      dtype='object')

In [152]:
joint_df_3_late = joint_df_3[joint_df_3['year'] > 1974]

In [145]:
y = joint_df_3['pt_attempt']
X = joint_df_3.drop(['pt_attempt','pt_suc'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)
ridge_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='elasticnet', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,
        l1_ratios = [0, .3, .5, .7, 1])
X_up, y_up = upsampler(X_train, y_train, ratio = 1)
logl1pipe = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled)])
logl1pipe.fit(X_up, y_up)
metric_test(logl1pipe, X_test, y_test)
get_feature_weights(ridge_scaled, X.columns)



accuracy = 0.7874608150470219
recall = 0.8163265306122449
precision = 0.10810810810810811
f1 score = 0.1909307875894988


irregular                                      -0.579705
GDP growth (annual %)                          -0.493169
Life expectancy at birth, female (years)       -0.335230
election_recent                                -0.296250
Monarchy                                       -0.253516
year                                           -0.252644
Trade (% of GDP)                               -0.180402
Dominant Party                                 -0.162729
Oil rents (% of GDP)                           -0.145236
Oligarchy                                      -0.105810
Foreign/Occupied                               -0.086063
Party-Personal                                 -0.069032
male                                           -0.023164
Presidential Democracy                          0.000000
constant                                        0.000000
yearcode                                        0.000000
nochange_recent                                 0.000000
victory_recent                 

In [153]:
joint_df_3_late = joint_df_3[joint_df_3['year'] > 1974]
y = joint_df_3_late['pt_attempt']
X = joint_df_3_late.drop(['pt_attempt','pt_suc'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)
ridge_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='elasticnet', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,
        l1_ratios = [0, .3, .5, .7, 1])
X_up, y_up = upsampler(X_train, y_train, ratio = 1)
logl1pipe = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled)])
logl1pipe.fit(X_up, y_up)
metric_test(logl1pipe, X_test, y_test)
get_feature_weights(ridge_scaled, X.columns)



accuracy = 0.8017241379310345
recall = 0.7380952380952381
precision = 0.09717868338557993
f1 score = 0.17174515235457063




Monarchy                                       -1.536911
GDP growth (annual %)                          -0.838422
Foreign/Occupied                               -0.751012
irregular                                      -0.626264
Trade (% of GDP)                               -0.484362
Oligarchy                                      -0.433656
year                                           -0.229589
Party-Personal                                 -0.221986
male                                           -0.221208
Life expectancy at birth, female (years)       -0.195252
Oil rents (% of GDP)                           -0.172790
election_recent                                -0.141707
Dominant Party                                 -0.089693
yearcode                                       -0.037300
constant                                        0.000000
nochange_recent                                 0.000439
victory_recent                                  0.007712
Presidential Democracy         

In [169]:
logl1pipe.predict_proba(X)

array([[7.99240574e-01, 2.00759426e-01],
       [8.72600356e-01, 1.27399644e-01],
       [9.69160158e-01, 3.08398421e-02],
       ...,
       [9.99951352e-01, 4.86475240e-05],
       [9.99946368e-01, 5.36324369e-05],
       [9.99930544e-01, 6.94558047e-05]])

In [174]:
X_dfj2020.columns

Index(['year', 'victory_recent', 'defeat_recent', 'change_recent',
       'nochange_recent', 'delayed', 'lastelection', 'loss', 'irregular',
       'prev_conflict', 'Life expectancy at birth, female (years)',
       'GDP growth (annual %)', 'Mineral rents (% of GDP)',
       'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Population growth (annual %)', 'Dominant Party', 'Foreign/Occupied',
       'Indirect Military', 'Military', 'Military-Personal', 'Monarchy',
       'Oligarchy', 'Party-Personal', 'Presidential Democracy',
       'Provisional - Civilian', 'Constant'],
      dtype='object')

In [177]:
X

Unnamed: 0,year,male,irreg_lead_ant,election_recent,victory_recent,nochange_recent,lastelection,irregular,yearcode,Dominant Party,Foreign/Occupied,Indirect Military,Military,Military-Personal,Monarchy,Oligarchy,Party-Personal,Presidential Democracy,Provisional - Civilian,"Life expectancy at birth, female (years)",GDP growth (annual %),Mineral rents (% of GDP),Oil rents (% of GDP),Trade (% of GDP),Population ages 0-14 (% of total population),Population growth (annual %),constant
28,1975.0,1,0.0,0.0,0.0,0.0,3.295837,7.710205,21975.0,0,0,0,0,0,0,0,0,1,0,76.600,-0.205464,0.191036,1.384686,15.516374,25.005105,0.985986,1
29,1976.0,1,0.0,0.0,0.0,0.0,3.663562,7.715570,21976.0,0,0,0,0,0,0,0,0,1,0,76.800,5.388139,0.116460,1.364022,16.048846,24.546818,0.950220,1
30,1977.0,1,0.0,1.0,0.0,0.0,1.098612,7.720905,21977.0,0,0,0,0,0,0,0,0,1,0,77.200,4.624159,0.152244,1.440894,16.417895,24.050240,1.005772,1
31,1977.0,1,0.0,1.0,0.0,0.0,1.098612,7.720905,21977.0,0,0,0,0,0,0,0,0,1,0,77.200,4.624159,0.152244,1.440894,16.417895,24.050240,1.005772,1
32,1978.0,1,0.0,0.0,0.0,0.0,2.708050,7.726213,21978.0,0,0,0,0,0,0,0,0,1,0,77.300,5.535303,0.064704,1.392541,16.972834,23.545173,1.059573,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11117,2013.0,1,0.0,0.0,0.0,0.0,6.240276,6.240276,9552013.0,0,0,0,0,0,1,0,0,0,0,72.282,-3.120604,0.000000,0.000000,82.932608,36.850483,-0.947661,1
11118,2014.0,1,0.0,0.0,0.0,0.0,6.263398,6.263398,9552014.0,0,0,0,0,0,1,0,0,0,0,72.363,2.072607,0.000000,0.000000,74.240186,36.635388,-0.729801,1
11119,2015.0,1,0.0,0.0,0.0,0.0,6.285998,6.285998,9552015.0,0,0,0,0,0,1,0,0,0,0,72.449,3.711847,0.000000,0.000000,81.744735,36.406664,-0.244786,1
11120,2016.0,1,0.0,0.0,0.0,0.0,6.308098,6.308098,9552016.0,0,0,0,0,0,1,0,0,0,0,72.544,3.379474,0.000000,0.000000,94.783586,36.130640,0.348664,1


In [184]:
us2020dict = {'year': [2020], 
'male': [1], 
'irreg_lead_ant': [0],
'election_recent': [0], 
'victory_recent': [0],
'nochange_recent': [0], 
'lastelection': [3.78419],
'irregular': [7.928766],
'yearcode': [22020], 
'Dominant Party': [0],
'Foreign/Occupied': [0], 
'Indirect Military': [0], 
'Military': [0], 
'Military-Personal': [0],
'Monarchy':[0], 
'Oligarchy': [0],
'Party-Personal': [0],
'Presidential Democracy': [1],
'Provisional - Civilian': [0], 
'Life expectancy at birth, female (years)': [81.1], 
'GDP growth (annual %)': [-5.0], 
'Mineral rents (% of GDP)': [0.075769],
'Oil rents (% of GDP)': [0.09245], 
'Trade (% of GDP)': [27.543903],
'Population ages 0-14 (% of total population)': [18.70904], 
'Population growth (annual %)': [0.522337],
'constant': [1]}

In [185]:
usj2020 = pd.DataFrame.from_dict(us2020dict)

In [186]:
logl1pipe.predict_proba(usj2020)

array([[0.89591693, 0.10408307]])

In [146]:
X.shape

(6377, 27)

In [99]:
y = joint_df_3['pt_attempt']
X = joint_df_3.drop(['pt_attempt','pt_suc'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)


In [102]:
clf = RandomForestClassifier(max_depth=5, n_estimators=1000)
rfpipe = Pipeline([('scaler', StandardScaler()),('rf', clf)])
X_up, y_up = upsampler(X_train, y_train, ratio = 1)
rfpipe.fit(X_up, y_up)
metric_test(rfpipe, X_test, y_test)
#get_feature_weights(rfpipe, X.columns)

accuracy = 0.799373040752351
recall = 0.6938775510204082
precision = 0.10029498525073746
f1 score = 0.1752577319587629


In [110]:
X.columns

Index(['year', 'male', 'irreg_lead_ant', 'election_recent', 'victory_recent',
       'nochange_recent', 'lastelection', 'irregular', 'yearcode',
       'Dominant Party', 'Foreign/Occupied', 'Indirect Military', 'Military',
       'Military-Personal', 'Monarchy', 'Oligarchy', 'Party-Personal',
       'Presidential Democracy', 'Provisional - Civilian',
       'Life expectancy at birth, female (years)', 'GDP growth (annual %)',
       'Mineral rents (% of GDP)', 'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Population growth (annual %)', 'constant'],
      dtype='object')

In [157]:
logl1pipe

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge_scaled',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0,
                                      l1_ratios=[0, 0.3, 0.5, 0.7, 1],
                                      max_iter=100, multi_class='warn',
                                      n_jobs=2, penalty='elasticnet',
                                      random_state=None, refit=True,
                                      scoring='recall', solver='saga',
                                      tol=0.0001, verbose=0))],
         verbose=False)