In [216]:
import numpy as np
import pandas as pd
from inv_dict import wb_cow_dict

import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import OLS
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [217]:
RandomState = 40

In [218]:
def get_cc(val):
    if val in wb_cow_dict:
        return wb_cow_dict[val]
    else:
        return 0
    
def get_year(val):
    return int(val)

def add_wd_rows(reign_df, wdi_df, variable_list):
    joint_df = reign_df.copy()
    yearlist = [str(i) for i in np.arange(1960, 2020)]
    for i in variable_list:
        df = wdi_df[wdi_df['Indicator Name'] == i]
        dfx = pd.melt(df, id_vars = ['Country Name'], value_vars=yearlist)
        dfx['ccode'] = dfx['Country Name'].apply(get_cc)
        dfx['year'] = dfx['variable'].apply(get_year)
        dfx['yearcode'] = (dfx['year']) + 10000*dfx['ccode']
        dfx[i] = dfx['value']
        dfx_limited = dfx[[i, 'yearcode']]
        joint_df = joint_df.join(dfx_limited.set_index('yearcode'), on='yearcode', how = 'inner')
    return joint_df

In [219]:
def upsampler(X_train, y_train, target = 'pt_attempt', ratio = 1.0):
    '''
    Args: X_train and y_train
    Optional: what is the target
    Returns: y_train, and X_train with the target rows sampled with replacement to equal 
    the number of non-target rows (makes X_train much bigger)
    '''
    y_train = pd.Series(y_train)
    
    X = pd.concat([X_train, y_train], axis=1) 
    no_coup = X[X[target]==0]
    coup = X[X[target]==1]
    coups_upsampled = resample(coup,
                          replace=True, # sample with replacement
                          n_samples=int(len(no_coup)*ratio), # match number in majority class
                          random_state=29)
    upsampled = pd.concat([no_coup, coups_upsampled])
    y_up = upsampled[target]
    X_up = upsampled.drop(target, axis = 1)
    return X_up, y_up

def metric_test(model, X_test, y_test):
    '''
    Prints out the accuracy, recall, precision, and f1 score for the 
    fit model when it predicts on the test data
    '''
    preds = model.predict(X_test)
    print('accuracy = ' + str(accuracy_score(y_test, preds)))
    print('recall = ' + str(recall_score(y_test, preds)))
    print('precision = ' + str(precision_score(y_test, preds)))
    print('f1 score = ' + str(f1_score(y_test, preds)))
    
def get_feature_weights(model, feature_labels):
    '''
    returns coefficients for features in a model (intended for logistic regression) 
    args: model, feature_labels
    returns: a sorted series in ascending order of feature weights.
    '''
    d_log_vals = {}
    for idx, feat in enumerate(model.coef_[0]):
        d_log_vals[feature_labels[idx]] = feat  
    s_log_vals = (pd.Series(d_log_vals)).sort_values()
    return s_log_vals

In [220]:
variable_list = ['Life expectancy at birth, female (years)', 'GDP growth (annual %)', 'Mineral rents (% of GDP)', 'Oil rents (% of GDP)', 'Trade (% of GDP)', 'Foreign direct investment, net inflows (% of GDP)', 'Natural gas rents (% of GDP)', 'Population ages 0-14 (% of total population)', 'Rural population (% of total population)',  'Population growth (annual %)', 'Arable land (hectares per person)',
 'Merchandise exports (current US$)',
 'Merchandise imports (current US$)',
 'Primary education, duration (years)']

In [221]:
wdi_df = pd.read_pickle('../data/wdi_complete.pkl')

In [222]:
reign_df = pd.read_pickle('../data/year_agg.pkl')
dummies = pd.get_dummies(reign_df['government'])
df_dumb = reign_df.join(dummies)
df_dumb['pt_attempt'] = df_dumb['coupyear']
df_dumb['pt_suc'] = df_dumb['coupsuc']
df = df_dumb.drop(['ccode', 'country', 'leader', 'month', 'government', 'coupyear', 'coupsuc'], axis = 1)

In [223]:
joint_df = add_wd_rows(df, wdi_df, variable_list)

In [225]:
joint_df.shape

(9954, 62)

In [226]:
joint_df_thinner.shape

(9954, 62)

In [227]:
joint_df_x = joint_df.dropna()

In [228]:
joint_df_x.shape

(5939, 62)

In [229]:
y = joint_df_x ['pt_attempt']
X = joint_df_x .drop(['pt_attempt','pt_suc', 'irregular'], axis = 1)

In [230]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)

In [234]:
ridge_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='l1', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.001,
        max_iter=200,)

In [235]:
X_up, y_up = upsampler(X_train, y_train, ratio = 1)

In [178]:
joint_df_2x = joint_df_thinner.drop(['direct_recent',
                                     'Merchandise imports (current US$)', 
                                     'Foreign direct investment, net inflows (% of GDP)', 
                                     'elected', 'Presidential Democracy'], axis =1)

In [179]:
joint_df_2 = joint_df_2x.dropna()

In [238]:
y = joint_df_2['pt_attempt']
X = joint_df_2.drop(['pt_attempt','pt_suc', 'irregular', 'Primary education, duration (years)'], axis = 1)
X['constant'] = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)
ridge = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='l1', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.001,
        max_iter=200,)
X_up, y_up = upsampler(X_train, y_train, ratio = 1)
ridgepipe = Pipeline([('scaler', StandardScaler()),('ridge', ridge)])
ridgepipe.fit(X_up, y_up)
metric_test(ridgepipe, X_test, y_test)
get_feature_weights(ridge, X.columns)

accuracy = 0.7782925215089345
recall = 0.8478260869565217
precision = 0.10626702997275204
f1 score = 0.18886198547215496


GDP growth (annual %)                          -0.753216
election_recent                                -0.668782
Monarchy                                       -0.444883
Life expectancy at birth, female (years)       -0.428184
Parliamentary Democracy                        -0.340441
Dominant Party                                 -0.315062
Foreign/Occupied                               -0.281219
year                                           -0.280108
male                                           -0.263977
Oligarchy                                      -0.208242
Merchandise exports (current US$)              -0.202342
Trade (% of GDP)                               -0.187121
loss                                           -0.151844
indirect_recent                                -0.124510
Warlordism                                     -0.119562
precip                                         -0.113439
election_now                                   -0.109256
Oil rents (% of GDP)           

In [239]:
revised_drops = ['age', 'tenure_months', 'Personal Dictatorship', 'exec_ant', 'leg_recent', 'nochange_recent', 'lastelection', 'lead_recent', 'victory_recent', 'ref_recent', 'irregular', 'Primary education, duration (years)', 'direct_recent',
                                     'Merchandise imports (current US$)', 
                                     'Foreign direct investment, net inflows (% of GDP)', 
                                     'elected', 'Presidential Democracy', 'yearcode']

In [240]:
revised_drops

['age',
 'tenure_months',
 'Personal Dictatorship',
 'exec_ant',
 'leg_recent',
 'nochange_recent',
 'lastelection',
 'lead_recent',
 'victory_recent',
 'ref_recent',
 'irregular',
 'Primary education, duration (years)',
 'direct_recent',
 'Merchandise imports (current US$)',
 'Foreign direct investment, net inflows (% of GDP)',
 'elected',
 'Presidential Democracy',
 'yearcode']

In [241]:
joint_df_3x = joint_df_thinner.drop(revised_drops, axis =1)

In [242]:
joint_df_3 = joint_df_3x.dropna()

In [243]:
joint_df_3['constant'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [244]:
joint_df_3.columns

Index(['year', 'male', 'militarycareer', 'anticipation', 'ref_ant', 'leg_ant',
       'irreg_lead_ant', 'election_now', 'election_recent', 'exec_recent',
       'indirect_recent', 'defeat_recent', 'change_recent', 'delayed', 'loss',
       'prev_conflict', 'precip', 'Dominant Party', 'Foreign/Occupied',
       'Indirect Military', 'Military', 'Military-Personal', 'Monarchy',
       'Oligarchy', 'Parliamentary Democracy', 'Party-Military',
       'Party-Personal', 'Party-Personal-Military Hybrid',
       'Provisional - Civilian', 'Provisional - Military', 'Warlordism',
       'pt_attempt', 'pt_suc', 'Life expectancy at birth, female (years)',
       'GDP growth (annual %)', 'Mineral rents (% of GDP)',
       'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Natural gas rents (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Rural population (% of total population)',
       'Population growth (annual %)', 'Arable land (hectares per person)',
       'Merchandis

In [249]:
y = joint_df_3['pt_attempt']
X = joint_df_3.drop(['pt_attempt','pt_suc'], axis = 1)
X['constant'] = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)
elastic_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='elasticnet', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.001,
        max_iter=200,
        l1_ratios = [0, .3, .5, .7, 1])
X_up, y_up = upsampler(X_train, y_train, ratio = 1)
elasticpipe = Pipeline([('scaler', StandardScaler()),('elastic_scaled', elastic_scaled)])
elasticpipe.fit(X_up, y_up)
metric_test(elasticpipe, X_test, y_test)
weights = get_feature_weights(elastic_scaled, X.columns)

accuracy = 0.9696169088507266
recall = 0.0
precision = 0.0
f1 score = 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [188]:
US_dict = {}
for val in X.columns:
    US_dict[val] = 0

In [189]:
joint_df_thinner[joint_df_thinner['yearcode'] == 22017]

Unnamed: 0,year,elected,age,male,militarycareer,tenure_months,anticipation,ref_ant,leg_ant,exec_ant,irreg_lead_ant,election_now,election_recent,leg_recent,exec_recent,lead_recent,ref_recent,direct_recent,indirect_recent,victory_recent,defeat_recent,change_recent,nochange_recent,delayed,lastelection,loss,irregular,prev_conflict,precip,yearcode,Dominant Party,Foreign/Occupied,Indirect Military,Military,Military-Personal,Monarchy,Oligarchy,Parliamentary Democracy,Party-Military,Party-Personal,Party-Personal-Military Hybrid,Personal Dictatorship,Presidential Democracy,Provisional - Civilian,Provisional - Military,Warlordism,pt_attempt,pt_suc,"Life expectancy at birth, female (years)",GDP growth (annual %),Mineral rents (% of GDP),Oil rents (% of GDP),Trade (% of GDP),"Foreign direct investment, net inflows (% of GDP)",Natural gas rents (% of GDP),Population ages 0-14 (% of total population),Rural population (% of total population),Population growth (annual %),Arable land (hectares per person),Merchandise exports (current US$),Merchandise imports (current US$),"Primary education, duration (years)"
76,2017.0,1.0,56.0,1,0.0,97.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.098612,1.098612,7.913887,1.0,0.447575,22017.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False,81.1,2.21701,0.08085,0.177276,27.14232,1.820076,0.0,18.858528,17.942,0.631008,,1546273000000.0,2408476000000.0,6.0
77,2017.0,1.0,71.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.098612,1.098612,7.913887,1.0,0.447575,22017.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False,81.1,2.21701,0.08085,0.177276,27.14232,1.820076,0.0,18.858528,17.942,0.631008,,1546273000000.0,2408476000000.0,6.0


In [193]:
US_dict['year'] = 2020
US_dict['male']= 1
US_dict['lastelection']= 3.78419
US_dict['Presidential Democracy']= 1
US_dict['Life expectancy at birth, female (years)']= 81.1
US_dict['GDP growth (annual %)']= -5.0
US_dict['Mineral rents (% of GDP)']= 0.075769
US_dict['Oil rents (% of GDP)']= 0.09245,
US_dict['Trade (% of GDP)']= 27.543903
US_dict['Population ages 0-14 (% of total population)']= 18.70904
US_dict['Population growth (annual %)']= 0.522337
US_dict['constant'] = 1

    

In [196]:
US_dict

{'year': 2020,
 'male': 1,
 'militarycareer': 0,
 'anticipation': 0,
 'ref_ant': 0,
 'leg_ant': 0,
 'irreg_lead_ant': 0,
 'election_now': 0,
 'election_recent': 0,
 'exec_recent': 0,
 'indirect_recent': 0,
 'defeat_recent': 0,
 'change_recent': 0,
 'delayed': 0,
 'loss': 0,
 'prev_conflict': 0,
 'precip': 0,
 'Dominant Party': 0,
 'Foreign/Occupied': 0,
 'Indirect Military': 0,
 'Military': 0,
 'Military-Personal': 0,
 'Monarchy': 0,
 'Oligarchy': 0,
 'Parliamentary Democracy': 0,
 'Party-Military': 0,
 'Party-Personal': 0,
 'Party-Personal-Military Hybrid': 0,
 'Provisional - Civilian': 0,
 'Provisional - Military': 0,
 'Warlordism': 0,
 'Life expectancy at birth, female (years)': 81.1,
 'GDP growth (annual %)': -5.0,
 'Mineral rents (% of GDP)': 0.075769,
 'Oil rents (% of GDP)': (0.09245,),
 'Trade (% of GDP)': 27.543903,
 'Natural gas rents (% of GDP)': 0,
 'Population ages 0-14 (% of total population)': 18.70904,
 'Rural population (% of total population)': 0,
 'Population growth 

In [36]:
y_dfj2020 = dfj2020['pt_attempt']
X_dfj2020 = dfj2020.drop(['pt_attempt','pt_suc'], axis = 1)

In [38]:
X_dfj2020.columns

Index(['year', 'victory_recent', 'defeat_recent', 'change_recent',
       'nochange_recent', 'delayed', 'lastelection', 'loss', 'irregular',
       'prev_conflict', 'Life expectancy at birth, female (years)',
       'GDP growth (annual %)', 'Mineral rents (% of GDP)',
       'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Population growth (annual %)', 'Dominant Party', 'Foreign/Occupied',
       'Indirect Military', 'Military', 'Military-Personal', 'Monarchy',
       'Oligarchy', 'Party-Personal', 'Presidential Democracy',
       'Provisional - Civilian', 'Constant'],
      dtype='object')

In [139]:
usj2020 = pd.DataFrame.from_dict(us2020dict)

In [141]:
elasticpipe.predict_proba(usj2020)

ValueError: operands could not be broadcast together with shapes (1,27) (43,) (1,27) 

In [137]:
us2020dict

{'year': [2020],
 'male': [1],
 'irreg_lead_ant': [0],
 'election_recent': [0],
 'victory_recent': [0],
 'nochange_recent': [0],
 'lastelection': [3.78419],
 'irregular': [7.928766],
 'yearcode': [22020],
 'Dominant Party': [0],
 'Foreign/Occupied': [0],
 'Indirect Military': [0],
 'Military': [0],
 'Military-Personal': [0],
 'Monarchy': [0],
 'Oligarchy': [0],
 'Party-Personal': [0],
 'Presidential Democracy': [1],
 'Provisional - Civilian': [0],
 'Life expectancy at birth, female (years)': [81.1],
 'GDP growth (annual %)': [-5.0],
 'Mineral rents (% of GDP)': [0.075769],
 'Oil rents (% of GDP)': [0.09245],
 'Trade (% of GDP)': [27.543903],
 'Population ages 0-14 (% of total population)': [18.70904],
 'Population growth (annual %)': [0.522337],
 'constant': [1]}

In [54]:
X[X['yearcode'] == 22017]

Unnamed: 0,year,male,irreg_lead_ant,election_recent,victory_recent,nochange_recent,lastelection,irregular,yearcode,Dominant Party,Foreign/Occupied,Indirect Military,Military,Military-Personal,Monarchy,Oligarchy,Party-Personal,Presidential Democracy,Provisional - Civilian,"Life expectancy at birth, female (years)",GDP growth (annual %),Mineral rents (% of GDP),Oil rents (% of GDP),Trade (% of GDP),Population ages 0-14 (% of total population),Population growth (annual %),constant
76,2017.0,1,0.0,1.0,0.0,0.0,1.098612,7.913887,22017.0,0,0,0,0,0,0,0,0,1,0,81.1,2.21701,0.08085,0.177276,27.14232,18.858528,0.631008,1
77,2017.0,1,0.0,1.0,0.0,0.0,1.098612,7.913887,22017.0,0,0,0,0,0,0,0,0,1,0,81.1,2.21701,0.08085,0.177276,27.14232,18.858528,0.631008,1


In [55]:
us2017dict = {'year': [2017], 
'male': [1], 
'irreg_lead_ant': [0],
'election_recent': [0], 
'victory_recent': [0],
'nochange_recent': [0], 
'lastelection': [1.098612],
'irregular': [7.913887],
'yearcode': [22017], 
'Dominant Party': [0],
'Foreign/Occupied': [0], 
'Indirect Military': [0], 
'Military': [0], 
'Military-Personal': [0],
'Monarchy':[0], 
'Oligarchy': [0],
'Party-Personal': [0],
'Presidential Democracy': [1],
'Provisional - Civilian': [0], 
'Life expectancy at birth, female (years)': [81.1], 
'GDP growth (annual %)': [2.21701], 
'Mineral rents (% of GDP)': [0.08085],
'Oil rents (% of GDP)': [0.177276], 
'Trade (% of GDP)': [27.14232],
'Population ages 0-14 (% of total population)': [18.858528], 
'Population growth (annual %)': [0.631008],
'constant': [1]}

In [45]:
usj2020 = pd.DataFrame.from_dict(us2020dict)

In [56]:
usj2017 = pd.DataFrame.from_dict(us2017dict)

In [46]:
logl1pipe.predict_proba(usj2020)

array([[0.89591695, 0.10408305]])

In [57]:
logl1pipe.predict_proba(usj2017)

array([[0.98261326, 0.01738674]])

In [62]:
ushypodict = {'year': [2017], 
'male': [1], 
'irreg_lead_ant': [0],
'election_recent': [0], 
'victory_recent': [0],
'nochange_recent': [0], 
'lastelection': [1.098612],
'irregular': [7.913887],
'yearcode': [22017], 
'Dominant Party': [0],
'Foreign/Occupied': [0], 
'Indirect Military': [0], 
'Military': [0], 
'Military-Personal': [0],
'Monarchy':[0], 
'Oligarchy': [0],
'Party-Personal': [0],
'Presidential Democracy': [1],
'Provisional - Civilian': [0], 
'Life expectancy at birth, female (years)': [61.1], 
'GDP growth (annual %)': [-10], 
'Mineral rents (% of GDP)': [0.08085],
'Oil rents (% of GDP)': [0.177276], 
'Trade (% of GDP)': [7.14232],
'Population ages 0-14 (% of total population)': [18.858528], 
'Population growth (annual %)': [0.631008],
'constant': [1]}

In [63]:
hypo = pd.DataFrame.from_dict(ushypodict)

In [64]:
logl1pipe.predict_proba(hypo)

array([[0.88841336, 0.11158664]])

In [47]:
X.shape

(6031, 27)

In [48]:
y = joint_df_3['pt_attempt']
X = joint_df_3.drop(['pt_attempt','pt_suc'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)


In [49]:
clf = RandomForestClassifier(max_depth=5, n_estimators=1000)
rfpipe = Pipeline([('scaler', StandardScaler()),('rf', clf)])
X_up, y_up = upsampler(X_train, y_train, ratio = 1)
rfpipe.fit(X_up, y_up)
metric_test(rfpipe, X_test, y_test)
#get_feature_weights(rfpipe, X.columns)

accuracy = 0.7974921630094044
recall = 0.673469387755102
precision = 0.09705882352941177
f1 score = 0.16966580976863754


In [110]:
X.columns

Index(['year', 'male', 'irreg_lead_ant', 'election_recent', 'victory_recent',
       'nochange_recent', 'lastelection', 'irregular', 'yearcode',
       'Dominant Party', 'Foreign/Occupied', 'Indirect Military', 'Military',
       'Military-Personal', 'Monarchy', 'Oligarchy', 'Party-Personal',
       'Presidential Democracy', 'Provisional - Civilian',
       'Life expectancy at birth, female (years)', 'GDP growth (annual %)',
       'Mineral rents (% of GDP)', 'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Population growth (annual %)', 'constant'],
      dtype='object')

In [157]:
logl1pipe

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('ridge_scaled',
                 LogisticRegressionCV(Cs=10, class_weight=None, cv=5,
                                      dual=False, fit_intercept=True,
                                      intercept_scaling=1.0,
                                      l1_ratios=[0, 0.3, 0.5, 0.7, 1],
                                      max_iter=100, multi_class='warn',
                                      n_jobs=2, penalty='elasticnet',
                                      random_state=None, refit=True,
                                      scoring='recall', solver='saga',
                                      tol=0.0001, verbose=0))],
         verbose=False)

In [65]:
y = joint_df_3['pt_attempt']
X = joint_df_3.drop(['pt_attempt','pt_suc', 'irregular'], axis = 1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)
ridge_scaled = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='elasticnet', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,
        l1_ratios = [0, .3, .5, .7, 1])
X_up, y_up = upsampler(X_train, y_train, ratio = 1)
logl1pipe = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled)])
logl1pipe.fit(X_up, y_up)
metric_test(logl1pipe, X_test, y_test)
get_feature_weights(ridge_scaled, X.columns)



accuracy = 0.7818181818181819
recall = 0.7551020408163265
precision = 0.09919571045576407
f1 score = 0.17535545023696683


GDP growth (annual %)                          -0.510773
Life expectancy at birth, female (years)       -0.466309
year                                           -0.286119
Monarchy                                       -0.251277
election_recent                                -0.251069
Trade (% of GDP)                               -0.238177
Dominant Party                                 -0.200396
Oligarchy                                      -0.114081
Oil rents (% of GDP)                           -0.106620
Foreign/Occupied                               -0.076581
male                                           -0.059179
Party-Personal                                 -0.057282
Presidential Democracy                          0.000000
constant                                        0.000000
yearcode                                        0.000000
nochange_recent                                 0.000000
victory_recent                                  0.000000
lastelection                   

In [None]:
clf = LogisticRegressionCV(
        cv=5, dual=False,
        penalty='elasticnet', 
        scoring='recall',
        solver='saga', 
        n_jobs = 2,
        tol=0.0001,
        max_iter=100,
        l1_ratios = [0, .3, .5, .7, 1])
logl1pipe = Pipeline([('scaler', StandardScaler()),('ridge_scaled', ridge_scaled)])
logl1pipe.fit(X_up, y_up)
metric_test(logl1pipe, X_test, y_test)
get_feature_weights(ridge_scaled, X.columns)

In [66]:
ushypodict = {'year': [2017], 
'male': [1], 
'irreg_lead_ant': [0],
'election_recent': [0], 
'victory_recent': [0],
'nochange_recent': [0], 
'lastelection': [1.098612],
'yearcode': [22017], 
'Dominant Party': [0],
'Foreign/Occupied': [0], 
'Indirect Military': [0], 
'Military': [0], 
'Military-Personal': [0],
'Monarchy':[0], 
'Oligarchy': [0],
'Party-Personal': [0],
'Presidential Democracy': [1],
'Provisional - Civilian': [0], 
'Life expectancy at birth, female (years)': [61.1], 
'GDP growth (annual %)': [-10], 
'Mineral rents (% of GDP)': [0.08085],
'Oil rents (% of GDP)': [0.177276], 
'Trade (% of GDP)': [7.14232],
'Population ages 0-14 (% of total population)': [18.858528], 
'Population growth (annual %)': [0.631008],
'constant': [1]}

In [67]:
hypo = pd.DataFrame.from_dict(ushypodict)
logl1pipe.predict_proba(hypo)

array([[0.47998802, 0.52001198]])

In [71]:
us2017dict = {'year': [2017], 
'male': [1], 
'irreg_lead_ant': [0],
'election_recent': [0], 
'victory_recent': [0],
'nochange_recent': [0], 
'lastelection': [1.098612],
'yearcode': [22017], 
'Dominant Party': [0],
'Foreign/Occupied': [0], 
'Indirect Military': [0], 
'Military': [0], 
'Military-Personal': [0],
'Monarchy':[0], 
'Oligarchy': [0],
'Party-Personal': [0],
'Presidential Democracy': [1],
'Provisional - Civilian': [0], 
'Life expectancy at birth, female (years)': [81.1], 
'GDP growth (annual %)': [2.21701], 
'Mineral rents (% of GDP)': [0.08085],
'Oil rents (% of GDP)': [0.177276], 
'Trade (% of GDP)': [27.14232],
'Population ages 0-14 (% of total population)': [18.858528], 
'Population growth (annual %)': [0.631008],
'constant': [1]}
us2017 = pd.DataFrame.from_dict(us2017dict)

logl1pipe.predict_proba(us2017)

array([[0.85641474, 0.14358526]])

In [73]:
us2020dict = {'year': [2020], 
'male': [1], 
'irreg_lead_ant': [0],
'election_recent': [0], 
'victory_recent': [0],
'nochange_recent': [0], 
'lastelection': [3.78419],
'yearcode': [22020], 
'Dominant Party': [0],
'Foreign/Occupied': [0], 
'Indirect Military': [0], 
'Military': [0], 
'Military-Personal': [0],
'Monarchy':[0], 
'Oligarchy': [0],
'Party-Personal': [0],
'Presidential Democracy': [1],
'Provisional - Civilian': [0], 
'Life expectancy at birth, female (years)': [81.1], 
'GDP growth (annual %)': [-5.0], 
'Mineral rents (% of GDP)': [0.075769],
'Oil rents (% of GDP)': [0.09245], 
'Trade (% of GDP)': [27.543903],
'Population ages 0-14 (% of total population)': [18.70904], 
'Population growth (annual %)': [0.522337],
'constant': [1]}
usj2020 = pd.DataFrame.from_dict(us2020dict)

In [74]:
logl1pipe.predict_proba(usj2020)

array([[0.78104445, 0.21895555]])

In [152]:
vscode_cols = ['year', 'male', 'militarycareer', 'anticipation', 'ref_ant', 'leg_ant',
       'irreg_lead_ant', 'election_now', 'election_recent', 'exec_recent',
       'indirect_recent', 'defeat_recent', 'change_recent', 'delayed', 'loss',
       'prev_conflict', 'precip', 'yearcode', 'Dominant Party',
       'Foreign/Occupied', 'Indirect Military', 'Military',
       'Military-Personal', 'Monarchy', 'Oligarchy', 'Parliamentary Democracy',
       'Party-Military', 'Party-Personal', 'Party-Personal-Military Hybrid',
       'Provisional - Civilian', 'Provisional - Military', 'Warlordism',
       'pt_attempt', 'pt_suc', 'Life expectancy at birth, female (years)',
       'GDP growth (annual %)', 'Mineral rents (% of GDP)',
       'Oil rents (% of GDP)', 'Trade (% of GDP)',
       'Natural gas rents (% of GDP)',
       'Population ages 0-14 (% of total population)',
       'Rural population (% of total population)',
       'Population growth (annual %)', 'Arable land (hectares per person)',
       'Merchandise exports (current US$)', 'constant']

In [154]:
len(joint_df_3.columns)

45

In [155]:
len(vscode_cols)

46