In [51]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import OLS
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

In [40]:

def scaler(X_train, X_test, minmax = False):
    '''
    Arguments: X_train and X_test data
    Optional: minmax (scale between 0 and 1)

    Returns: X_train and X_test either standardized by demeaning, or scaled to 1
    '''
    if minmax == True:
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled

# def get_indices(df):
#     X_full_indices = df.drop(['pt_suc', 'pt_attempt'], axis =1 ).columns
#     return X_full_indices


def splitter(df, target = 'pt_attempt', test_size = .25, random_state = 29, VIF_drop = False, scaled = False, minmax = False):
    '''
    Arguments: The dataframe
    Optional Args: test size, random state, whether to drop a list (determined by VIF correlations)
    whether to scale, whether to use minmax scaling (between 0 and 1)

    Returns:
    X_train, X_test, y_train, y_test, and the feature labels for the columns in X
    '''
    
    _targets = ['pt_attempt', 'pt_suc']
    if VIF_drop == True:
        df = df.drop(vifdrops, axis = 1)
        y = df[target]
        X = df.drop(_targets, axis = 1)
    if VIF_drop == False:
        y = df[target]
        X = df.drop(_targets, axis = 1)
    colnames = X.columns
    idx = colnames.to_numpy()
    feature_labels = np.concatenate((['constant'], idx) )
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= test_size, random_state= random_state, stratify = y )
    if scaled == True:
        X_train, X_test = scaler(X_train, X_test, minmax = minmax)
    X_train = add_constant(X_train)    
    X_test =  add_constant(X_test)    
    return X_train, X_test, y_train, y_test, feature_labels

def upsampler(X_train, y_train, target = 'pt_attempt'):
    '''
    Args: X_train and y_train
    Optional: what is the target
    Returns: y_train, and X_train with the target rows sampled with replacement to equal 
    the number of non-target rows (makes X_train much bigger)
    '''
    X = pd.concat([X_train, y_train], axis=1) 
    no_coup = X[X[target]==0]
    coup = X[X[target]==1]
    coups_upsampled = resample(coup,
                          replace=True, # sample with replacement
                          n_samples=len(no_coup), # match number in majority class
                          random_state=29)
    upsampled = pd.concat([no_coup, coups_upsampled])
    y_up = upsampled[target]
    X_up = upsampled.drop(target, axis = 1)
    return X_up, y_up

def downsampler(X_train, y_train, target = 'pt_attempt'):
    '''
    Args: X_train and y_train
    Optional: what is the target
    Returns: y_train, and X_train with the non-target rows sampled with replacement to equal 
    the number of target rows (makes X_train much smaller)

    '''
    X = pd.concat([X_train, y_train], axis=1) 
    no_coup = X[X[target]==0]
    coup = X[X[target]==1]
    coups_downsampled = resample(no_coup,
                          replace=True, # sample with replacement
                          n_samples=len(coup), # match number in majority class
                          random_state=29)
    downsampled = pd.concat([coup, coups_downsampled])
    y_down = downsampled[target]
    X_down = downsampled.drop(target, axis = 1)
    return X_down, y_down

def smoter(X_train, y_train, ratio = 1.0):
    '''
    Args: X_train and y_train
    Optional: ratio
    Returns: y_train, and X_train with new target rows synthetically added to equal 
    the number of target rows (makes X_train much smaller) (or a different)
    '''
    sm = SMOTE(random_state=29, ratio=ratio)
    X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)
    return X_train_sm, y_train_sm



def data_pipeline(df, target = 'pt_attempt', test_size = .25, random_state = 29, VIF_drop = False, scaled = False, minmax = False, resampler = None, sample_ratio = 1):
    '''
    Processes the onehot encoded dataframe to prepare it for modelling, with optional arguments 
    to drop collinear columns, resample, and scale.

    Args: dataframe, 
    optional: target columns, ratio for test train split, random state,
    whether to drop the vif_list, whether to scale, whether to use minmax (only makes sense if scaled = True),
    whether to resample, and what ratio to resample at (only currenly implemented with SMOTE)
    R

    '''
    X_train, X_test, y_train, y_test, feature_labels = splitter(df, target = 'pt_attempt', test_size = .25, 
                                                random_state = 29, VIF_drop = VIF_drop, scaled = scaled, minmax = minmax)
    if resampler == 'upsample':
        X_train, y_train = upsampler(X_train, y_train)
    if resampler == 'downsample':
        X_train, y_train = downsampler(X_train, y_train)
    if resampler == 'smote':
        X_train, y_train = smoter(X_train, y_train, ratio = sample_ratio)
    return X_train, X_test, y_train, y_test, feature_labels

def get_feature_weights(model, feature_labels):
    '''
    returns coefficients for features in a model (intended for logistic regression) 
    args: model, feature_labels
    returns: a sorted series in ascending order of feature weights.
    '''
    d_log_vals = {}
    for idx, feat in enumerate(model.coef_[0]):
        d_log_vals[feature_labels[idx]] = feat  
    s_log_vals = (pd.Series(d_log_vals)).sort_values()
    return s_log_vals

def metric_test(model, X_test, y_test):
    '''
    Prints out the accuracy, recall, precision, and f1 score for the 
    fit model when it predicts on the test data
    '''
    preds = model.predict(X_test)
    print('accuracy = ' + str(accuracy_score(y_test, preds)))
    print('recall = ' + str(recall_score(y_test, preds)))
    print('precision = ' + str(precision_score(y_test, preds)))
    print('f1 score = ' + str(f1_score(y_test, preds)))
    #print('r2_score = ' + str(r2_score(y_test, preds)))


def fit_test_model(model, X_train, X_test, y_train, y_test, indices, do_metric_test = True, get_features = False):
    '''
    fits a model to the training data, with the option argument to print out the feature weights
    '''
    model.fit(X_train, y_train)
    if do_metric_test == True:
        metric_test(model, X_test, y_test)
    if get_features == True:
        features = get_feature_weights(model, indices)
        print(features)
    return model

def variance_inflation_factors(X):
    '''
    calculates VIF values for the X dataset, inteded to be used iteratively to reduce 
    collinearity by dropping values from X and rechecking the values
    '''
    # X = add_constant(X)
    vifs = pd.Series(
        [1 / (1. - OLS(X[col].values, 
                       X.loc[:, X.columns != col].values).fit().rsquared) 
         for col in X],
        index=X.columns,
        name='VIF'
    )
    return vifs.sort_values()

In [21]:
df = pd.read_pickle('../data/year_agg_dum.pkl')

In [22]:
df.columns

Index(['ccode', 'country', 'leader', 'year', 'month', 'elected', 'age', 'male',
       'militarycareer', 'tenure_months', 'government', 'anticipation',
       'ref_ant', 'leg_ant', 'exec_ant', 'irreg_lead_ant', 'election_now',
       'election_recent', 'leg_recent', 'exec_recent', 'lead_recent',
       'ref_recent', 'direct_recent', 'indirect_recent', 'victory_recent',
       'defeat_recent', 'change_recent', 'nochange_recent', 'delayed',
       'lastelection', 'loss', 'irregular', 'prev_conflict', 'precip',
       'yearcode', 'coupyear', 'coupsuc', 'Dominant Party', 'Foreign/Occupied',
       'Indirect Military', 'Military', 'Military-Personal', 'Monarchy',
       'Oligarchy', 'Party-Military', 'Party-Personal',
       'Party-Personal-Military Hybrid', 'Personal Dictatorship',
       'Presidential Democracy', 'Provisional - Civilian',
       'Provisional - Military', 'Warlordism'],
      dtype='object')

In [23]:
df['pt_attempt'] = df['coupyear']

In [24]:
df['pt_suc'] = df['coupsuc']

In [25]:
df = df.drop(['coupyear', 'coupsuc', 'ccode', 'country', 'leader', 'year', 'month', 'government'], axis =1)

In [26]:
df

Unnamed: 0,elected,age,male,militarycareer,tenure_months,anticipation,ref_ant,leg_ant,exec_ant,irreg_lead_ant,...,Party-Military,Party-Personal,Party-Personal-Military Hybrid,Personal Dictatorship,Presidential Democracy,Provisional - Civilian,Provisional - Military,Warlordism,pt_attempt,pt_suc
0,1.0,66.0,1,0.0,58.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,False,False
1,1.0,67.0,1,0.0,70.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,False,False
2,1.0,68.0,1,0.0,82.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,False,False
3,1.0,69.0,1,0.0,94.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,False,False
4,1.0,63.0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11364,1.0,71.0,1,0.0,206.0,1.0,0.0,1.0,0.0,0.0,...,0,0,0,0,0,0,0,0,False,False
11365,1.0,72.0,1,0.0,218.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,False,False
11366,1.0,73.0,1,0.0,230.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,False,False
11367,1.0,74.0,1,0.0,242.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,False,False


In [27]:
df.columns

Index(['elected', 'age', 'male', 'militarycareer', 'tenure_months',
       'anticipation', 'ref_ant', 'leg_ant', 'exec_ant', 'irreg_lead_ant',
       'election_now', 'election_recent', 'leg_recent', 'exec_recent',
       'lead_recent', 'ref_recent', 'direct_recent', 'indirect_recent',
       'victory_recent', 'defeat_recent', 'change_recent', 'nochange_recent',
       'delayed', 'lastelection', 'loss', 'irregular', 'prev_conflict',
       'precip', 'yearcode', 'Dominant Party', 'Foreign/Occupied',
       'Indirect Military', 'Military', 'Military-Personal', 'Monarchy',
       'Oligarchy', 'Party-Military', 'Party-Personal',
       'Party-Personal-Military Hybrid', 'Personal Dictatorship',
       'Presidential Democracy', 'Provisional - Civilian',
       'Provisional - Military', 'Warlordism', 'pt_attempt', 'pt_suc'],
      dtype='object')

In [36]:
X_train, X_test, y_train, y_test, indices = data_pipeline(df, target = 'pt_attempt', test_size = .25, random_state = 30, VIF_drop = False, scaled = False, minmax = False, resampler = 'downsample', sample_ratio = 1)


  return ptp(axis=axis, out=out, **kwargs)


In [37]:
clf = RandomForestClassifier( n_estimators = 1000, max_depth = 3)

In [38]:
clf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=3, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [41]:
metric_test(clf, X_test, y_test)

accuracy = 0.6778051354203306
recall = 0.7289719626168224
precision = 0.08082901554404145
f1 score = 0.1455223880597015


In [47]:
scores = clf.predict_proba(X_test)[:,1]

In [48]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, scores)

In [49]:
fpr

array([0.00000000e+00, 3.65497076e-04, 1.82748538e-03, 1.82748538e-03,
       3.28947368e-03, 3.28947368e-03, 6.21345029e-03, 6.21345029e-03,
       9.13742690e-03, 9.13742690e-03, 1.05994152e-02, 1.05994152e-02,
       1.09649123e-02, 1.09649123e-02, 1.20614035e-02, 1.20614035e-02,
       1.27923977e-02, 1.27923977e-02, 1.46198830e-02, 1.46198830e-02,
       1.49853801e-02, 1.49853801e-02, 1.79093567e-02, 1.79093567e-02,
       2.01023392e-02, 2.01023392e-02, 2.30263158e-02, 2.30263158e-02,
       2.66812865e-02, 2.66812865e-02, 2.77777778e-02, 2.77777778e-02,
       3.10672515e-02, 3.10672515e-02, 3.39912281e-02, 3.39912281e-02,
       3.80116959e-02, 3.80116959e-02, 3.87426901e-02, 3.87426901e-02,
       4.05701754e-02, 4.05701754e-02, 4.13011696e-02, 4.13011696e-02,
       4.38596491e-02, 4.38596491e-02, 4.45906433e-02, 4.45906433e-02,
       4.89766082e-02, 4.89766082e-02, 5.29970760e-02, 5.29970760e-02,
       5.37280702e-02, 5.37280702e-02, 5.51900585e-02, 5.51900585e-02,
      

In [53]:
from sklearn.metrics import roc_auc_score

In [56]:
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = 2
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])


# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

NameError: name 'roc_curve' is not defined

In [54]:
lw = 2
plt.plot(fpr[1], tpr[1], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

NameError: name 'roc_auc' is not defined