In [1]:
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, r2_score
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from statsmodels.regression.linear_model import OLS
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.inspection import plot_partial_dependence
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

In [35]:
def upsampler(X_train, y_train, target = 'pt_attempt'):
    '''
    Args: X_train and y_train
    Optional: what is the target
    Returns: y_train, and X_train with the target rows sampled with replacement to equal 
    the number of non-target rows (makes X_train much bigger)
    '''
    y_train = pd.Series(y_train)
    
    X = pd.concat([X_train, y_train], axis=1) 
    no_coup = X[X[target]==0]
    coup = X[X[target]==1]
    coups_upsampled = resample(coup,
                          replace=True, # sample with replacement
                          n_samples=len(no_coup), # match number in majority class
                          random_state=29)
    upsampled = pd.concat([no_coup, coups_upsampled])
    y_up = upsampled[target]
    X_up = upsampled.drop(target, axis = 1)
    return X_up, y_up

def metric_test(model, X_test, y_test):
    '''
    Prints out the accuracy, recall, precision, and f1 score for the 
    fit model when it predicts on the test data
    '''
    preds = model.predict(X_test)
    print('accuracy = ' + str(accuracy_score(y_test, preds)))
    print('recall = ' + str(recall_score(y_test, preds)))
    print('precision = ' + str(precision_score(y_test, preds)))
    print('f1 score = ' + str(f1_score(y_test, preds)))
    
def get_feature_weights(model, feature_labels):
    '''
    returns coefficients for features in a model (intended for logistic regression) 
    args: model, feature_labels
    returns: a sorted series in ascending order of feature weights.
    '''
    d_log_vals = {}
    for idx, feat in enumerate(model.coef_[0]):
        d_log_vals[feature_labels[idx]] = feat  
    s_log_vals = (pd.Series(d_log_vals)).sort_values()
    return s_log_vals

In [36]:
reign_df = pd.read_pickle('../data/year_agg.pkl')

In [37]:
dummies = pd.get_dummies(reign_df['government'])
df_dumb = reign_df.join(dummies)

In [38]:
df_dumb['pt_attempt'] = df_dumb['coupyear']
df_dumb['pt_suc'] = df_dumb['coupsuc']

In [39]:
df = df_dumb.drop(['ccode', 'country', 'leader', 'month', 'government', 'coupyear', 'coupsuc'], axis = 1)

In [40]:
df_early = df[df['year'] < 1975 ]

In [41]:
df_late = df[df['year'] >= 1975 ]

In [74]:
full_df_late = reign_df[reign_df['year'] >= 1975 ]

In [42]:
df_early

Unnamed: 0,year,elected,age,male,militarycareer,tenure_months,anticipation,ref_ant,leg_ant,exec_ant,irreg_lead_ant,election_now,election_recent,leg_recent,exec_recent,lead_recent,ref_recent,direct_recent,indirect_recent,victory_recent,defeat_recent,change_recent,nochange_recent,delayed,lastelection,loss,irregular,prev_conflict,precip,yearcode,Dominant Party,Foreign/Occupied,Indirect Military,Military,Military-Personal,Monarchy,Oligarchy,Parliamentary Democracy,Party-Military,Party-Personal,Party-Personal-Military Hybrid,Personal Dictatorship,Presidential Democracy,Provisional - Civilian,Provisional - Military,Warlordism,pt_attempt,pt_suc
0,1950.0,1.0,66.0,1,0.0,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.639057,5.327876,7.565793,0.0,-0.069058,21950.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False
1,1951.0,1.0,67.0,1,0.0,70.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.258097,5.384495,7.571989,0.0,-0.202572,21951.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False
2,1952.0,1.0,68.0,1,0.0,82.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.637586,5.438079,7.578145,0.0,0.394373,21952.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False
3,1953.0,1.0,69.0,1,0.0,94.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.098612,1.098612,7.584265,0.0,-0.665329,21953.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False
4,1953.0,1.0,63.0,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.098612,1.098612,7.584265,0.0,-0.665329,21953.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11318,1970.0,0.0,49.0,1,0.0,97.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.583519,4.584968,4.584968,0.0,-0.165010,9901970.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,False,False
11319,1971.0,0.0,49.0,1,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.484907,2.484907,4.700481,0.0,0.385177,9901971.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,False,False
11320,1972.0,0.0,50.0,1,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.178054,3.178054,4.804021,0.0,0.706669,9901972.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,False,False
11321,1973.0,0.0,51.0,1,0.0,36.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.583519,3.583519,4.897839,0.0,-0.043074,9901973.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,False,False


In [44]:
y = df_early['pt_attempt']
X = df_early.drop(['pt_attempt','pt_suc'], axis = 1)

In [45]:
clf = RandomForestClassifier(max_depth=5, n_estimators=1000)

In [46]:
rfpipe = Pipeline([('scaler', StandardScaler()),('rf', clf)])

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= .25, random_state= 40, stratify = y)

In [48]:
X_up, y_up = upsampler(X_train, y_train)

In [53]:
rfpipe.fit(X_up, y_up)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('rf',
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=5,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=1000, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)

In [54]:
metric_test(rfpipe, X_test, y_test)

accuracy = 0.7301808066759388
recall = 0.76
precision = 0.17272727272727273
f1 score = 0.28148148148148144


In [57]:
rfpipe.predict_proba(X)

array([[0.71642509, 0.28357491],
       [0.80012404, 0.19987596],
       [0.78373975, 0.21626025],
       ...,
       [0.55764111, 0.44235889],
       [0.56040559, 0.43959441],
       [0.56134135, 0.43865865]])

In [68]:
X_late = df_late.drop(['pt_attempt','pt_suc'], axis = 1)

In [69]:
preds = rfpipe.predict_proba(X_late)

In [65]:
len(df_late)

8493

In [70]:
len(preds[:,1])

8493

In [75]:
full_df_late['risk'] = preds[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [78]:
full_df_late.to_pickle('../data/pickles/df_late.pkl')