In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pydotplus
import seaborn as sns
from IPython.display import Image
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import precision_score, recall_score, classification_report
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier,  export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

In [2]:
df = pd.read_csv('../Task1/df_after_dp.csv', parse_dates=['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187534 entries, 0 to 187533
Data columns (total 31 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   date                       187534 non-null  datetime64[ns]
 1   state                      187534 non-null  object        
 2   city_or_county             187534 non-null  object        
 3   latitude                   187534 non-null  float64       
 4   longitude                  187534 non-null  float64       
 5   congressional_district     187534 non-null  int64         
 6   avg_age_participants       187534 non-null  float64       
 7   n_participants_child       187534 non-null  int64         
 8   n_participants_teen        187534 non-null  int64         
 9   n_females                  187534 non-null  float64       
 10  n_killed                   187534 non-null  float64       
 11  n_injured                  187534 non-null  float64 

In [3]:
df['isKilled'] = np.where(df['n_killed'] > 0, 1, 0)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187534 entries, 0 to 187533
Data columns (total 32 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   date                       187534 non-null  datetime64[ns]
 1   state                      187534 non-null  object        
 2   city_or_county             187534 non-null  object        
 3   latitude                   187534 non-null  float64       
 4   longitude                  187534 non-null  float64       
 5   congressional_district     187534 non-null  int64         
 6   avg_age_participants       187534 non-null  float64       
 7   n_participants_child       187534 non-null  int64         
 8   n_participants_teen        187534 non-null  int64         
 9   n_females                  187534 non-null  float64       
 10  n_killed                   187534 non-null  float64       
 11  n_injured                  187534 non-null  float64 

In [4]:
def discretize_data(dataset, variables):
    for variable in variables:
        #get the unique variable's values
        var = sorted(dataset[variable].unique())
        
        #generate a mapping from the variable's values to the number representation  
        mapping = dict(zip(var, range(0, len(var) + 1)))

        #add a new colum with the number representation of the variable
        dataset[variable+'_num'] = dataset[variable].map(mapping).astype(int)
    return dataset

In [5]:
to_discretize = ['date', 'state', 'city_or_county', 'party', 'incident_characteristics1']
df = discretize_data(df, to_discretize)

In [6]:
df.drop(columns=['date', 'state', 'city_or_county', 'party', 'incident_characteristics1'], axis=1,inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187534 entries, 0 to 187533
Data columns (total 32 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   latitude                       187534 non-null  float64
 1   longitude                      187534 non-null  float64
 2   congressional_district         187534 non-null  int64  
 3   avg_age_participants           187534 non-null  float64
 4   n_participants_child           187534 non-null  int64  
 5   n_participants_teen            187534 non-null  int64  
 6   n_females                      187534 non-null  float64
 7   n_killed                       187534 non-null  float64
 8   n_injured                      187534 non-null  float64
 9   n_arrested                     187534 non-null  float64
 10  n_unharmed                     187534 non-null  float64
 11  n_participants                 187534 non-null  float64
 12  povertyPercentage             

In [7]:
col_to_drop = ['latitude', 'longitude', 'n_killed', 'candidatevotes', 'totalvotes', 'incident_gravity', 'females_rate',
       'minor_rate', 'arrested_rate', 'survival_rate',  'killed_rate', 'injured_rate',
       'killed_disp_per_district', 'injured_disp_per_district',
       'part_disp_per_district', 'winning_party_percentage', 'n_injured','n_unharmed', "incident_characteristics1_num"]

df.drop(columns=col_to_drop, axis=1,inplace=True)

In [8]:
label = df.pop('isKilled')

In [9]:
X_trvl, X_test, y_trvl, y_test = train_test_split(df, label, test_size=0.30,random_state=10, stratify=label)

In [None]:
def print_metrics_cv(results_hp_search, model_name):
    index_best_model = results_hp_search.best_index_ 
    results_dict = results_hp_search.cv_results_
    metrics_list=['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    if model_name == 'MLP':
        metrics_list.append('mse')
    print(f"CV best configuration for {model_name}:")
    print(f"best parameters {results_dict['params'][index_best_model]}")
    for i in metrics_list:        
        if i == 'mse':
            print(f'Mean {i} train set: {abs(results_dict[f"mean_train_{i}"][index_best_model])} +/- {results_dict[f"std_train_{i}"][index_best_model]}')
            continue
        print(f'Mean {i} train set: {results_dict[f"mean_train_{i}"][index_best_model]} +/- {results_dict[f"std_train_{i}"][index_best_model]}')
    print("\n")
    for i in metrics_list:
        if i == 'mse':
            print(f'Mean {i} train set: {abs(results_dict[f"mean_test_{i}"][index_best_model])} +/- {results_dict[f"std_test_{i}"][index_best_model]}')
            continue
        print(f'Mean {i} validation set: {results_dict[f"mean_test_{i}"][index_best_model]} +/- {results_dict[f"std_test_{i}"][index_best_model]}')
    print("\n")

In [None]:
# Function that prints the classification report
def print_report_score(test_label, test_pred):
    print(classification_report(test_label, 
                            test_pred, 
                            target_names=['NotKilled', 'isKilled']))

# Function that prints the confusion matrix
def print_confusion_matrix(test_label, pred_label, model):
    cm = confusion_matrix(test_label, pred_label)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
    disp.plot()
    plt.show()

In [None]:
# Function extracting each grid from dictionary of grids
def list_grids(grids_dict):
    return [grids_dict[item] for item in grids_dict]

# Function performing gridsearch cv according to sklearn
def do_sklearn_GridSearchCV(model_name,model,param_grid,scoring,refit,cv,return_train_score,n_jobs,X_encoded,y):
    hp_search = GridSearchCV(model,
                                param_grid=param_grid,
                                scoring=scoring,
                                refit=refit,
                                cv=cv,
                                return_train_score=return_train_score,
                                n_jobs=n_jobs,
                                verbose=10
                                ).fit(X_encoded, y)


    results = pd.DataFrame(hp_search.cv_results_)
    if not os.path.isdir(f"cv_results/"):
        os.mkdir(f"cv_results/")

    results.to_csv(f"cv_results/{model_name}_cv_results.csv")

    return hp_search

In [21]:
from interpret.glassbox import ExplainableBoostingClassifier

ebm = ExplainableBoostingClassifier()

ebm_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.05],
    'random_state': [10],
    'min_samples_leaf': [1, 2, 3],
    'max_leaves': [3, 5, 7]
    }

ebm_grid_search = do_sklearn_GridSearchCV('ebm', ebm, ebm_grid, ['accuracy', 'precision', 'recall', 'f1', 'roc_auc'],"accuracy", 5, True, 4, X_trvl, y_trvl)

ebm.fit(X_trvl, y_trvl)

ebm.predict(X_test)

ebm.score(X_test, y_test)


KeyboardInterrupt: 

In [11]:
from interpret import show

ebm_global = ebm.explain_global()
show(ebm_global)


In [15]:
ebm_local = ebm.explain_local(X_test[:10], y_test[:10])
show(ebm_local)

In [13]:
from sklearn.metrics import roc_auc_score

auc = roc_auc_score(y_test, ebm.predict_proba(X_test)[:, 1])
print("AUC: {:.3f}".format(auc))

AUC: 0.830


In [14]:
#calculate accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, ebm.predict(X_test))
print("Accuracy: {:.3f}".format(accuracy))

Accuracy: 0.795


In [17]:
from interpret.perf import ROC

ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name='EBM Adult')

show(ebm_perf)

In [None]:
from aix360.metrics import faithfulness_metric, monotonicity_metric

predicted_class = ebm.predict(X_test.values[0].reshape(1,-1))[0]

le = exp.local_exp[predicted_class]

m = exp.as_map()

x = X_test.values[0]
coefs = np.zeros(x.shape[0])

for v in le:
    coefs[v[0]] = v[1]


base = np.zeros(x.shape[0])


print("Faithfulness: ", faithfulness_metric(bb, x, coefs, base))
print("Monotonity: ", monotonicity_metric(bb, x, coefs, base))