In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from scipy.stats import pearsonr
from sklearn.linear_model import LogisticRegression 
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif, chi2, mutual_info_classif, SelectKBest
import pprint
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, auc

In [3]:
data = pd.read_csv('new_data.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,process_days,process_day_>=1,Vulnerability Threshold,Field:Aggr 3 Threats Since Moving,Field:US Prior Admission,Field:Aggr 3 Moved to Avoid Aggr,Field:Family Worked for USG,Field:RFR,Field:Aggr 2 Moved to Avoid Aggr,...,Field:Aggr 3 Threat Within Last 6 Months,Field:Women and Girls Severity Assessment,Field:DAPEV Info Rating,Field:Canadian Sponsorship Info Rating,Field:Aggr 1 Reason Unable to Move,Field:Nationality2,Field:Email Address1,Field:I 130 F,Field:Worked for USG,Field:Aggr 1 Mst Svr Only Threats
0,18,6.0,1.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,25,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,32,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,34,0.0,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
x = data.drop(columns=['Unnamed: 0', 'process_days', 'process_day_>=1', 'Vulnerability Threshold'])
y = data['Vulnerability Threshold']

In [10]:
var = y.name

In [25]:
data['Vulnerability Threshold'].value_counts()

0    2133
1      44
Name: Vulnerability Threshold, dtype: int64

In [19]:
from sklearn.utils import resample

In [22]:
def upsample(x,y):
    
    # Find target name
    var= y.name
    
    df = pd.concat([x, y], axis=1)
    
    # Find minority & majority class
    if df[df[var]==0].shape[0] > df[df[var]==1].shape[0]:
        df_majority = df[df[var]==0]
        df_minority = df[df[var]==1]
    else:
        df_majority = df[df[var]==1]
        df_minority = df[df[var]==0]

    # Upsample minority class
    df_minority_upsampled = resample(df_minority, 
                                     replace=True,     # sample with replacement
                                     n_samples=df_majority.shape[0],    # to match majority class
                                     random_state=123) # reproducible results

    # Combine majority class with upsampled minority class
    df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    # Display new class counts
    print(df_upsampled[var].value_counts())
    
    new_x = df_upsampled.drop(var,axis=1)
    new_y = df_upsampled[var]
    
    return new_x, new_y

In [43]:
def downsample(x,y):
    
    # Find target name!
    var= y.name
    
    df = pd.concat([x, y], axis=1)
    
    # Find minority & majority class
    if df[df[var]==0].shape[0] > df[df[var]==1].shape[0]:
        df_majority = df[df[var]==0]
        df_minority = df[df[var]==1]
    else:
        df_majority = df[df[var]==1]
        df_minority = df[df[var]==0]
    
    
#     if size == None:
#         n = df_minority.shape[0] 
#     else: n = size
    
    # Upsample minority class
    df_majority_downsampled = resample(df_majority, 
                                     replace=True,     # sample with replacement
                                     n_samples=df_minority.shape[0] ,    # to match majority class
                                     random_state=123) # reproducible results

    # Combine majority class with upsampled minority class
    df_downsampled = pd.concat([df_minority, df_majority_downsampled])

    # Display new class counts
    print(df_downsampled[var].value_counts())
    
    new_x = df_downsampled.drop(var,axis=1)
    new_y = df_downsampled[var]
    
    return new_x, new_y

In [44]:
a, b = downsample(x,y)

1    44
0    44
Name: Vulnerability Threshold, dtype: int64


### Training

In [66]:
x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    test_size=0.3,
                                                    random_state=123)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1523, 69)
(654, 69)
(1523,)
(654,)


In [67]:
x_train, y_train = upsample(x_train, y_train)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

1    1488
0    1488
Name: Vulnerability Threshold, dtype: int64
(2976, 69)
(654, 69)
(2976,)
(654,)


In [48]:
def get_feat(method, k, x, y):
    select = SelectKBest(method, k)
    _ = select.fit_transform(x, y)
    return [x.columns[i] for i, val in enumerate(select.get_support()) if val] 

In [157]:
num_features = 25
chi2_select = get_feat(chi2, num_features, x_train, y_train)
f_select = get_feat(f_classif, num_features, x_train, y_train)
all_feat = x.columns

  f = msb / msw


In [165]:
for i in [all_feat,chi2_select,f_select]:
    model = LogisticRegression()
    model.fit(x_train[i],y_train)
    print("test accuracy:", model.score(x_test[i],y_test))
    
    y_pred = model.predict(x_test[i])
    cm = confusion_matrix(y_test, y_pred,labels=[1, 0])
    tpr = cm[0][0]/np.sum(cm[0])
    tnr = cm[1][1]/np.sum(cm[1])
    
    print(cm)
    print("True positive rate:", tpr)
    print("True negative rate:", tnr)

    pprint.pprint(sorted(list(zip(model.coef_[0],x[i].columns)), reverse=True, key=lambda t: abs(t[0])))
    print("\n"+"-"*100)

test accuracy: 0.22782874617737003
[[  8   1]
 [504 141]]
True positive rate: 0.8888888888888888
True negative rate: 0.2186046511627907
[(-3.0182229837706003, 'Field:Women and Girls at Risk'),
 (-2.992549650071295, 'Field:Aggr 2 Moved to Avoid Aggr'),
 (2.386811800405464, 'Field:Aggr 2 Mst Svr Only Threats'),
 (2.2100315325798263, 'Field:I 130 IR'),
 (2.2100315325798263, 'Field:EO Waiver'),
 (-2.123599288363052, 'Married'),
 (2.0945013262753402, 'Field:Aggr 1 Attack Within Last 6 Months'),
 (-2.035336047215183, 'Field:Lack of Durable Solutions'),
 (-1.9090676017947987, 'Field:Aggr 1 Reason Unable to Move'),
 (-1.7820395357680154, 'Field:Traumatic Incident 2 Assessment'),
 (1.7047183104959287, 'Field:Aggr 2 Mst Svr Attack'),
 (1.6786387594709715, 'Field:Aggr 2 Reason Unable to Move'),
 (1.5955862196193045, 'Field:Aggr 2 SVT Rating'),
 (-1.568282545747491, 'Field:Canadian Sponsorship Info Rating'),
 (-1.4601799022446438, 'Field:Medical Condition 2 Assessment'),
 (1.4080866382180623, 'Fie

In [113]:
for i in [all_feat]:
    model = RandomForestClassifier(max_depth=10,n_estimators = 64,criterion='gini',random_state=123)
    model.fit(x_train[i],y_train)
    print("test accuracy:", model.score(x_test[i],y_test))
    
    y_pred = model.predict(x_test[i])
    cm = confusion_matrix(y_test, y_pred,labels=[1, 0])
    tpr = cm[0][0]/np.sum(cm[0])
    tnr = cm[1][1]/np.sum(cm[1])
    
    print(cm)
    print("True positive rate:", tpr)
    print("True negative rate:", tnr)

    pprint.pprint(sorted(list(zip(model.feature_importances_,x[i].columns)), reverse=True))

test accuracy: 0.24617737003058104
[[  8   1]
 [492 153]]
True positive rate: 0.8888888888888888
True negative rate: 0.2372093023255814
[(0.057362557809974214, 'Field:Women and Girls at Risk'),
 (0.05712977802438734, 'Field:Aggr 2 SVT Rating'),
 (0.04826432682228885, 'Field:End Feedback Follow Up Conversation'),
 (0.045422487209182225, 'Field:Aggr 2 Threat Within Last 6 Months'),
 (0.04063903830498758, 'Field:Traumatic Incident 2 Assessment'),
 (0.039633270461623064, 'Field:Aggr 1 Moved to Avoid Aggr'),
 (0.03918332645463393, 'Field:Vulnerability Assessed'),
 (0.03773456818356787, 'Married'),
 (0.036317489740278035, 'Field:Lack of Durable Solutions'),
 (0.035326153142035806, 'Field:Aggr 2 Mst Svr Only Threats'),
 (0.0351032982044298, 'Field:Aggr 2 Mst Svr Attack'),
 (0.03351490989431306, 'Field:Canadian Sponsorship Info Rating'),
 (0.03072375102815407, 'Field:German Reunification Info Rating'),
 (0.03034537479788532, 'Field:Multiple Incidents'),
 (0.03027687868452229, 'Field:Prescreeni