In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn import preprocessing
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    recall_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
  
# step1: load the complaints export; every NaN in the frame becomes the literal 'missing'
df = pd.read_csv('shared/complaints_25Nov21.csv').fillna('missing')

# step2: one-hot encode the selected predictor columns; 'Consumer disputed?' is the target
feature_columns = [
    'Product', 'Sub-product', 'Issue', 'State', 'Tags',
    'Submitted via', 'Company response to consumer', 'Timely response?',
]
X = pd.get_dummies(df[feature_columns])
y = df['Consumer disputed?']

# step2.b: turn the target labels into integer class codes
le = preprocessing.LabelEncoder().fit(y)
y = le.transform(y)

# step3: hold out 20% of the rows as the test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

# step4: rebalance the training split only, and only when the mean encoded
# label is below 0.3; the test split keeps its original class mix
needs_rebalancing = y_train.mean() < 0.3
if needs_rebalancing:
    undersampler = RandomUnderSampler(random_state=123)
    X_train, y_train = undersampler.fit_resample(X_train, y_train)

In [2]:
# step5: fit an XGBoost classifier (default hyper-parameters, fixed seed) on the
# training split, then take hard class predictions for the test split
model_xgb = XGBClassifier(random_state=123).fit(X_train, y_train)
y_pred = model_xgb.predict(X_test)

# step6: print the per-class report followed by the confusion matrix
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.53      0.65     32504
           1       0.27      0.63      0.38      8948

    accuracy                           0.55     41452
   macro avg       0.55      0.58      0.51     41452
weighted avg       0.72      0.55      0.59     41452

[[17128 15376]
 [ 3302  5646]]


In [3]:
# step7 q4
# Base case: every test row with label 1 is charged 600 and every row with
# label 0 is charged 100 (presumably the "no model" scenario — confirm).
n_label_one = y_test.sum()
n_label_zero = len(y_test) - n_label_one
base_case_cost = 600 * n_label_one + 100 * n_label_zero
print("base cost:", base_case_cost)

# step8 q5
# Model case: every row predicted 1 (tp or fp) is charged 190, true negatives 100,
# false negatives 600. Rows of the confusion matrix are the true classes 0 and 1.
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)
model_cost = 190 * (tp + fp) + 100 * tn + 600 * fn
print("model cost:", model_cost)

base cost: 8619200
model cost: 7688180


In [4]:
# step9 q7
# Find the decision threshold on the class-1 probability that minimises total cost
# (190 per predicted positive, 100 per true negative, 600 per false negative).
# NOTE(review): the threshold is chosen on the same test split whose cost is reported
# afterwards, so that reported cost is presumably optimistic — consider tuning on a
# separate validation split.
probs = model_xgb.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, probs)
thresholds = np.append(thresholds, 1)  # also evaluate the threshold 1

# Evaluate all thresholds in one pass instead of rebuilding a confusion matrix per
# threshold: sort the rows by score once, then for each threshold count how many rows
# (and how many label-1 rows) fall strictly below it. Those rows are the ones predicted 0
# under the rule `probs >= threshold`.
labels = np.asarray(y_test)
ascending = np.argsort(probs, kind="stable")
scores_sorted = probs[ascending]
# positives_below[k] = number of label-1 rows among the k lowest-scored rows
positives_below = np.concatenate(([0], np.cumsum(labels[ascending])))

n_below = np.searchsorted(scores_sorted, thresholds, side="left")
fn_at = positives_below[n_below]            # label 1, predicted 0
tn_at = n_below - fn_at                     # label 0, predicted 0
tp_at = positives_below[-1] - fn_at         # label 1, predicted 1
fp_at = (len(labels) - n_below) - tp_at     # label 0, predicted 1

# Kept as a plain list, one entry per threshold, in threshold order.
costs = (tp_at * 190 + tn_at * 100 + fp_at * 190 + fn_at * 600).tolist()

# np.argmin returns the first minimum, i.e. the lowest threshold among ties.
min_cost_threshold = thresholds[np.argmin(costs)]
print("Threshold with minimum cost:", min_cost_threshold)

Threshold with minimum cost: 0.4418143033981323


In [5]:
#q6
# Re-label the test split at the selected threshold and total the cost:
# 190 per predicted positive, 100 per true negative, 600 per false negative.
y_pred_adj = np.greater_equal(probs, min_cost_threshold).astype(int)
(tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred_adj)
cost = 190 * (tp + fp) + 100 * tn + 600 * fn
print("adjusted threshold cost:", cost)

adjusted threshold cost: 7605970


In [6]:
#q1
# Mean of the encoded labels in the test split (the share of rows labelled 1)
proportion_dispute_test = np.mean(y_test)
proportion_dispute_test

0.21586413200810575

In [7]:
#q2
# Mean of the encoded labels in the training split as it stands after step4
proportion_dispute_train_undersampled = np.mean(y_train)
proportion_dispute_train_undersampled

0.5

In [8]:
#q3
# Recall of the default-threshold predictions (y_pred) on the test split.
# recall_score is imported with the other sklearn.metrics names in the top import block.
# Note: this rebinds `recall`, which earlier held the array returned by
# precision_recall_curve in the threshold-search cell.
recall = recall_score(y_test, y_pred)
recall

0.6309789897183729