In [None]:
'''import data science packages pandas, numpy, matplotlib, seaborn, and sklearn'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

'''import plotly'''
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, r2_score, roc_curve, auc, roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold 
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
import warnings, gc
import matplotlib.colors

from scipy import stats


warnings.filterwarnings("ignore")
# init_notebook_mode(connected=True)


In [None]:
# Load the train and test sets from the pre-converted Feather files.
# The data directory is named once so the location only has to be changed in one place.
DATA_DIR = 'C:/Users/tyler/Desktop/Kaggle/Amex Feather Data'

train = pd.read_feather(f'{DATA_DIR}/train_data.ftr')
# Optional subsample for faster iteration:
#train = train.sample(frac=0.1, random_state=42)
test = pd.read_feather(f'{DATA_DIR}/test_data.ftr')
# Optional subsample for faster iteration:
#test = test.sample(frac=0.1, random_state=42)

In [None]:
def change_col_name(df):
    '''Rename coded column prefixes to readable names, in place.

    S_ -> 'Spend ', R_ -> 'Risk ', P_ -> 'Payment ', D_ -> 'Delinquency ',
    B_ -> 'Balance ', and a leading 'target' -> 'Target'. Only the leading
    prefix of a column name is rewritten; the rest of the name is kept as is.
    Columns that match no prefix are left untouched, so calling the function
    on an already-renamed frame is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are reassigned.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenience.
    '''
    prefix_names = (
        ('S_', 'Spend '),
        ('R_', 'Risk '),
        ('P_', 'Payment '),
        ('D_', 'Delinquency '),
        ('B_', 'Balance '),
        ('target', 'Target'),
    )

    def _rename(name):
        # Slice off the matched prefix instead of using str.replace, which
        # would also rewrite later occurrences of the same text in the name.
        for prefix, readable in prefix_names:
            if name.startswith(prefix):
                return readable + name[len(prefix):]
        return name

    df.columns = [_rename(x) for x in df.columns]
    return df

# Rename the coded columns of both frames; change_col_name reassigns
# df.columns on the frame it is given and returns that same frame.
change_col_name(train)
change_col_name(test)

In [None]:
# Categorical feature columns (names as they appear after renaming).
cat_cols = [
    'Balance 30',
    'Balance 38',
    'Delinquency 63',
    'Delinquency 64',
    'Delinquency 66',
    'Delinquency 68',
    'Delinquency 114',
    'Delinquency 116',
    'Delinquency 117',
    'Delinquency 120',
    'Delinquency 126',
]

# Every remaining column of train, in its original order.
num_cols = list(filter(lambda name: name not in cat_cols, train.columns))

In [None]:
# Label-encode every categorical feature, fitting on train and applying the
# same mapping to test. The target is skipped by name rather than by slicing
# off the last list entry, so no feature is dropped when 'Target' is not in
# cat_cols.
enc = LabelEncoder()
for col in cat_cols:
    if col == 'Target':
        continue
    train[col] = enc.fit_transform(train[col])
    test[col] = enc.transform(test[col])

In [None]:
#Count of Column Types
train.dtypes.value_counts()

In [None]:
for col in cat_cols:
    print(col, train[col].nunique())

In [None]:
#count of null values


In [None]:
for col in cat_cols:
    print(col, train[col].isnull().sum())

In [None]:
num_cols

In [None]:
# Remove the statement date and the customer identifier from the training frame.
# errors='ignore' keeps the cell re-runnable; inplace avoids copying the full frame.
train.drop(columns=['Spend 2', 'customer_ID'], errors='ignore', inplace=True)
# del test['Spend 2']

# Remove the non-feature names from num_cols by name instead of by position,
# so the result does not depend on column order or on how often the cell runs.
num_cols = [col for col in num_cols if col not in ('customer_ID', 'Spend 2', 'Target')]
num_cols

In [None]:
train[cat_cols]

In [None]:
train[cat_cols].nunique()

In [None]:
# Convert every categorical column of train to string in one assignment.
train[cat_cols] = train[cat_cols].astype(str)

# Same conversion for test (currently disabled):
# for col in cat_cols:
#     test[col] = test[col].astype(str)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Categorical branch: fill gaps with a sentinel label, then one-hot encode.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing_value')),
    ('one hot encode', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Numeric branch: fill gaps with 0, then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Combined transformer applying each branch to its own column list.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

In [None]:
#Run categorical transformer on data
ohe_df = categorical_transformer.fit_transform(train[cat_cols])


In [None]:
# Memory size of ohe_df's stored values in MB.
# NOTE(review): `.data` presumably holds only the stored (non-zero) values of
# the sparse matrix, so its index arrays are not counted here — confirm.
ohe_df.data.nbytes / 1024**2

In [None]:
#memory size of train in MB
train.memory_usage().sum() / 1024**2

In [None]:
transformed_num_cols = numeric_transformer.fit_transform(train[num_cols])

In [None]:
transformed_num_cols

In [None]:
#Combine transformed_num_cols and ohe_df
train_transformed = np.concatenate([transformed_num_cols, ohe_df.toarray()], axis=1)

In [None]:
train_transformed = pd.DataFrame(train_transformed)

In [None]:
train_transformed

In [None]:
#memory usage of train_transformed in MB
train_transformed.memory_usage().sum() / 1024**2

In [None]:
y = train['Target']
X = train_transformed

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Hold out 20% of the transformed rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the classifier on the training part and score it on the held-out part.
logreg = LogisticRegression(solver='sag', verbose=1, n_jobs=-1, random_state=42).fit(X_train, y_train)
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(verbose=1, n_jobs=-1)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print('Accuracy of Random Forest Classifier on test set: {:.2f}'.format(rfc.score(X_test, y_test)))


In [None]:
#load test data and preprocess
# test = pd.read_feather('C:/Users/tyler/Desktop/Kaggle/Amex Feather Data/test_data.ftr')
# change_col_name(test)
# del test['Spend 2']
# del test['customer_ID']
# test[cat_cols] = test[cat_cols].astype(str)
# test_transformed = preprocessor.transform(test)
# test_transformed = pd.DataFrame(test_transformed)
# test_transformed


**Below are models with no imputation**


In [None]:

cat_cols=['Balance 30', 'Balance 38', 'Delinquency 63', 'Delinquency 64', 'Delinquency 66', 'Delinquency 68',
          'Delinquency 114', 'Delinquency 116', 'Delinquency 117', 'Delinquency 120', 'Delinquency 126', 'Target']

In [None]:
# Select numerical columns that have no missing values and are not in cat_cols.
# 'Spend 2' is excluded inside the filter: list.remove() raises ValueError when
# the column has already been deleted from train.
num_cols = [
    col for col in train.columns
    if col not in cat_cols
    and col != 'Spend 2'
    and train[col].dtype != 'object'
    and train[col].isnull().sum() == 0
]
len(num_cols)

In [None]:
'''train test split sklearn'''
X = train[num_cols]
y = train['Target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
#Logistic Regression with validation set
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))



In [None]:
#ROC Curve
# calculate the fpr and tpr for all thresholds of the classification
probs = logreg.predict_proba(X_test)
preds = probs[:,1]
fpr, tpr, threshold = roc_curve(y_test, preds)
roc_auc = auc(fpr, tpr)
#Plot roc_curve
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0, 1], [0, 1],'r--')

In [None]:
#Plot confusion matrix of logistic regression using plotly
import plotly.figure_factory as ff
cm = confusion_matrix(y_test, y_pred)
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
cm = np.around(cm, decimals=2)
fig = ff.create_annotated_heatmap(cm, x=['Predicted 0', 'Predicted 1'], y=['Actual 0', 'Actual 1'], colorscale='Viridis')
fig.update_layout(title_text='Confusion Matrix')

In [None]:
# Run logistic regression with standardized features.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training split only, then apply it to both splits.
scaler.fit(X_train)
Scaled_X_train = scaler.transform(X_train)
Scaled_X_test = scaler.transform(X_test)
# This refits the existing `logreg` object, so from here on it is a model
# trained on standardized inputs.
logreg.fit(Scaled_X_train, y_train)
y_pred = logreg.predict(Scaled_X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(Scaled_X_test, y_test)))


In [None]:
#Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(verbose=1, n_jobs=-1)
rfc.fit(X_train, y_train)
rfc_pred = rfc.predict(X_test)
print('Accuracy of Random Forest Classifier on test set: {:.2f}'.format(rfc.score(X_test, y_test)))


In [None]:
# Average the class probabilities of logistic regression and random forest.
# logreg was last fitted on the standardized features, so it must be given
# Scaled_X_test; the random forest was fitted on the raw features and keeps X_test.
ensemble = (logreg.predict_proba(Scaled_X_test) + rfc.predict_proba(X_test)) / 2
ensemble_pred = np.argmax(ensemble, axis=1)
# Accuracy of the averaged prediction
print('Accuracy of Ensemble on test set: {:.2f}'.format(accuracy_score(y_test, ensemble_pred)))


In [None]:
'''Graph of Spend 2'''
# NOTE(review): an earlier cell runs `del train['Spend 2']` and train is not
# reloaded before this point, so on a top-to-bottom run this column lookup
# presumably fails — confirm and reload or reorder if so.
fig = px.histogram(train, x="Spend 2", color="Target", marginal="box", hover_data=train.columns)
fig.show()

In [None]:
'''check which cat_cols do not have any null values'''
for col in cat_cols[:-1]:
    print(col, train[col].isnull().sum())
    print(col, test[col].isnull().sum())

'''Run test between train and test set to see if they are from the same distribution'''
for col in ['Delinquency 63', 'Delinquency 64']:
    print(col, stats.ks_2samp(train[col], test[col]))

In [None]:
print(train['Delinquency 63'].value_counts())
print(test['Delinquency 63'].value_counts())

In [None]:
import scipy.stats as stats

# Observed category counts in each frame. The index is cast to str so that the
# two frames line up: train's categorical columns were cast to str earlier,
# while the same cast for test is commented out.
train_observed_freq = train['Delinquency 63'].value_counts().rename(index=str)
test_observed_freq = test['Delinquency 63'].value_counts().rename(index=str)

# One contingency table (rows: train/test, columns: categories). A category
# missing from one frame counts as 0 there.
contingency = pd.concat(
    [train_observed_freq, test_observed_freq], axis=1, keys=['train', 'test']
).fillna(0)

# Chi-squared test of homogeneity: do train and test share one category
# distribution? The previous version compared each frame's counts with a
# rescaled copy of themselves, which always gives chi2 = 0.
chi2, p, dof, ex = stats.chi2_contingency(contingency.T.values)

# Expected counts under the null hypothesis of a common distribution.
train_expected_freq = pd.Series(ex[0], index=contingency.index, name='train_expected')
test_expected_freq = pd.Series(ex[1], index=contingency.index, name='test_expected')

print(f'Chi-Squared Statistic (Train vs Test): {chi2:.3f}')
print(f'p-value (Train vs Test): {p:.3f}')
print(f'Degrees of freedom: {dof}')


In [None]:
train_expected_freq

In [None]:
test_expected_freq

In [None]:
'''Run test between train and test set to see if they are from the same distribution'''
for col in ['Delinquency 63', 'Delinquency 64']:
    print(col, stats.ks_2samp(train[col], test[col]))

In [None]:
# `le` is never defined in this notebook; use a fresh LabelEncoder instead.
train63 = LabelEncoder().fit_transform(train['Delinquency 63'])

In [None]:
# pd.crosstab needs both an index and a columns argument; calling it with
# none raises TypeError.
# NOTE(review): tabulating the category against the target is an assumption
# about the intended comparison — adjust if another pair was meant.
pd.crosstab(train['Delinquency 63'], train['Target'])

In [None]:
# Train and test histograms of Delinquency 63 and 64, one subplot per column.
delinquency_cols = ('Delinquency 63', 'Delinquency 64')
fig = make_subplots(rows=1, cols=2, subplot_titles=delinquency_cols)
for position, column in enumerate(delinquency_cols, start=1):
    for frame, label in ((train, 'Train'), (test, 'Test')):
        fig.add_trace(go.Histogram(x=frame[column], name=label), row=1, col=position)
fig

In [None]:
# Number of customers whose latest 'Spend 2' value falls on each date.
last_statement = train['Spend 2'].groupby(train.customer_ID).max()
# Name both output columns explicitly: the default names produced by
# value_counts().reset_index() differ between pandas versions, which could
# leave to_datetime applied to the counts instead of the dates.
temp = (last_statement.value_counts()
        .rename_axis('index')
        .reset_index(name='count'))
temp['index'] = pd.to_datetime(temp['index'])
temp

In [None]:
'''plot histogram of S_2'''
fig = px.histogram(train, x="Spend 2", nbins=700, title='Spend Histogram')
fig.show()


In [None]:
'''select numerical columns in dataframe'''
train.dtypes.value_counts()

In [None]:
from scipy.stats import ttest_ind
def t_test(df1, df2, col, equal_var=True, nan_policy='propagate'):
    '''Two-sample t-test on the same column of two dataframes.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain ``col``.
    col : str
        Name of the column to compare.
    equal_var : bool, default True
        Passed to ``scipy.stats.ttest_ind``; False selects Welch's t-test.
    nan_policy : {'propagate', 'omit', 'raise'}, default 'propagate'
        Passed to ``scipy.stats.ttest_ind``. With the default, any missing
        value makes the result NaN; use 'omit' for columns with gaps.

    Returns
    -------
    The result object of ``scipy.stats.ttest_ind`` (statistic and p-value).
    '''
    return ttest_ind(df1[col], df2[col], equal_var=equal_var, nan_policy=nan_policy)

In [None]:
t_test(train, test, 'Balance 24')

In [None]:
# Layout template (font and default size) applied to the figure.
temp = dict(
    layout=go.Layout(
        font=dict(family="Franklin Gothic", size=12),
        height=500,
        width=1000,
    )
)

# Share of each target class, relabelled for display.
target = train.Target.value_counts(normalize=True)
target = target.rename(index={1: 'Default', 0: 'Paid'})

# pal colours the slice outlines, color fills the slices.
pal = ['#016CC9', '#DEB078']
color = ['#8DBAE2', '#EDD3B3']

donut = go.Pie(
    labels=target.index,
    values=target * 100,
    hole=.45,
    showlegend=True,
    sort=False,
    marker=dict(colors=color, line=dict(color=pal, width=2.5)),
    hovertemplate="%{label} Accounts: %{value:.2f}%<extra></extra>",
)

fig = go.Figure()
fig.add_trace(donut)
fig.update_layout(
    template=temp,
    title='Target Distribution',
    legend=dict(traceorder='reversed', y=1.05, x=0),
    uniformtext_minsize=15,
    uniformtext_mode='hide',
    width=700,
)
fig.show()

In [None]:
# '''plotly express bar charts for Spend 1-30'''
# fig = make_subplots(rows=10, cols=3, subplot_titles=['Spend 1', 'Spend 2', 'Spend 3', 'Spend 4', 'Spend 5', 'Spend 6', 'Spend 7', 'Spend 8', 'Spend 9', 'Spend 10', 'Spend 11', 'Spend 12', 'Spend 13', 'Spend 14', 'Spend 15', 'Spend 16', 'Spend 17', 'Spend 18', 'Spend 19', 'Spend 20', 'Spend 21', 'Spend 22', 'Spend 23', 'Spend 24', 'Spend 25', 'Spend 26', 'Spend 27', 'Spend 28', 'Spend 29', 'Spend 30'])
# for i, col in enumerate(['Spend 1', 'Spend 2', 'Spend 3', 'Spend 4', 'Spend 5', 'Spend 6', 'Spend 7', 'Spend 8', 'Spend 9', 'Spend 10', 'Spend 11', 'Spend 12', 'Spend 13', 'Spend 14', 'Spend 15', 'Spend 16', 'Spend 17', 'Spend 18', 'Spend 19', 'Spend 20', 'Spend 21', 'Spend 22', 'Spend 23', 'Spend 24', 'Spend 25', 'Spend 26', 'Spend 27', 'Spend 28', 'Spend 29', 'Spend 30']):
#     row = i//3 + 1
#     col = i%3 + 1
#     fig.add_trace(go.Bar(x=train[col].value_counts().index, y=train[col].value_counts().values), row=row, col=col)
# fig.update_layout(height=1000, width=1000, title_text="Distribution of Spend Variables")
# fig.show()

Further Investigation Ideas from Kaggle

In [None]:
# Reload both raw frames so the following cells start from unmodified data.
train = pd.read_feather('C:/Users/tyler/Desktop/Kaggle/Amex Feather Data/train_data.ftr')
test = pd.read_feather('C:/Users/tyler/Desktop/Kaggle/Amex Feather Data/test_data.ftr')

# Rename the coded column prefixes to readable names.
change_col_name(train)
change_col_name(test)

# Keep the last row of each customer in file order and index by customer_ID.
# NOTE(review): presumably the last row is the most recent statement; this
# assumes rows are sorted by date within each customer — confirm.
train = train.groupby('customer_ID').tail(1).set_index('customer_ID')
test = test.groupby('customer_ID').tail(1).set_index('customer_ID')
# Drop the 'Spend 2' column from both frames.
del test['Spend 2']
del train['Spend 2']

In [None]:
#Provided by Amex in a Kaggle notebook

def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    '''Mean of the normalized weighted Gini and the top-4% capture rate.

    y_true must hold a 'target' column and y_pred a 'prediction' column; the
    two frames are joined on their index, so they need matching indexes.
    Rows with target 0 carry weight 20, all other rows weight 1.
    '''

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by descending score, attach row weights.
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].eq(0).map({True: 20, False: 1})
        return ranked

    def _captured_in_top_four_percent(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Share of all positives that sit within the first 4% of total weight.
        ranked = _ranked(truth, scores)
        cutoff = int(0.04 * ranked['weight'].sum())
        within_cutoff = ranked['weight'].cumsum() <= cutoff
        is_positive = ranked['target'] == 1
        return (is_positive & within_cutoff).sum() / is_positive.sum()

    def _weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Weighted area between the Lorentz curve and the random baseline.
        ranked = _ranked(truth, scores)
        weights = ranked['weight']
        weighted_positives = ranked['target'] * weights
        random_share = (weights / weights.sum()).cumsum()
        lorentz = weighted_positives.cumsum() / weighted_positives.sum()
        return ((lorentz - random_share) * weights).sum()

    # The Gini of a perfect ranking (the labels used as scores) is the normalizer.
    perfect_scores = y_true.rename(columns={'target': 'prediction'})
    g = _weighted_gini(y_true, y_pred) / _weighted_gini(y_true, perfect_scores)
    d = _captured_in_top_four_percent(y_true, y_pred)

    return 0.5 * (g + d)

In [None]:
#Idea for lgbm model from this kaggle notebook by Kelli Belcher https://www.kaggle.com/code/kellibelcher/amex-default-prediction-eda-lgbm-baseline#4-%7C-Default-Prediction

# Label-encode the categorical features (every cat_cols entry except the
# trailing 'Target'), fitting on train and reusing the mapping for test.
enc = LabelEncoder()
for col in cat_cols[:-1]:
    train[col] = enc.fit_transform(train[col])
    test[col] = enc.transform(test[col])

X=train.drop(['Target'],axis=1)
y=train['Target']
# Per-fold collectors: validation labels, validation probabilities,
# test-set probabilities and metric scores.
y_valid, gbm_val_probs, gbm_test_preds, gini=[],[],[],[]
ft_importance=pd.DataFrame(index=X.columns)
sk_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, val_idx) in enumerate(sk_fold.split(X, y)):

    # split() yields positional indices, while y is indexed by customer_ID,
    # so the rows have to be selected with .iloc rather than y[...].
    X_train, y_train = X.iloc[train_idx,:], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx,:], y.iloc[val_idx]

    print(f'Train shape {X_train.shape}, {y_train.shape}. Valid shape: {X_val.shape}, {y_val.shape}')


    gbm = LGBMClassifier().fit(X_train, y_train,
                                eval_set=[(X_train, y_train), (X_val, y_val)],
                                callbacks=[early_stopping(200), log_evaluation(200)],
                                eval_metric=['auc'])

    gbm_prob = gbm.predict_proba(X_val)[:,1]
    gbm_val_probs.append(gbm_prob)
    y_valid.append(y_val)

    # amex_metric joins its two frames on the index, so both are given a
    # fresh 0..n-1 index here.
    y_pred=pd.DataFrame(data={'prediction':gbm_prob})
    y_true=pd.DataFrame(data={'target':y_val.reset_index(drop=True)})
    gini_score=amex_metric(y_true = y_true, y_pred = y_pred)
    gini.append(gini_score)

    auc_score=roc_auc_score(y_val, gbm_prob)
    gbm_test_preds.append(gbm.predict_proba(test)[:,1])

    print(f'Fold {fold} Gini score: {gini_score}, AUC score: {auc_score}')

# Release the per-fold frames.
del X_train, y_train, X_val, y_val

change_col_name(train)
change_col_name(test)

In [None]:
# Columns whose name starts with 'S' or 'T', excluding the categorical features.
cols = [col for col in train.columns
        if col.startswith(('S', 'T')) and col not in cat_cols[:-1]]
# astype returns a new frame, so train is left untouched and no values are
# assigned into a slice of it.
plot_df = train[cols].astype(float)

# Size the grid from the number of columns; a fixed 2x2 grid raises IndexError
# as soon as more than four columns are selected.
n_grid_cols = 2
n_grid_rows = max(1, -(-len(cols) // n_grid_cols))  # ceiling division
fig, ax = plt.subplots(n_grid_rows, n_grid_cols,
                       figsize=(15, 5 * n_grid_rows), squeeze=False)

# Plot the distribution of each selected column in its own panel.
for i, col in enumerate(plot_df.columns):
    panel = ax[i // n_grid_cols, i % n_grid_cols]
    sns.distplot(plot_df[col], ax=panel)
    panel.set_title(col)

# Hide any panel left over when the column count does not fill the grid.
for unused in ax.ravel()[len(cols):]:
    unused.set_visible(False)
plt.show()
