In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from category_encoders.hashing import HashingEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

import cdutils.database.connect

In [None]:
"""
Using the lookup query to inspect the DB tables
"""

import cdutils.database.connect # type: ignore
from sqlalchemy import text # type: ignore

def fetch_data():
    """
    Pull the raw account-statistic history and the statistic-type lookup table.

    Both statements are submitted for engine 1 through
    cdutils.database.connect.retrieve_data, and that call's result is returned
    unchanged. The notebook reads it by the keys 'acctstatistichist' and 'doc'.
    """
    # Engine 1
    history_sql = text("""
    SELECT 
        *
    FROM 
        OSIBANK.ACCTSTATISTICHIST
    """)

    statistic_type_sql = text("""
    SELECT 
        *
    FROM 
        OSIBANK.STATISTICTYP
    """)

    return cdutils.database.connect.retrieve_data([
        {'key': 'acctstatistichist', 'sql': history_sql, 'engine': 1},
        {'key': 'doc', 'sql': statistic_type_sql, 'engine': 1},
    ])


In [None]:
"""
Using the lookup query to inspect the DB tables
"""

import cdutils.database.connect # type: ignore
from sqlalchemy import text # type: ignore

def fetch_data_historical_acctcommon(acctnbr):
    """
    Fetch the warehouse history rows (ACCTNBR, PRODUCT, EFFDATE) for one account.

    Parameters
    ----------
    acctnbr : int or int-like
        Account number to look up. It is coerced with int() so the database
        driver receives a plain Python integer; a non-numeric value raises
        before any SQL is sent.

    Returns
    -------
    The result of cdutils.database.connect.retrieve_data for the query list;
    the notebook reads the rows from its 'wh_acctcommon' entry.

    Notes
    -----
    The query has no ORDER BY, so the row order is whatever the database
    returns.
    """
    # Engine 2
    # The account number is sent as a bound parameter instead of being
    # formatted into the SQL text, so the value can never alter the statement.
    acctcommon_hist = text("""
    SELECT 
        a.ACCTNBR,
        a.PRODUCT,
        a.EFFDATE
    FROM 
        COCCDM.WH_ACCTCOMMON a
    WHERE
        a.ACCTNBR = :acctnbr
    """).bindparams(acctnbr=int(acctnbr))

    queries = [
        {'key': 'wh_acctcommon', 'sql': acctcommon_hist, 'engine': 2},
    ]

    return cdutils.database.connect.retrieve_data(queries)


In [None]:
test = fetch_data_historical_acctcommon(150894486)

In [None]:
test = test['wh_acctcommon']
test

In [None]:
dataWithAcctStats

In [None]:
# Replace the product of every 'Repossessed Collateral' row with the product
# found in the first warehouse history row for that account.
# The affected rows are selected up front so the frame is not written to while
# iterrows() is still walking it, and tqdm gets a known total.
repo_index = dataWithAcctStats.index[dataWithAcctStats['product'] == 'Repossessed Collateral']
for index in tqdm(repo_index):
    acct = dataWithAcctStats.at[index, 'acctnbr']
    acct_df = fetch_data_historical_acctcommon(acct)['wh_acctcommon']
    if acct_df.empty:
        # No history rows came back: keep the current product rather than
        # failing on iloc[0].
        continue
    dataWithAcctStats.at[index, 'product'] = acct_df.iloc[0]['product']


In [None]:
# Keep only the rows whose product is not 'Repossessed Collateral'
data_without_repo = dataWithAcctStats.loc[dataWithAcctStats['product'].ne("Repossessed Collateral")]

In [None]:
data_without_repo

In [None]:
data = fetch_data()

In [None]:
acctstatistichist = data['acctstatistichist']
doc = data['doc']

In [None]:
# Reshape to one row per account and one column per statistic type code,
# summing the counts and filling absent combinations with 0.
acctstatistichist = (
    acctstatistichist
    .pivot_table(
        index='acctnbr',
        columns='statistictypcd',
        values='statisticcount',
        aggfunc='sum',
        fill_value=0,
    )
    .reset_index()
)

In [None]:
acctstatistichist.columns

In [None]:
data = pd.read_csv("data.csv")

In [None]:
import cd

In [None]:
def accuracy_when_y_is_1(y, y_pred):
    """
    Share of actual positives (y == 1) that were predicted correctly.

    Returns the value as a percentage string, e.g. "50.0%".
    """
    is_positive = (y == 1)
    correct_positives = ((y == y_pred) & is_positive).sum()
    positive_count = len(y[is_positive])
    return str(correct_positives / positive_count * 100) + "%"

def getData():
    """
    Build model inputs from data.csv joined to the Delinquency_013125.xlsx report.

    Returns
    -------
    X_encoded : pandas.DataFrame
        noteopenamt, noteintrate, contract_to_maturity_days, origintrate,
        riskratingcd (non-digit characters stripped), NDPD, plus one 0/1
        integer column per `ratetypcd` value.
    y : pandas.Series
        1 where `cobal` > 0, otherwise 0.
    data_with_delinquency : pandas.DataFrame
        The merged frame, with '' and missing values replaced by 0.
    """
    delinquency = pd.read_excel("Delinquency_013125.xlsx")
    data = pd.read_csv("data.csv")
    data = data.dropna(subset=['Category'])

    data['contractdate'] = pd.to_datetime(data['contractdate'])
    data['datemat'] = pd.to_datetime(data['datemat'])
    data['contract_to_maturity_days'] = (data['datemat'] - data['contractdate']).dt.days

    # Row 3 of the sheet (as read) carries the real column headers.
    delinquency.columns = delinquency.iloc[3]
    delinquency = delinquency.rename(columns={'Account Number': 'acctnbr'})
    delinquency = delinquency.dropna(subset=['Customer Name'])

    # Left join keeps every loan; loans absent from the report get 0 everywhere.
    data_with_delinquency = (
        pd.merge(data, delinquency, on='acctnbr', how='left')
        .replace('', np.nan)
        .fillna(0)
    )

    X = data_with_delinquency[[
        'noteopenamt',
        'ratetypcd',
        'noteintrate',
        'contract_to_maturity_days',
        'origintrate',
        'riskratingcd',
        'NDPD',
    ]].copy()

    # Keep only the digits of the risk rating; empty/non-string results become 0.
    X['riskratingcd'] = X['riskratingcd'].str.replace(r'\D', '', regex=True)
    X = X.replace('', np.nan).fillna(0)

    # converting cobal to binomial distribution
    y = (data_with_delinquency['cobal'].copy() > 0).astype(int)

    # dtype=int casts every dummy column, so this no longer depends on both
    # 'FIX' and 'VAR' being present and leaves no stray non-integer dummy.
    X_encoded = pd.get_dummies(X, columns=['ratetypcd'], prefix='ratetypcd', dtype=int)

    return X_encoded, y, data_with_delinquency


def getDataWithAcctStats():
    """
    Build model inputs from data.csv joined to the account-statistic counts.

    Relies on the notebook-level `acctstatistichist` frame, which must already
    have an `acctnbr` column and the statistic-count columns selected below.

    Rows whose product is 'Repossessed Collateral' get their product replaced
    by the product of the first history row returned by
    fetch_data_historical_acctcommon; rows still marked 'Repossessed
    Collateral' afterwards are dropped.

    Returns
    -------
    X_final : pandas.DataFrame
        Numeric features plus 0/1 integer dummy columns for `ratetypcd`,
        `product` and `loanofficer` (prefix 'LOANOFFICER'), in that order.
    y : pandas.Series
        1 where `cobal` > 0, otherwise 0.
    data_without_repo : pandas.DataFrame
        The merged frame after the repossessed rows were removed.
    """
    data = pd.read_csv("data.csv")
    data = data.dropna(subset=['Category'])

    data['contractdate'] = pd.to_datetime(data['contractdate'])
    data['datemat'] = pd.to_datetime(data['datemat'])
    data['contract_to_maturity_days'] = (data['datemat'] - data['contractdate']).dt.days

    # Left join keeps every loan; loans without statistics get 0 counts.
    data_with_acct_stats = (
        pd.merge(data, acctstatistichist, on='acctnbr', how='left')
        .replace('', np.nan)
        .fillna(0)
    )

    # Select the repossessed rows up front so the frame is not modified while
    # it is being iterated, and so tqdm knows the total.
    repo_index = data_with_acct_stats.index[
        data_with_acct_stats['product'] == 'Repossessed Collateral'
    ]
    for index in tqdm(repo_index):
        acct = data_with_acct_stats.at[index, 'acctnbr']
        acct_df = fetch_data_historical_acctcommon(acct)['wh_acctcommon']
        if acct_df.empty:
            # No history rows: leave the product as-is (the row is removed by
            # the filter below) instead of failing on iloc[0].
            continue
        data_with_acct_stats.at[index, 'product'] = acct_df.iloc[0]['product']

    data_without_repo = data_with_acct_stats[
        data_with_acct_stats['product'] != "Repossessed Collateral"
    ]

    X = data_without_repo[[
        'loanofficer',
        'product',
        'ratetypcd',
        'noteintrate',
        'contract_to_maturity_days',
        'origintrate',
        'riskratingcd',
        'DOD', 'EFEE', 'EXT', 'KITE', 'MCHG', 'NSF', 'PD', 'PD12',
        'PD15', 'PD18', 'PD30', 'PD60', 'PD90', 'RGD3', 'RGD6', 'RNEW', 'SKIP',
        'UCF']].copy()

    # Keep only the digits of the risk rating; empty/non-string results become 0.
    X['riskratingcd'] = X['riskratingcd'].str.replace(r'\D', '', regex=True)
    X = X.replace('', np.nan).fillna(0)

    # converting cobal to binomial distribution
    y = (data_without_repo['cobal'].copy() > 0).astype(int)

    print(X.columns)

    # One encoding pass; dummy blocks are appended in the order listed here.
    # dtype=int casts every dummy column, so nothing depends on particular
    # category values (e.g. 'FIX'/'VAR') being present.
    X_final = pd.get_dummies(
        X,
        columns=['ratetypcd', 'product', 'loanofficer'],
        prefix={'ratetypcd': 'ratetypcd', 'product': 'product', 'loanofficer': 'LOANOFFICER'},
        dtype=int,
    )

    return X_final, y, data_without_repo

In [None]:
X, y, data_with_delinquency = getData()

In [None]:
X

In [None]:
X, y, data_with_delinquency = getData()

features_to_be_scaled = ['noteopenamt', 'noteintrate', 'origintrate',
                         'contract_to_maturity_days']

# Fixed seed so the split, and every metric computed from it, is the same on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies before writing scaled columns back; assigning into the
# frames returned by train_test_split raises SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train[features_to_be_scaled] = scaler.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = scaler.transform(X_test[features_to_be_scaled])

In [None]:
data_with_delinquency

In [None]:
model = LogisticRegression(class_weight='balanced')

model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Probability of the positive class, used for ROC AUC
y_probs = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
# Label and value are separate print arguments; the extra parentheses used to
# print a tuple repr such as ('ROC AUC:', 0.87).
print("ROC AUC:", roc_auc_score(y_test, y_probs))

In [None]:
accuracy_when_y_is_1(y_test, y_pred)

In [None]:
coefficients = model.coef_[0]
feature_names = X.columns
for name, coef in zip(feature_names, coefficients):
    print(f"{name} {coef:.12f}")

In [None]:
X_train

### Experiment 2: drop NDPD and use account-statistic history instead

In [None]:
X, y, dataWithAcctStats = getDataWithAcctStats()

In [None]:
dataWithAcctStats

In [None]:
dataWithAcctStats.columns

In [None]:
identifiers = dataWithAcctStats[['ownersortname', 'product']]

In [None]:
features_to_be_scaled = ['noteintrate', 'origintrate', 'riskratingcd',
                         'contract_to_maturity_days', 'DOD', 'EFEE', 'EXT', 'KITE', 'MCHG', 'NSF', 'PD', 'PD12',
                         'PD15', 'PD18', 'PD30', 'PD60', 'PD90', 'RGD3', 'RGD6', 'RNEW', 'SKIP',
                         'UCF']

# Fixed seed so the split, and every metric computed from it, is the same on re-run.
X_train, X_test, y_train, y_test, identifiers_train, identifiers_test = train_test_split(
    X, y, identifiers, test_size=0.2, random_state=42
)

# Take explicit copies before writing scaled columns back; assigning into the
# frames returned by train_test_split raises SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Unscaled snapshots of the test rows, kept for the error analysis further down.
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train[features_to_be_scaled] = scaler.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = scaler.transform(X_test[features_to_be_scaled])

In [None]:
model = LogisticRegression(class_weight='balanced', max_iter=100000)

model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
accuracy_when_y_is_1(y_test, y_pred)

In [None]:
len(y_test)

In [None]:
coefficients = model.coef_[0]
feature_names = X.columns
for name, coef in zip(feature_names, coefficients):
    print(f"{name} {coef:.12f}")

In [None]:
identifiers_test

In [None]:
# Add the actual and predicted labels to the unscaled test rows, then place
# the identifier columns in front.
X_test_copy = pd.concat(
    [identifiers_test, X_test_copy.assign(y=y_test, y_pred=y_pred)],
    axis=1,
)

In [None]:
X_test_copy

In [None]:
# Split the labelled test rows into the four confusion-matrix quadrants
actual_0 = X_test_copy['y'] == 0
actual_1 = X_test_copy['y'] == 1
predicted_0 = X_test_copy['y_pred'] == 0
predicted_1 = X_test_copy['y_pred'] == 1

false_positives = X_test_copy[actual_0 & predicted_1]  # danger zone
false_negatives = X_test_copy[actual_1 & predicted_0]
true_positives = X_test_copy[actual_1 & predicted_1]
true_negatives = X_test_copy[actual_0 & predicted_0]

In [None]:
false_positives

In [None]:
false_positives.describe()

In [None]:
false_positives.to_excel("danger_zone.xlsx", index=False)

In [None]:
# Write the summary statistics of each quadrant to its own workbook
quadrant_workbooks = {
    "false_positives.xlsx": false_positives,
    "false_negatives.xlsx": false_negatives,
    "true_positives.xlsx": true_positives,
    "true_negatives.xlsx": true_negatives,
}
for workbook_name, quadrant in quadrant_workbooks.items():
    quadrant.describe().to_excel(workbook_name)


In [None]:
X_test_copy

In [None]:
acctstatistichist

Next: try dropping the NSF feature and one-hot encoding the product column.

In [None]:
X, y, dataWithAcctStats = getDataWithAcctStats()

In [None]:
X

In [None]:
# errors="ignore" keeps this cell re-runnable once NSF has already been dropped.
X = X.drop(columns="NSF", errors="ignore")

# Cast the boolean one-hot columns to int by dtype. The previous positional
# slice (iloc[:, 25:]) started partway into the dummy columns and depended on
# how many categories each encoded feature happened to have.
bool_cols = X.select_dtypes(include="bool").columns
X = X.astype({col: int for col in bool_cols})

identifiers = dataWithAcctStats[['ownersortname']]

features_to_be_scaled = ['noteintrate', 'origintrate', 'riskratingcd',
                         'contract_to_maturity_days', 'DOD', 'EFEE', 'EXT', 'KITE', 'MCHG', 'PD', 'PD12',
                         'PD15', 'PD18', 'PD30', 'PD60', 'PD90', 'RGD3', 'RGD6', 'RNEW', 'SKIP',
                         'UCF']

# Fixed seed so the split, and every metric computed from it, is the same on re-run.
X_train, X_test, y_train, y_test, identifiers_train, identifiers_test = train_test_split(
    X, y, identifiers, test_size=0.2, random_state=42
)

# Take explicit copies before writing scaled columns back; assigning into the
# frames returned by train_test_split raises SettingWithCopyWarning.
X_train, X_test = X_train.copy(), X_test.copy()

# Unscaled snapshots of the test rows, kept for the error analysis further down.
X_test_copy = X_test.copy()
y_test_copy = y_test.copy()

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = StandardScaler()
X_train[features_to_be_scaled] = scaler.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = scaler.transform(X_test[features_to_be_scaled])

# encoder = HashingEncoder(cols=['product'], n_components=10)
# encoder.fit(X_train)
# X_train, X_test = encoder.transform(X_train), encoder.transform(X_test)


In [None]:
X_train

In [None]:
model = LogisticRegression(class_weight='balanced', max_iter=100000)

model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print("Accuracy when y is 1: " + accuracy_when_y_is_1(y_test, y_pred))

coefficients = model.coef_[0]
feature_names = X.columns
for name, coef in zip(feature_names, coefficients):
    print(f"{name} {coef:.12f}")

In [None]:
# Sort coefficients by absolute value, largest magnitude first.
# np.abs is required: sorting the signed values would push strongly negative
# coefficients to the bottom of a list labelled "by magnitude".
sorted_indices = np.argsort(np.abs(coefficients))[::-1]

# Print sorted coefficients and feature names
print("Sorted Coefficients (by magnitude):")
for index in sorted_indices:
    print(f"{feature_names[index]}: {coefficients[index]}")

In [None]:
# Add the actual and predicted labels to the unscaled test rows, then place
# the identifier column in front.
X_test_copy = pd.concat(
    [identifiers_test, X_test_copy.assign(y=y_test, y_pred=y_pred)],
    axis=1,
)

In [None]:
# Count repossessed-collateral rows in X and how many of them have y == 1.
# X has no 'y' column, so the label comes from the aligned `y` Series; each
# comparison is parenthesised because & binds tighter than ==.
repo_col = 'product_Repossessed Collateral'
if repo_col in X.columns:
    is_repo = X[repo_col] == 1
else:
    # getDataWithAcctStats removes rows still marked 'Repossessed Collateral'
    # before encoding, so this dummy column may not exist at all.
    is_repo = pd.Series(False, index=X.index)

repos = X[is_repo]
repos_with_cobal = X[is_repo & (y == 1)]
print(len(repos))
print(len(repos_with_cobal))
print(len(X))

### Random Forest classifier

In [None]:
X, y, dataWithAcctStats = getDataWithAcctStats()
# Rebuild identifiers from the frame just loaded rather than reusing the
# variable left over from an earlier cell, so the three inputs share an index.
identifiers = dataWithAcctStats[['ownersortname']]
# Fixed seed so the split is the same on re-run.
X_train, X_test, y_train, y_test, identifiers_train, identifiers_test = train_test_split(
    X, y, identifiers, test_size=0.2, random_state=42
)

In [None]:
rfc = RandomForestClassifier(class_weight='balanced')
rfc.fit(X_train, y_train)

In [None]:
y_pred = rfc.predict(X_test)
# Probability of the positive class, used for ROC AUC
y_probs = rfc.predict_proba(X_test)[:, 1]

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
# Label and value are separate print arguments; the extra parentheses used to
# print a tuple repr.
print("ROC AUC:", roc_auc_score(y_test, y_probs))

In [None]:
accuracy_when_y_is_1(y_test, y_pred)

In [None]:
# getData() returns three values (features, target, merged frame); unpacking
# only two raised "too many values to unpack".
X, y, data_with_delinquency = getData()
# Fixed seeds so the split and the forest are the same on re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rfc = RandomForestClassifier(class_weight='balanced', random_state=42)
rfc.fit(X_train, y_train)

In [None]:
y_pred = rfc.predict(X_test)
# Probability of the positive class, used for ROC AUC
y_probs = rfc.predict_proba(X_test)[:, 1]

print("Accuracy score: " + str(accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))
# Label and value are separate print arguments; the extra parentheses used to
# print a tuple repr.
print("ROC AUC:", roc_auc_score(y_test, y_probs))

In [None]:
accuracy_when_y_is_1(y_test, y_pred)

# false positives are in the danger zone

In [None]:
# Compare predictions with the test labels they were made for; the full `y`
# has a different length from y_pred. Uses the helper defined earlier in the
# notebook so the formula lives in one place.
accuracy_when_y_is_1(y_test, y_pred)

In [None]:
# true positives
((y_test == y_pred) & (y_test == 1)).sum().item()

In [None]:
# false negatives: actual label is 1 but the prediction differs
((y_test != y_pred) & (y_test == 1)).sum().item()

In [None]:
len(y_test[y_test == 1])

In [None]:
len(y_pred[y_pred == 1])