In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
data = pd.read_csv("data.csv")

In [None]:
# Display the freshly loaded frame as the cell output.
data

In [None]:
# Keep only the rows whose 'Category' value is present.
has_category = data['Category'].notna()
data = data[has_category]

In [None]:
# Display the frame again after rows without a 'Category' were removed.
data

### Balancing the dataset

In [None]:
# Split the rows on the sign of 'cobal'. Rows whose 'cobal' is missing satisfy
# neither comparison and therefore end up in neither group.
positive = data.loc[data['cobal'].gt(0)]
non_positive = data.loc[data['cobal'].le(0)]

# Down-sample the non-positive rows (fixed seed) to the size of the positive
# group, then stack both groups under a fresh 0..n-1 index.
n_positive = len(positive)
non_positive_sampled = non_positive.sample(n=n_positive, random_state=42)
balanced_data = pd.concat((positive, non_positive_sampled), ignore_index=True)

In [None]:
# Model inputs taken from the balanced frame; the two date columns
# ('contractdate', 'datemat') are deliberately left out.
feature_columns = [
    'noteopenamt',
    'ratetypcd',
    'noteintrate',
    'bookbalance',
    'notebal',
    'origintrate',
    'riskratingcd',
    'availbalamt',
    'Net Balance',
    'Net Available',
    'Net Collateral Reserve',
    'Total Exposure',
    'orig_ttl_loan_amt',
]
X = balanced_data[feature_columns].copy()

# Strip every non-digit character from the risk rating code.
X['riskratingcd'] = X['riskratingcd'].str.replace(r'\D', '', regex=True)
# Empty strings become missing values, and every missing value becomes 0.
X = X.replace('', np.nan)
X = X.fillna(0)

# Binary target: 1 when 'cobal' is strictly positive, otherwise 0.
y = (balanced_data['cobal'].copy() > 0).astype(int)

# TODO: append most recent delinquency file

In [None]:
# Display the cleaned feature frame.
X

### One-hot encoding `ratetypcd`

In [None]:
# One-hot encode the rate type. Requesting integer dummies directly replaces the
# hard-coded casts of 'ratetypcd_FIX' / 'ratetypcd_VAR', which raised KeyError when
# either category was absent and left any other category's dummy as bool.
X_encoded = pd.get_dummies(X, columns=['ratetypcd'], prefix='ratetypcd', dtype=int)

In [None]:
# Display the feature frame after one-hot encoding.
X_encoded

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns that get standardised; the one-hot rate-type columns are left untouched.
features_to_be_scaled = ['noteopenamt', 'noteintrate', 'bookbalance',
                         'notebal', 'origintrate', 'riskratingcd',
                         'availbalamt', 'Net Balance', 'Net Available',
                         'Net Collateral Reserve', 'Total Exposure', 'orig_ttl_loan_amt']

# Seeded, stratified 80/20 split: reruns give the same partition and both
# partitions keep the class ratio of `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)
# Explicit copies, so the column assignments below never write into (or warn
# about) a slice of X_encoded.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train[features_to_be_scaled] = scaler.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = scaler.transform(X_test[features_to_be_scaled])



In [None]:
# Positive-class entries of the held-out target, shown for inspection.
is_positive_test = y_test == 1
ones_in_y = y_test[is_positive_test]
ones_in_y

In [None]:
from sklearn.linear_model import LogisticRegression

# L1-penalised logistic regression. The 'saga' solver shuffles the data, so
# the seed is pinned to make repeated fits reproducible.
model = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1.0,
    max_iter=1000,
    random_state=42
)

In [None]:
# Fit on the scaled training split; the fitted estimator is shown as the cell output.
model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split.
y_pred = model.predict(X_test)
# NOTE(review): a classification_report call was left commented out here. It compared
# `y_pred` against `y` (the full balanced target); it would need `y_test` instead.

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

In [None]:
# Share of the actual positives in the test split that were predicted
# correctly, rendered as a percentage string.
actual_positive = y_test == 1
correct_positive = (y_test == y_pred) & actual_positive
accuracy_when_y_is_1 = str(correct_positive.sum() / len(y_test[actual_positive]) * 100) + "%"
accuracy_when_y_is_1

### Testing model on entire unbalanced dataset (it has already been trained on about 600 of those rows)

In [None]:
# Same feature set as the balanced experiment, taken from every row of `data`;
# the two date columns ('contractdate', 'datemat') stay excluded.
total_feature_columns = [
    'noteopenamt',
    'ratetypcd',
    'noteintrate',
    'bookbalance',
    'notebal',
    'origintrate',
    'riskratingcd',
    'availbalamt',
    'Net Balance',
    'Net Available',
    'Net Collateral Reserve',
    'Total Exposure',
    'orig_ttl_loan_amt',
]
X_total = data[total_feature_columns].copy()

# Strip every non-digit character from the risk rating code.
X_total['riskratingcd'] = X_total['riskratingcd'].str.replace(r'\D', '', regex=True)
# Empty strings become missing values, and every missing value becomes 0.
X_total = X_total.replace('', np.nan)
X_total = X_total.fillna(0)

# Binary target: 1 when 'cobal' is strictly positive, otherwise 0.
y_total = (data['cobal'].copy() > 0).astype(int)

# TODO: append most recent delinquency file

In [None]:
# One-hot encode the rate type with integer dummies (no hard-coded FIX/VAR casts).
X_encoded = pd.get_dummies(X_total, columns=['ratetypcd'], prefix='ratetypcd', dtype=int)
# Align to the columns the model was trained on: a category missing from the full
# data gets an all-zero column, one unseen during training is dropped, and the
# column order matches what `model` expects.
X_encoded = X_encoded.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Columns that get standardised; the one-hot rate-type columns are left untouched.
features_to_be_scaled = ['noteopenamt', 'noteintrate', 'bookbalance',
                         'notebal', 'origintrate', 'riskratingcd',
                         'availbalamt', 'Net Balance', 'Net Available',
                         'Net Collateral Reserve', 'Total Exposure', 'orig_ttl_loan_amt']

# Reuse the scaler that was fitted on the training split. Fitting a fresh
# scaler on the full data would standardise with different means/variances
# than the ones the model's coefficients were learned under.
X_encoded[features_to_be_scaled] = scaler.transform(X_encoded[features_to_be_scaled])

In [None]:
# Predict on the full (unbalanced) feature frame and report plain accuracy.
y_pred_total = model.predict(X_encoded)
accuracy_score(y_total, y_pred_total)

In [None]:
# Share of the actual positives in the full dataset that were predicted
# correctly, rendered as a percentage string.
actual_positive_total = y_total == 1
correct_positive_total = (y_total == y_pred_total) & actual_positive_total
accuracy_when_y_is_1 = str(correct_positive_total.sum() / len(y_total[actual_positive_total]) * 100) + "%"
accuracy_when_y_is_1

In [None]:
# Attach the true labels and the model's predictions to `data` as new columns
# (this modifies `data` in place).
data['y'] = y_total
data['y_pred'] = y_pred_total

In [None]:
# Rows labelled 0 that the model nevertheless predicted as 1.
danger_zone = data.loc[data['y'].eq(0) & data['y_pred'].eq(1)]

In [None]:
# Display the rows with true label 0 and predicted label 1.
danger_zone

In [None]:
# Print one "<feature> <coefficient>" line per fitted weight of `model`,
# pairing weights with the columns of X_encoded by position.
coefficients = model.coef_[0]
feature_names = X_encoded.columns
for feature, weight in zip(feature_names, coefficients):
    print("{} {:.12f}".format(feature, weight))

Judging by the coefficients printed above, `Net Balance` and `Total Exposure` have a large impact on negative results (no cobal).

### Trying without the `bookbalance`, `Net Balance`, `notebal`, `Net Available`, `Net Collateral Reserve`, `Total Exposure`, and `orig_ttl_loan_amt` columns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Experiment 2: rebuild everything from the raw file with a reduced feature set.
data = pd.read_csv("data.csv")
data = data.dropna(subset=['Category'])

# Balance the classes by down-sampling the non-positive rows (fixed seed).
positive = data[data['cobal'] > 0]
non_positive = data[data['cobal'] <= 0]

non_positive_sampled = non_positive.sample(n=len(positive), random_state=42)
balanced_data = pd.concat([positive, non_positive_sampled], ignore_index=True)

# Reduced inputs; the date columns ('contractdate', 'datemat') stay excluded.
X = balanced_data[[
    'noteopenamt',
    'ratetypcd',
    'noteintrate',
    'origintrate',
    'riskratingcd',
    'availbalamt',
    ]].copy()

# Keep only the digits of the risk rating, then turn blanks/missing values into 0
# (re-assignment instead of in-place mutation keeps the cell easy to re-run).
X['riskratingcd'] = X['riskratingcd'].str.replace(r'\D', '', regex=True)
X = X.replace('', np.nan).fillna(0)

# Binary target: 1 when 'cobal' is strictly positive, otherwise 0.
y = (balanced_data['cobal'] > 0).astype(int)

# TODO: append most recent delinquency file

# Integer dummies directly: no KeyError when FIX or VAR is absent, and no
# leftover boolean dummy columns for other categories.
X_encoded = pd.get_dummies(X, columns=['ratetypcd'], prefix='ratetypcd', dtype=int)

features_to_be_scaled = ['noteopenamt', 'noteintrate', 'origintrate',
                         'riskratingcd', 'availbalamt']

# Seeded, stratified 80/20 split so reruns produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)
# Explicit copies, so the scaled values are never written into a slice of X_encoded.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only; apply it unchanged to the test split.
scaler = StandardScaler()
X_train[features_to_be_scaled] = scaler.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = scaler.transform(X_test[features_to_be_scaled])



In [None]:
# Same L1 / saga configuration as the first model, trained on the reduced
# feature set. The seed is pinned because 'saga' shuffles the data.
model2 = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1.0,
    max_iter=1000,
    random_state=42
)

model2.fit(X_train, y_train)

In [None]:
# Predict the held-out split with model2 and report plain accuracy.
y_pred = model2.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
# Share of the actual positives in the test split that model2 predicted
# correctly, rendered as a percentage string.
actual_positive = y_test == 1
correct_positive = (y_test == y_pred) & actual_positive
accuracy_when_y_is_1 = str(correct_positive.sum() / len(y_test[actual_positive]) * 100) + "%"
accuracy_when_y_is_1

In [None]:
# Print one "<feature> <coefficient>" line per fitted weight of model2.
coefficients = model2.coef_[0]
feature_names = X_encoded.columns
for feature, weight in zip(feature_names, coefficients):
    print("{} {:.12f}".format(feature, weight))

### Adding delinquency feature

In [None]:
# Load the delinquency workbook, and re-read the base dataset keeping only
# rows that have a 'Category'.
delinquency = pd.read_excel("Delinquency_013125.xlsx")
data = pd.read_csv("data.csv")
data = data.dropna(subset=['Category'])

In [None]:
# The real column labels sit in row 3 of the sheet as loaded; promote them.
header_row = 3
delinquency.columns = delinquency.iloc[header_row]
delinquency.columns.name = None  # otherwise the columns index is labelled "3"
# Drop the label row and everything above it. The later dropna cannot remove
# the label row, because its 'Customer Name' cell is the label text itself.
delinquency = delinquency.iloc[header_row + 1:]
delinquency = delinquency.rename(columns={'Account Number': 'acctnbr'})
# Discard the remaining rows that carry no customer name.
delinquency = delinquency.dropna(subset=['Customer Name'])
delinquency

In [None]:
# Left join on 'acctnbr': every row of `data` is kept, and the delinquency
# columns are missing wherever no account number matches.
data_with_delinquency = pd.merge(data, delinquency, on='acctnbr', how='left')

In [None]:
# Display the merged frame.
data_with_delinquency

In [None]:
# Treat empty strings as missing, then replace every missing value with 0.
data_with_delinquency = data_with_delinquency.replace('', np.nan).fillna(0)

In [None]:
# Display the merged frame after missing values were filled with 0.
data_with_delinquency

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Experiment 3: balance the merged frame and add the 'NDPD' delinquency column.
positive = data_with_delinquency[data_with_delinquency['cobal'] > 0]
non_positive = data_with_delinquency[data_with_delinquency['cobal'] <= 0]

non_positive_sampled = non_positive.sample(n=len(positive), random_state=42)
balanced_data = pd.concat([positive, non_positive_sampled], ignore_index=True)

# 'contractdate', 'datemat' and 'availbalamt' are excluded from this feature set.
X = balanced_data[[
    'noteopenamt',
    'ratetypcd',
    'noteintrate',
    'origintrate',
    'riskratingcd',
    'NDPD'
    ]].copy()

# Keep only the digits of the risk rating, then turn blanks/missing values into 0
# (re-assignment instead of in-place mutation keeps the cell easy to re-run).
X['riskratingcd'] = X['riskratingcd'].str.replace(r'\D', '', regex=True)
X = X.replace('', np.nan).fillna(0)

# Binary target: 1 when 'cobal' is strictly positive, otherwise 0.
y = (balanced_data['cobal'] > 0).astype(int)

# TODO: append most recent delinquency file

# Integer dummies directly: no KeyError when FIX or VAR is absent, and no
# leftover boolean dummy columns for other categories.
X_encoded = pd.get_dummies(X, columns=['ratetypcd'], prefix='ratetypcd', dtype=int)

features_to_be_scaled = ['noteopenamt', 'noteintrate', 'origintrate',
                         'riskratingcd', 'NDPD']

# Seeded, stratified 80/20 split so reruns produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)
# Explicit copies, so the scaled values are never written into a slice of X_encoded.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only; apply it unchanged to the test split.
scaler = StandardScaler()
X_train[features_to_be_scaled] = scaler.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = scaler.transform(X_test[features_to_be_scaled])

In [None]:
# L1 / saga logistic regression on the delinquency feature set. This variant
# allows up to 100000 iterations. The seed is pinned because 'saga' shuffles
# the data.
model3 = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1.0,
    max_iter=100000,
    random_state=42
)

model3.fit(X_train, y_train)

In [None]:
# Predict the held-out split with model3 and report plain accuracy.
y_pred = model3.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
# Share of the actual positives in the test split that model3 predicted
# correctly, rendered as a percentage string.
actual_positive = y_test == 1
correct_positive = (y_test == y_pred) & actual_positive
accuracy_when_y_is_1 = str(correct_positive.sum() / len(y_test[actual_positive]) * 100) + "%"
accuracy_when_y_is_1

In [None]:
# Print one "<feature> <coefficient>" line per fitted weight of model3.
coefficients = model3.coef_[0]
feature_names = X_encoded.columns
for feature, weight in zip(feature_names, coefficients):
    print("{} {:.12f}".format(feature, weight))

### Adding contract_to_maturity_days feature

In [None]:
from datetime import datetime

# Experiment 4: reload both sources and derive the loan term in days.
delinquency = pd.read_excel("Delinquency_013125.xlsx")
data = pd.read_csv("data.csv")
data = data.dropna(subset=['Category'])

data['contractdate'] = pd.to_datetime(data['contractdate'])
data['datemat'] = pd.to_datetime(data['datemat'])

# Whole days between the contract date and the maturity date.
data['contract_to_maturity_days'] = (data['datemat'] - data['contractdate']).dt.days

# The real column labels sit in row 3 of the sheet as loaded; promote them.
header_row = 3
delinquency.columns = delinquency.iloc[header_row]
delinquency.columns.name = None  # otherwise the columns index is labelled "3"
# Drop the label row and everything above it. The later dropna cannot remove
# the label row, because its 'Customer Name' cell is the label text itself.
delinquency = delinquency.iloc[header_row + 1:]
delinquency = delinquency.rename(columns={'Account Number': 'acctnbr'})
delinquency = delinquency.dropna(subset=['Customer Name'])

# Left join keeps every row of `data`; blanks and missing values then become 0.
data_with_delinquency = pd.merge(data, delinquency, on='acctnbr', how='left')
data_with_delinquency = data_with_delinquency.replace('', np.nan).fillna(0)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Balance the merged frame by down-sampling the non-positive rows (fixed seed).
positive = data_with_delinquency[data_with_delinquency['cobal'] > 0]
non_positive = data_with_delinquency[data_with_delinquency['cobal'] <= 0]

non_positive_sampled = non_positive.sample(n=len(positive), random_state=42)
balanced_data = pd.concat([positive, non_positive_sampled], ignore_index=True)

# The raw dates are replaced by the derived 'contract_to_maturity_days';
# 'availbalamt' is excluded from this feature set.
X = balanced_data[[
    'noteopenamt',
    'ratetypcd',
    'noteintrate',
    'contract_to_maturity_days',
    'origintrate',
    'riskratingcd',
    'NDPD'
    ]].copy()

# Keep only the digits of the risk rating, then turn blanks/missing values into 0
# (re-assignment instead of in-place mutation keeps the cell easy to re-run).
X['riskratingcd'] = X['riskratingcd'].str.replace(r'\D', '', regex=True)
X = X.replace('', np.nan).fillna(0)

# Binary target: 1 when 'cobal' is strictly positive, otherwise 0.
y = (balanced_data['cobal'] > 0).astype(int)

# TODO: append most recent delinquency file

# Integer dummies directly: no KeyError when FIX or VAR is absent, and no
# leftover boolean dummy columns for other categories.
X_encoded = pd.get_dummies(X, columns=['ratetypcd'], prefix='ratetypcd', dtype=int)

features_to_be_scaled = ['noteopenamt', 'noteintrate', 'origintrate',
                         'riskratingcd', 'contract_to_maturity_days', 'NDPD']

# Seeded, stratified 80/20 split so reruns produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.2, random_state=42, stratify=y
)
# Explicit copies, so the scaled values are never written into a slice of X_encoded.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only; apply it unchanged to the test split.
scaler = StandardScaler()
X_train[features_to_be_scaled] = scaler.fit_transform(X_train[features_to_be_scaled])
X_test[features_to_be_scaled] = scaler.transform(X_test[features_to_be_scaled])

In [None]:
# L1 / saga logistic regression on the feature set that includes the loan
# term. The seed is pinned because 'saga' shuffles the data.
model4 = LogisticRegression(
    penalty='l1',
    solver='saga',
    C=1.0,
    max_iter=1000,
    random_state=42
)

model4.fit(X_train, y_train)

In [None]:
# Predict the held-out split with model4 and report plain accuracy.
y_pred = model4.predict(X_test)

from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
# Share of the actual positives in the test split that model4 predicted
# correctly, rendered as a percentage string.
actual_positive = y_test == 1
correct_positive = (y_test == y_pred) & actual_positive
accuracy_when_y_is_1 = str(correct_positive.sum() / len(y_test[actual_positive]) * 100) + "%"
accuracy_when_y_is_1

In [None]:
# Print one "<feature> <coefficient>" line per fitted weight of model4.
coefficients = model4.coef_[0]
feature_names = X_encoded.columns
for feature, weight in zip(feature_names, coefficients):
    print("{} {:.12f}".format(feature, weight))

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Same feature set as the model4 training data, taken from every merged row.
X_total = data_with_delinquency[[
    'noteopenamt',
    'ratetypcd',
    'noteintrate',
    'contract_to_maturity_days',
    'origintrate',
    'riskratingcd',
    'NDPD'
    ]].copy()

# Keep only the digits of the risk rating, then turn blanks/missing values into 0.
X_total['riskratingcd'] = X_total['riskratingcd'].str.replace(r'\D', '', regex=True)
X_total = X_total.replace('', np.nan).fillna(0)

# Binary target: 1 when 'cobal' is strictly positive, otherwise 0.
y_total = (data_with_delinquency['cobal'] > 0).astype(int)

# TODO: append most recent delinquency file

# Integer dummies, aligned to the columns model4 was trained on: a category
# missing here gets an all-zero column, one unseen in training is dropped, and
# the column order matches the training frame.
X_encoded = pd.get_dummies(X_total, columns=['ratetypcd'], prefix='ratetypcd', dtype=int)
X_encoded = X_encoded.reindex(columns=X_train.columns, fill_value=0)

features_to_be_scaled = ['noteopenamt', 'noteintrate', 'origintrate',
                         'riskratingcd', 'contract_to_maturity_days', 'NDPD']

# Reuse the scaler fitted on the training split. Fitting a fresh scaler on the
# full data would standardise with different means/variances than the ones
# model4's coefficients were learned under.
X_encoded[features_to_be_scaled] = scaler.transform(X_encoded[features_to_be_scaled])

In [None]:
# Predict on the full merged feature frame with model4 and report plain accuracy.
y_pred_total = model4.predict(X_encoded)
accuracy_score(y_total, y_pred_total)

In [None]:
# Probability of the positive class (second column of predict_proba) per row.
positive_class_column = 1
probabilities = model4.predict_proba(X_encoded)[:, positive_class_column]

# Flag as 1 every row whose probability reaches the 99th percentile; all other rows are 0.
threshold = np.percentile(probabilities, 99)
at_or_above_threshold = probabilities >= threshold
pred_top_1_percent = at_or_above_threshold.astype(int)

In [None]:
# Plain accuracy when only the rows at or above the 99th-percentile probability are flagged as 1.
accuracy_score(y_total, pred_top_1_percent)

In [None]:
# Share of the actual positives that fall inside the flagged top-percentile
# group, rendered as a percentage string.
actual_positive_total = y_total == 1
correct_positive_total = (y_total == pred_top_1_percent) & actual_positive_total
accuracy_when_y_is_1 = str(correct_positive_total.sum() / len(y_total[actual_positive_total]) * 100) + "%"
accuracy_when_y_is_1

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Pairwise interaction terms (no squares, no bias column) of the training features.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_train)

# Train a separate estimator with model4's hyper-parameters. Refitting model4
# itself would overwrite the model that earlier cells call predict() on with the
# original feature width, so re-running those cells would fail.
model4_poly = LogisticRegression(**model4.get_params())
model4_poly.fit(X_poly, y_train)

In [None]:
# NOTE(review): this only displays the bound `fit` method; nothing is called or trained here.
model4.fit

#### Manual implementation

In [None]:
# manual
from tqdm import tqdm

def q(x=None, X = None, w = None):
    """Logistic sigmoid of the linear score.

    Args:
        x: A single sample; used only when ``X`` is None, as ``x.T.dot(w)``.
        X: A design matrix; when given, the score is ``X.dot(w)``.
        w: Weight vector.

    Returns:
        ``1 / (1 + exp(-score))``, in whatever container ``dot`` returns.
    """
    score = x.T.dot(w) if X is None else X.dot(w)
    return 1 / (1 + np.exp(-score))

def dL(Φ, y, w):
    """Mean gradient of the logistic loss with respect to ``w``.

    ``Φ`` and ``y`` are converted to plain float arrays, and ``y`` is reshaped
    to the shape of the predictions. Without this, subtracting a pandas Series
    from an ``(m, 1)`` prediction aligns the Series index against the column
    labels and produces a mostly-NaN matrix of the wrong shape.

    Args:
        Φ: Design matrix with one row per sample (array or DataFrame).
        y: 0/1 targets with one entry per sample (array or Series).
        w: Weights, shaped ``(n_features,)`` or ``(n_features, 1)``.

    Returns:
        numpy array with the same shape as ``w``.
    """
    features = np.asarray(Φ, dtype=float)
    predicted = q(None, features, w)
    targets = np.asarray(y, dtype=float).reshape(predicted.shape)
    return features.T.dot(predicted - targets) / len(targets)

def gradient_descent(Φ, y, w, η, n_iters=5):
    """Run fixed-step batch gradient descent on the logistic loss.

    Args:
        Φ: Design matrix passed through to ``dL``.
        y: Targets passed through to ``dL``.
        w: Initial weights.
        η: Step size (learning rate).
        n_iters: Number of update steps. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        The weights after ``n_iters`` updates.
    """
    for _ in tqdm(range(n_iters)):
        w = w - η * dL(Φ, y, w)
    return w

# Start from an all-ones column vector with one weight per training column,
# then run gradient descent with step size 0.1.
n_features = X_train.shape[1]
w = np.ones((n_features, 1))
w = gradient_descent(X_train, y_train, w, 0.1)

def logistic_predict(X, w):
    """Return 0/1 predictions: 1 where the sigmoid output is at least 0.5."""
    sigmoid_output = q(None, X, w)
    is_positive = sigmoid_output >= 0.5
    return is_positive.astype(int)

In [None]:
# Compute the training-set predictions here, so this cell also works on a fresh
# kernel (previously it used `lr_train_pred` before any cell had assigned it).
lr_train_pred = logistic_predict(X_train, w)
lr_train_pred.shape

In [None]:
# Show `lr_train_pred[0]`; requires `lr_train_pred` to have been computed already.
lr_train_pred[0]

In [None]:
# Display the training targets.
y_train

In [None]:
# Predict the training rows with the manually fitted weights, then take the
# mean of the element-wise agreement between `lr_train_pred[0]` and the targets.
lr_train_pred = logistic_predict(X_train, w)
agreement = lr_train_pred[0] == y_train
lr_train_accuracy = np.mean(agreement)

In [None]:
# Display the training accuracy of the manual implementation.
lr_train_accuracy

In [None]:
# `w` is created as an (n_features, 1) column vector, so iterating it yields
# 1-element arrays, and formatting those with ":.12f" raises TypeError.
# Flatten it to scalars first.
coefficients = np.ravel(w)
# Name the weights after the frame they were fitted on (X_train), not the
# full-data encoding held in X_encoded.
feature_names = X_train.columns
for name, coef in zip(feature_names, coefficients):
    print(f"{name} {coef:.12f}")

In [None]:
# Display the training targets again for comparison with the predictions.
y_train

In [None]:
# Show `lr_train_pred[0]` as computed by the prediction cell above.
lr_train_pred[0]

In [None]:
# Keep only the entries selected by the mask `lr_train_pred[0] != 0`.
# This rebinds `lr_train_pred`, so running the cell again filters the
# already-filtered result.
nonzero_mask = lr_train_pred[0] != 0
lr_train_pred = lr_train_pred[nonzero_mask]

In [None]:
# Show `lr_train_pred[0]` after the non-zero filter was applied.
lr_train_pred[0]