In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# https://www.statsmodels.org/stable/index.html
import statsmodels.api as sm

In [None]:
import warnings
# Silence every warning for the rest of the session. This also hides any
# pandas / statsmodels warnings raised by later cells, so narrow or remove
# this filter when debugging.
warnings.filterwarnings("ignore")

In [None]:
from functools import partial

In [None]:
from dotenv import load_dotenv

from pathlib import Path

# Optional dotenv file, resolved relative to the notebook's working directory.
# NOTE(review): presumably holds the settings the model registry needs — confirm.
env_path = Path("../../.env-live")

if env_path.exists():
    # Announce success only after the file has actually been loaded, so the
    # message is never printed for a load that raised.
    load_dotenv(dotenv_path=env_path)
    print('envs Loaded')
else:
    # Make the skipped load visible instead of failing later with no hint.
    print(f'env file not found, nothing loaded: {env_path}')
from jrjModelRegistry.jrjModelRegistry import registerAJrjModel

In [None]:
def generalRegressionPredictor(self, transformedData):
    """Return the model's predictions for already-transformed input.

    Args:
        self: Any object exposing a ``predict`` method. In this notebook a
            fitted results object is bound to this slot with
            ``functools.partial``.
        transformedData: Input handed to ``self.predict`` unchanged.

    Returns:
        Whatever ``self.predict(transformedData)`` returns.
    """
    predictions = self.predict(transformedData)
    return predictions

In [None]:
# Load the mortgage data from the workbook located next to this notebook.
mortgageDf = pd.read_excel("./Mortgage.xlsx")
# Disabled alternative: read a hosted Mortgage.xlsx from Dropbox instead
# (presumably the same data — confirm before switching).
# mortgageDf = pd.read_excel("https://www.dropbox.com/scl/fi/32vgpt3jvtztu86avdnwg/Mortgage.xlsx?rlkey=qx1d46hzgn4h67zrcyajdyl3e&dl=1")
# Show the loaded frame as the cell output.
mortgageDf

In [None]:
# Hand-written (x1, x2) rows: used to try out predictions and passed as
# "sampleData" when the models are registered.
mortgageSampleData = dict(
    x1=[16.35, 20, 20, 40],
    x2=[49.94, 30, 15, 50],
)
mortgageSampleData

In [None]:
# Total number of cells in the frame (rows * columns).
mortgageDf.size

In [None]:
# Summary statistics using the pandas `describe` defaults.
mortgageDf.describe()

In [None]:
# (number of rows, number of columns)
mortgageDf.shape

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Plotting
# Create an 8x8 inch figure.
# NOTE(review): fig1 is not referenced by any later cell shown here, and the
# scatter plots are issued from separate cells — confirm whether this size is
# actually applied to them.
fig1 = plt.figure(
  figsize=(8, 8)
)

In [None]:
# Scatter of predictor x1 against the response y, drawn on the current axes.
ax = plt.gca()
ax.scatter(
  mortgageDf["x1"],
  mortgageDf["y"],
  color='blue',
  alpha=0.9,
  label='Data Points - scatter',
)
ax.set_xlabel('x1')
ax.set_ylabel('y')
ax.legend()
ax.grid(True)

plt.show()

In [None]:
# Scatter of predictor x2 against the response y, drawn on the current axes.
ax = plt.gca()
ax.scatter(
  mortgageDf["x2"],
  mortgageDf["y"],
  color='blue',
  alpha=0.9,
  label='Data Points - scatter',
)
ax.set_xlabel('x2')
ax.set_ylabel('y')
ax.legend()
ax.grid(True)

plt.show()

In [None]:
def mortgageRegModel1Transformer(dataForTransfer = None):
    """Build the design matrix (constant, x1, x2) used by the mortgage models.

    Args:
        dataForTransfer: A ``pandas.DataFrame``, or anything accepted by the
            ``pandas.DataFrame`` constructor (for example a dict of lists).
            It must provide the columns ``x1`` and ``x2``.

    Returns:
        The ``x1`` and ``x2`` columns with a constant column added by
        ``statsmodels.api.add_constant(..., has_constant='add')``.
    """
    # Imports are kept inside the function, as in the original — presumably so
    # the function stays self-contained when stored with a registered model
    # (TODO confirm).
    import pandas as pd
    import statsmodels.api as sm

    alreadyFrame = isinstance(dataForTransfer, pd.DataFrame)
    frame = dataForTransfer.copy() if alreadyFrame else pd.DataFrame(dataForTransfer)
    predictors = frame[["x1", "x2"]]
    return sm.add_constant(predictors, has_constant='add')

In [None]:
# Ordinary least squares of y on (constant, x1, x2); only the fitted results
# object is kept.
mortgageRegModel1 = sm.OLS(
  endog=mortgageDf["y"],
  exog=mortgageRegModel1Transformer(mortgageDf),
).fit()
print(mortgageRegModel1.summary())

In [None]:
# In-sample predictions of the OLS model for every row of mortgageDf,
# stored next to the inputs as column 'predict1'.
predict1 = mortgageRegModel1.predict(mortgageRegModel1Transformer(mortgageDf))
mortgageDf['predict1'] = predict1
mortgageDf

In [None]:
# OLS predictions for the hand-written sample rows.
mortgageRegModel1.predict(mortgageRegModel1Transformer(mortgageSampleData))

In [None]:
# Attach the input transformer and a predictor bound to the fitted results,
# then register the OLS model together with its descriptive metadata.
mortgageRegModel1.transformer = mortgageRegModel1Transformer
mortgageRegModel1.mainPredictor = partial(generalRegressionPredictor, mortgageRegModel1)
registerAJrjModel(
    mortgageRegModel1,
    {
        # Plain string: the original f-string had no placeholders.
        "modelName": "taoyu_ma__mortgageRegModel1",
        "version": "1.0.1",
        "params": mortgageRegModel1.params.to_dict(),
        "score": float(mortgageRegModel1.rsquared),
        # Correctly spelled key, matching the other registrations in this notebook.
        "modelLibrary": 'sm.OLS',
        # Misspelled key kept so anything already reading it keeps working.
        # NOTE(review): drop it once the registry schema is confirmed.
        "modelLibraray": 'sm.OLS',
        "libraryMetadata": {
            "pvalues": mortgageRegModel1.pvalues.to_dict(),
            "r_squared": float(mortgageRegModel1.rsquared),
            "adj_r_squared": float(mortgageRegModel1.rsquared_adj)
        },
        "sampleData": {
            "dataForTransfer": mortgageSampleData
        }
    }
)

In [None]:
# Logit model of y on (constant, x1, x2); only the fitted results object is kept.
mortgageLogRegModel2 = sm.Logit(
  endog=mortgageDf["y"],
  exog=mortgageRegModel1Transformer(mortgageDf),
).fit()
print(mortgageLogRegModel2.summary())

In [None]:
# In-sample predictions of the Logit model for every row of mortgageDf,
# stored as column 'predict2' (thresholded at 0.5 in a later cell).
predict2 = mortgageLogRegModel2.predict(mortgageRegModel1Transformer(mortgageDf))
mortgageDf['predict2'] = predict2
mortgageDf

In [None]:
# Logit predictions for the hand-written sample rows.
mortgageLogRegModel2.predict(mortgageRegModel1Transformer(mortgageSampleData))

In [None]:
# Attach the shared transformer and a predictor bound to the fitted Logit
# results, assemble the registry entry, then register the model.
mortgageLogRegModel2.transformer = mortgageRegModel1Transformer
mortgageLogRegModel2.mainPredictor = partial(generalRegressionPredictor, mortgageLogRegModel2)

logitRegistryEntry = {
    "modelName": "mortgageLogRegModel2",
    "version": "1.0.1",
    "params": mortgageLogRegModel2.params.to_dict(),
    # The pseudo R-squared serves as the headline score.
    "score": float(mortgageLogRegModel2.prsquared),
    "modelLibrary": "statsmodels.api.Logit",
    "libraryMetadata": {
        "pvalues": mortgageLogRegModel2.pvalues.to_dict(),
        "pseudo_r_squared": float(mortgageLogRegModel2.prsquared),
        "llf": float(mortgageLogRegModel2.llf),
        "aic": float(mortgageLogRegModel2.aic),
        "bic": float(mortgageLogRegModel2.bic),
    },
    "sampleData": {"dataForTransfer": mortgageSampleData},
}
registerAJrjModel(mortgageLogRegModel2, logitRegistryEntry)

# For Visualization

In [None]:
# OLS of y on x1 alone (plus a constant).
model3 = sm.OLS(
  endog=mortgageDf["y"],
  exog=sm.add_constant(mortgageDf[["x1"]]),
)
model3Fit = model3.fit()
print(model3Fit.summary())

In [None]:
# Logit of y on x1 alone (plus a constant).
model4 = sm.Logit(
  endog=mortgageDf["y"],
  exog=sm.add_constant(mortgageDf[["x1"]]),
)
model4Fit = model4.fit()
print(model4Fit.summary())

In [None]:
import math  # retained from the original cell; the code below no longer needs it

# Plotting range for x1. Named xMin / xMax so the builtins min() and max()
# are not shadowed for the rest of the session.
xMin = 0
xMax = mortgageDf["x1"].max() + 10
x = np.linspace(xMin - 5, xMax + 5, 500)

# Hand-typed coefficients of the two curves.
# NOTE(review): these look like rounded values from the model3Fit / model4Fit
# summaries — confirm, and consider reading them from `.params` instead.
OLS_INTERCEPT, OLS_SLOPE = 0.0141, 0.0227
LOGIT_INTERCEPT, LOGIT_SLOPE = -2.2077, 0.1043

# Straight line: intercept + slope * x.
lREq = OLS_INTERCEPT + x * OLS_SLOPE

# Logistic curve: e^z / (1 + e^z) with z = intercept + slope * x.
expLinear = np.exp(LOGIT_INTERCEPT + LOGIT_SLOPE * x)
logREq = expLinear / (1 + expLinear)


In [None]:

# Overlay the straight-line and logistic curves on the x1-versus-y scatter,
# all drawn on the current axes.
ax = plt.gca()

ax.scatter(
  mortgageDf["x1"],
  mortgageDf["y"],
  color='blue',
  alpha=0.9,
  label='Data Points - scatter',
)
ax.plot(x, lREq, color='red', alpha=0.9, label='lREq')
ax.plot(x, logREq, color='green', alpha=0.9, label='logREq')

ax.set_xlabel('x1')
ax.set_ylabel('y')
ax.legend()
ax.grid(True)

plt.show()

In [None]:
# Classify with a 0.5 cut-off: 1 when 'predict2' is strictly above 0.5, else 0.
mortgageDf['yHat2'] = (mortgageDf['predict2'] > 0.5).astype('int64')
mortgageDf

# Hold-out

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the rows for testing. The split is seeded (same seed as the
# KFold used in the cross-validation section) so the hold-out accuracy is
# reproducible after Restart & Run All.
trainSet, testSet = train_test_split(mortgageDf, test_size=0.15, random_state=55)
# Work on independent copies: later cells add columns to testSet, which must
# neither touch mortgageDf nor depend on view-versus-copy behaviour.
trainSet, testSet = trainSet.copy(), testSet.copy()

trainSet.head()

In [None]:
# Shapes of the full data, the training split and the test split.
mortgageDf.shape, trainSet.shape, testSet.shape

In [None]:
# Logit model fitted on the training split only.
modelHoldOut = sm.Logit(
  endog=trainSet["y"],
  exog=mortgageRegModel1Transformer(trainSet),
)
modelHoldOutFit = modelHoldOut.fit()
print(modelHoldOutFit.summary())

In [None]:
# Predictions of the hold-out model for the unseen test rows, stored as
# column 'predictHoldOut'.
predictHoldOut = modelHoldOutFit.predict(mortgageRegModel1Transformer(testSet))
testSet = testSet.assign(predictHoldOut=predictHoldOut)
testSet

In [None]:
# Label each test row with a 0.5 cut-off, then flag whether the label equals y.
testSet = testSet.assign(
    yHatHoldOut=lambda frame: (frame['predictHoldOut'] > 0.5).astype('int64'),
    isHoldOutCorrect=lambda frame: (frame['y'] == frame['yHatHoldOut']).astype('int64'),
)
testSet

In [None]:
# Share of correctly classified hold-out rows, in percent.
accuracy = testSet['isHoldOutCorrect'].sum() / len(testSet) * 100
accuracy

# K-Fold Cross-Validation

In [None]:
from sklearn.model_selection import KFold

In [None]:
# Initialize KFold
# Five shuffled folds; the fixed random_state makes the fold assignment repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=55)


In [None]:
# K-fold cross-validation of the Logit model: fit on each training fold, score
# classification accuracy (0.5 cut-off) on the held-out fold, and keep the
# fitted model of the best-scoring fold.
check = kf.split(mortgageDf)

# Results collected across folds.
accuracies = []
bestModel = None
bestAccuracy = 0

for experiment, (train_index, val_index) in enumerate(check, start=1):
    # Fold-specific names so the hold-out trainSet / accuracy from the previous
    # section are not overwritten. The validation fold is copied because
    # columns are added to it below.
    foldTrainSet = mortgageDf.iloc[train_index]
    foldValSet = mortgageDf.iloc[val_index].copy()

    # Fit the model on the training fold.
    trainModelFit = sm.Logit(
      foldTrainSet["y"],
      mortgageRegModel1Transformer(foldTrainSet)
    ).fit()

    # Predict on the validation fold and score the 0/1 labels against y.
    foldValSet['val_predictions'] = trainModelFit.predict(mortgageRegModel1Transformer(foldValSet))
    foldValSet['yHatCross'] = (foldValSet['val_predictions'] > 0.5).astype('int64')
    foldValSet['isCrossCorrect'] = (foldValSet['y'] == foldValSet['yHatCross']).astype('int64')
    foldAccuracy = foldValSet['isCrossCorrect'].sum() / len(foldValSet) * 100
    accuracies.append(foldAccuracy)

    # `bestModel is None` guarantees a model is kept even when every fold
    # scores 0%, which the plain `>` comparison against 0 would miss.
    if bestModel is None or foldAccuracy > bestAccuracy:
        bestAccuracy = foldAccuracy
        bestModel = trainModelFit

    # Print summary for each fold (optional)
    print(f'expr={experiment}')
    print(trainModelFit.summary())

In [None]:
# Validation accuracy of each fold, in percent.
accuracies

In [None]:
print(f"Average accuracies across all folds: {sum(accuracies) /len(accuracies)}")

In [None]:
# Attach the shared transformer and a predictor bound to the best
# cross-validation fold's fitted model, then register it.
bestModel.transformer = mortgageRegModel1Transformer
bestModel.mainPredictor = partial(generalRegressionPredictor, bestModel)
registerAJrjModel(
    bestModel,
    {
        "modelName": "to_predictLogRegModelBestCrossValidation",
        "version": "1.0.1",
        "params": bestModel.params.to_dict(),
        # Validation accuracy of the best fold, in percent (0-100). This is
        # not a pseudo R-squared; it is cast to a plain float like the scores
        # of the other registrations.
        "score": float(bestAccuracy),
        "modelLibrary": "statsmodels.api.Logit",
        "libraryMetadata": {
            "pvalues": bestModel.pvalues.to_dict(),
            "pseudo_r_squared": float(bestModel.prsquared),
            "llf": float(bestModel.llf),
            "aic": float(bestModel.aic),
            "bic": float(bestModel.bic)
        },
        "sampleData": {
            "dataForTransfer": mortgageSampleData
        }
    }
)