### Introduction to Machine Learning in Finance and Insurance (Spring 2024)
# Project 1: Credit Analytics

### Team members: LastName1 FirstName1, LastName2 FirstName2, LastName3 FirstName3

In [245]:
### Import all the Python libraries you are going to use
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from typing import Callable

In [246]:
### Fix random seed for reproducibility
np.random.seed(10)

# Exercise 1. Dataset features generation.

In [247]:
# Data set params
age_lower_bound = 18
age_upper_bound = 80.000001

income_lower_bound = 1
income_upper_bound = 15.000001

p_self_emplyed = .1

# train set power
m = int(2e4)
# test set power
n = int(1e4)

feature_headers = ['x_1', 'x_2', 'x_3']

# Age ,income and employment data
data = pd.DataFrame((np.random.uniform(age_lower_bound,age_upper_bound, m+n),
                     np.random.uniform(income_lower_bound,income_upper_bound, m+n),
                     np.random.binomial(1, p_self_emplyed, size = m+n))).transpose()
data.columns=feature_headers

# Exercise 2. Dataset labels generation.

In [248]:
# Define logistic function
def logistic(z:float) -> float:
    return 1 / (1 + np.exp(-z))

# Define p1 parameters and function
p1_intecept = -13.3
p1_coeff = np.array([0.33,-3.5,3])

def p1(intecept: float, coeff: float, row:float) -> float:
    z = intecept + np.dot(coeff, row)
    return logistic(z)

# Define p2 parameters and function
p2_intecept = -5
p2_coeff = np.array([10,-1.1,1])
age_lower_bound = 25
age_upper_bound = 75

# define arbitrary indicator function
def indic(low_bound: float, up_bound: float) -> Callable[[float], int]:
    """ Indicator function generator with specified lower boundary
        and upper boundary

    Args:
        low_bound (float): lower boundary for indicator function
        up_bound (float): upper boundary for indicator function

    Returns:
        function: returning 1 below lower boundary, 1 above upper boundary
                  and 0 otherwise
    """
    assert low_bound <= up_bound, 'lower boundary has to be less or equal to upper boundary'
    def inner_function(x):
        return 1 if x < low_bound else -1 if x > up_bound else 0
    return inner_function

def p2(intecept: float, coeff: float, row:float)-> float:
    # define indicator function according to the requirement
    p2_indicator = indic(age_lower_bound, age_upper_bound)
    
    # transform 'x_1'
    row_trasformed = row.copy()
    row_trasformed.loc['x_1'] = p2_indicator(row_trasformed.loc['x_1'])

    # apply scalar product to the transformation
    z = intecept + np.dot(coeff, row_trasformed)
    
    return logistic(z)

In [249]:
df_prob = pd.DataFrame()
df_prob['p_1'] = data.apply(lambda row: p1(p1_intecept, p1_coeff, row), axis = 1)
df_prob['p_2'] = data.apply(lambda row: p2(p2_intecept, p2_coeff, row), axis = 1)

In [264]:
df_y = pd.DataFrame()
for i, col in enumerate(df_prob.columns):
    df_y['y_'+str(i+1)] = np.random.binomial(1,df_prob[col])

print(df_y.describe(),df_y.value_counts(['y_1']), df_y.value_counts(['y_2']), sep='\n')

                y_1           y_2
count  30000.000000  30000.000000
mean       0.049333      0.030067
std        0.216567      0.170774
min        0.000000      0.000000
25%        0.000000      0.000000
50%        0.000000      0.000000
75%        0.000000      0.000000
max        1.000000      1.000000
y_1
0      28520
1       1480
Name: count, dtype: int64
y_2
0      29098
1        902
Name: count, dtype: int64


# Exercise 3. Model implementations.

### 3a) Logistic regression (LR)

In [266]:
from sklearn.linear_model import LogisticRegression # logistic regression model
from sklearn.metrics import log_loss # cross-entropy

# Implement and train a logistic regression model
# You can use LogisticRegression() from sklearn.linear_model (see the notebook "Project 1 - Sandbox.ipynb")
# For more information, see: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# Attention! Set the argument penalty=None to implement a logistic regression without regularization

In [None]:
# Compute the cross-entropy loss on the training and test data.

### 3b) Neural network (NN)

In [270]:
import keras
# Implement and train a neural network model
# You can use Keras (see the notebook "Project 1 - Credit risk - Sandbox")
# For more information, see: https://keras.io/getting_started/

In [None]:
# Compute the cross-entropy loss on the training and test data.

### 3c) ROC curves and AUC scores

In [None]:
# Plot the ROC curves and compute the AUC scores
# You can use roc_auc_score and roc_curve from sklearn.metrics

# Exercise 4. Comparison of lending strategies

In [None]:
# Implement a function to compute the Value at Risk (VaR) at level alpha on the vector x

def var(x, alpha):
    # Write your code here and return your VaR(alpha) estimate
    return None

In [None]:
# Implement strategy (i), plot the P&L histogram and compute VaR(95%) of the losses

# Implement strategy (ii), plot the P&L histogram and compute VaR(95%) of the losses

# Implement strategy (iii), plot the P&L histogram and compute VaR(95%) of the losses