In [None]:
#libraries
import pandas as pd
from tabulate import tabulate
import numpy as np
import matplotlib.pyplot as plt
import math
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.metrics import mean_squared_error


Import Data

In [None]:



def read_csv_or_fail(file_path):
    """Load a CSV file into a DataFrame, reporting the known read failures.

    Parameters
    ----------
    file_path : path of the CSV file, handed straight to ``pd.read_csv``.

    Returns
    -------
    The parsed DataFrame, or ``None`` when the file is missing, empty, or
    cannot be parsed. In those three cases a message naming the problem is
    printed; any other exception propagates to the caller.
    """
    handled = (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError)

    try:
        return pd.read_csv(file_path)
    except handled as error:
        # Pick the message that matches the concrete failure.
        if isinstance(error, FileNotFoundError):
            message = f"Cannot read the data from '{file_path}' - File not found."
        elif isinstance(error, pd.errors.EmptyDataError):
            message = f"Cannot read the data from '{file_path}' - Empty file."
        else:
            message = f"Cannot read the data from '{file_path}' - Error parsing the CSV file."
        print(message)
        return None

# Load the raw file; read_csv_or_fail returns None (after printing why) on failure.
data = read_csv_or_fail("mv_test.csv")

if data is not None:
    # Only touch the frame once we know the load succeeded: drop the
    # identifier column and add the row-wise average of the two scores.
    data = data.drop(columns="ID").assign(
        mean=lambda frame: frame[["score1", "score2"]].mean(axis=1)
    )
    print(data.head())
else:
    print("Data load failed.")


Question 1

In [None]:
# Small helper that prints the divergence statistic of a score column.
def calculate_divergence(df: pd.DataFrame, column: str = "mean"):
    """Print the divergence between the good and bad populations.

    The statistic is ``2 * (mu_good - mu_bad)**2 / (var_good + var_bad)``,
    where "good" rows have ``bad == 0`` and "bad" rows have ``bad == 1``.
    Means and (sample) variances are all taken from the same column.

    Parameters
    ----------
    df : DataFrame with a ``bad`` column and the column named by ``column``.
    column : score column to evaluate; defaults to ``"mean"``.

    Returns
    -------
    None. The value is printed.
    """
    good_scores = df.loc[df['bad'] == 0, column]
    bad_scores = df.loc[df['bad'] == 1, column]

    mean_gap = good_scores.mean() - bad_scores.mean()
    # Series.var() uses the same ddof=1 as Series.std(), i.e. std()**2.
    pooled_variance = good_scores.var() + bad_scores.var()
    print(2 * mean_gap**2 / pooled_variance)

# Print the divergence statistic for the loaded data set.
calculate_divergence(data)


Question 2

In [None]:

#Setting bin sizes with starting and ending bins. Can be modified for different results
# The band edges are generated with range(starting_bin, ending_bin, bin_increment),
# so ending_bin itself is not used as an edge (the last edge here is 800).
starting_bin = 500
ending_bin = 820
bin_increment = 20

In [None]:
def create_table(df, starting_bin, ending_bin, bin_increment, score=str, total_number=str, bad_number=str):
    """Count accounts and bads per score band for one score column.

    Band edges are ``0``, every value of
    ``range(starting_bin, ending_bin, bin_increment)``, and ``+inf``; each
    band is closed on the left and open on the right.

    Parameters
    ----------
    df : DataFrame holding the ``score`` column and a ``bad`` column.
    starting_bin, ending_bin, bin_increment : integers passed to ``range``
        to generate the interior band edges.
    score : name of the score column to band.
    total_number : label of the output column with the row count per band.
    bad_number : label of the output column with the number of rows per
        band whose ``bad`` value is truthy.

    Note: the ``str`` defaults in the signature are placeholders only; all
    three names must be supplied by the caller.

    Returns
    -------
    DataFrame indexed by score band with columns ``total_number`` and
    ``bad_number``.
    """
    # The score values temporarily live under the `total_number` label so
    # that counting that column yields the band total.
    scored = pd.DataFrame({total_number: df[score],
                           bad_number: df["bad"]})

    band_edges = [0, *range(starting_bin, ending_bin, bin_increment), np.inf]
    bands = pd.cut(scored[total_number], band_edges, right=False)

    # observed=False is spelled out so the grouping keeps pandas' historical
    # behaviour for categorical keys instead of depending on the library's
    # changing default (which also emits a FutureWarning when left implicit).
    result = scored.groupby(bands, observed=False).apply(lambda x: pd.Series({
        total_number: x[total_number].count(),
        bad_number: x[bad_number].astype(bool).sum(),
    }))

    return result

In [None]:
def concat_tables(df, number_of_scores):
    """Build the banded count table for score1..scoreN and join them side by side.

    Parameters
    ----------
    df : DataFrame containing ``score1`` ... ``score<number_of_scores>`` and
        the columns ``create_table`` needs.
    number_of_scores : how many ``score<i>`` columns to tabulate, starting at 1.

    The band layout comes from the module-level ``starting_bin``,
    ``ending_bin`` and ``bin_increment`` settings.

    Returns
    -------
    DataFrame with one "Total #" and one "Bads #" column per score, whose
    index is named "Score Bands".
    """
    score_columns = [f"score{position}" for position in range(1, number_of_scores + 1)]

    band_tables = [
        create_table(df, starting_bin, ending_bin, bin_increment,
                     name, f'Total # ({name})', f'Bads # ({name})')
        for name in score_columns
    ]

    combined = pd.concat(band_tables, axis=1)
    combined.index.name = "Score Bands"
    return combined
# Banded totals and bads for score1 and score2, side by side.
df_concat = concat_tables(data, 2)

Question 3

In [None]:
# One banded table per score, using the short "S1"/"S2" column labels that
# the log-odds step refers to.
table_1, table_2 = (
    create_table(data, starting_bin, ending_bin, bin_increment,
                 f"score{number}", f'Total # (S{number})', f'Bads # (S{number})')
    for number in (1, 2)
)

# Place the two tables next to each other, aligned on the score band.
df_concat = pd.concat([table_1, table_2], axis=1)
df_concat

In [None]:


# Cast the banded counts to float and add the log odds of one score.
def calculate_log_odds(df, total_number, bads_number, log_odds_number=str):
    """Return a float copy of ``df`` with a base-10 good:bad log-odds column.

    For every row the odds are ``(total - bads) / bads`` and the new column
    holds ``log10`` of that ratio.

    Parameters
    ----------
    df : banded table containing the ``total_number`` and ``bads_number``
        columns. It is not modified.
    total_number : name of the column with the number of accounts per band.
    bads_number : name of the column with the number of bads per band.
    log_odds_number : name of the column to create. The ``str`` default in
        the signature is a placeholder; pass a real column name.

    Returns
    -------
    A new DataFrame (all columns cast to float) with the extra column.

    Notes
    -----
    Bands with zero bads or zero goods yield ``inf`` / ``-inf`` / ``nan``
    (numpy emits a RuntimeWarning) rather than raising.
    """
    # Work on the frame that was passed in, not on a notebook global.
    result = df.astype(float)

    goods = result[total_number] - result[bads_number]
    # Parenthesised explicitly: odds are goods divided by bads.
    result[log_odds_number] = np.log10(goods / result[bads_number])
    return result


# Log odds of each score, each computed on the banded table.
question3_df_float_1 = calculate_log_odds(df_concat,"Total # (S1)", "Bads # (S1)", "log_odds_s1")
question3_df_float_2 = calculate_log_odds(df_concat,"Total # (S2)", "Bads # (S2)", "log_odds_s2")

# Bring the second log-odds column over by index alignment. This keeps the
# score-band index intact, which later cells read via `.index.codes`.
question3_df_float = question3_df_float_1.assign(
    log_odds_s2=question3_df_float_2["log_odds_s2"]
)

# Move log_odds_s1 so it sits right after the score-1 count columns.
# NOTE(review): the intended position was not defined in the notebook;
# "directly after Bads # (S1)" is assumed here - confirm.
column = question3_df_float.pop("log_odds_s1")
desired_index = question3_df_float.columns.get_loc("Bads # (S1)") + 1
question3_df_float.insert(desired_index, "log_odds_s1", column)
question3_df_float

In [None]:

def get_graph(df, log_odds, color=str):
    """Plot one log-odds column against score band with its least-squares line.

    Parameters
    ----------
    df : DataFrame whose index is categorical (its ``.codes`` give the x
        positions and its ``.categories`` the tick labels).
    log_odds : name of the column in ``df`` to plot; also used as the
        legend label.
    color : matplotlib colour used for both the points and the fitted line.
        The ``str`` placeholder default in the signature is treated as
        ``"blue"``.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    if color is str:
        # The signature's default is a type object, not a usable colour.
        color = "blue"

    # Read from the frame that was passed in rather than a notebook global.
    x = df.index.codes.astype(float)
    y1 = df[log_odds]

    # Degree-1 polynomial fit: straight line through the points.
    line1 = np.poly1d(np.polyfit(x, y1, 1))

    # Build one tick label per band from the index itself, so the labels
    # always match the number of bands being plotted.
    # Assumes the first band is the open-ended low band - TODO confirm for
    # indexes not produced by create_table.
    tick_labels = []
    for position, band in enumerate(df.index.categories):
        if not isinstance(band, pd.Interval):
            tick_labels.append(str(band))
        elif np.isinf(band.right):
            tick_labels.append(f">= {band.left:g}")
        elif position == 0:
            tick_labels.append(f"< {band.right:g}")
        else:
            tick_labels.append(f"[{band.left:g}-{band.right:g})")

    fig, ax = plt.subplots(figsize=(17, 8))
    ax.scatter(x, y1, label=log_odds, color=color)
    ax.plot(x, line1(x), color=color)

    ax.legend()
    ax.set_xlabel('Score Band')
    ax.set_ylabel('Log Odds')
    ax.set_ylim(0, 7)
    ax.set_yticks(range(0, 7))
    ax.set_xticks(range(len(tick_labels)))
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    ax.grid(True)

    # Show the plot
    plt.show()

In [None]:

# Band positions (categorical codes of the index) and the log odds of each
# score. The column names match those created by calculate_log_odds above.
x = question3_df_float.index.codes.astype(float)
y1 = question3_df_float['log_odds_s1']
y2 = question3_df_float['log_odds_s2']


# Calculate the coefficients for the lines of best fit for each set of data
coeffs1 = np.polyfit(x, y1, 1)
coeffs2 = np.polyfit(x, y2, 1)

# Create functions for each line of best fit based on the calculated coefficients
line1 = np.poly1d(coeffs1)
line2 = np.poly1d(coeffs2)

# Slope (b1) and intercept (b0) of each fitted line; np.poly1d stores the
# highest-degree coefficient first.
b1_line1, b0_line1 = line1.coef
b1_line2, b0_line2 = line2.coef

fig, ax = plt.subplots(figsize=(15, 8))

# Plot the scatter points for both sets of data
ax.scatter(x, y1, label='Score 1', color="blue")
ax.scatter(x, y2, label='Score 2', color="green")

# Plot each line of best fit
ax.plot(x, line1(x), color="blue")
ax.plot(x, line2(x), color="green")

# Tick labels for the score bands, derived from the bin settings so they stay
# in step with the table when those settings are changed.
band_starts = list(range(starting_bin, ending_bin, bin_increment))
string_list = (
    [f"< {starting_bin}"]
    + [f"{low}-{low + bin_increment}" for low in band_starts[:-1]]
    + [f">{band_starts[-1]}"]
)
number_list = list(range(len(string_list)))

# Add a legend and labels for the axes
ax.legend()
ax.set_xlabel('Score Band')
ax.set_ylabel('Log Odds')
ax.set_ylim(0, 7)
ax.set_yticks(range(0, 7))
ax.set_xticks(number_list)
ax.set_xticklabels(string_list)
ax.grid(True)

# Show the plot
plt.show()

In [None]:
# Print the simple linear-regression summary for one score
def get_stats(independent, dependent, score):
    """Fit ``dependent`` on ``independent`` and print the regression summary.

    Parameters
    ----------
    independent : x values handed to ``scipy.stats.linregress``.
    dependent : y values handed to ``scipy.stats.linregress``.
    score : heading printed (underlined with ANSI codes) above the figures.

    Prints the slope, intercept, R-squared, p-value and standard error of
    the slope, followed by a blank separator. Returns None.
    """
    fit = stats.linregress(independent, dependent)

    summary = (
        ("Slope:", fit.slope),
        ("Intercept:", fit.intercept),
        ("R-squared:", fit.rvalue**2),
        ("P-value:", fit.pvalue),
        ("std err:", fit.stderr),
    )

    print("\033[4m" + score + "\033[0m")
    for caption, value in summary:
        print(caption, value)
    print("\n")

# Regression summary of each score's log odds against the band position x.
get_stats(x, y1, "Score 1")
get_stats(x, y2, "Score 2")

In [None]:
# Organize the fitted coefficients for display and derive the PDO (points to
# double the odds) of each score.
# The log odds in this notebook are base-10 logarithms, so doubling the odds
# raises them by log10(2); the PDO is therefore log10(2) / beta1.
# NOTE(review): the fit uses the band position as x, so this PDO is expressed
# in bands, not score points - confirm whether it should be scaled by the
# band width.
pdo_numerator = math.log10(2)
table_data = [['Variable', 'Score 1', 'Score 2'],
        ['Intercept β0', round(b0_line1, 2), round(b0_line2, 2)],
        ['Coefficient β1', round(b1_line1, 2), round(b1_line2, 2)],
        ['PDO', round(pdo_numerator/b1_line1, 2), round(pdo_numerator/b1_line2, 2)]]

# set the colors for the table
colors = [['green'] * 3]  # header row is green
for i in range(1, len(table_data)):
    colors.append(['w'] * 3)  # set all other rows to white

# create the table plot
fig, ax = plt.subplots()
ax.axis('off')  # remove axes

# plot the table using table function
table = ax.table(cellText=table_data, cellColours=colors, loc='center')

# set table properties
table.set_fontsize(14)
table.scale(1, 2)  # set table size

# display the plot
plt.show()

Question 4

In [None]:


# Short names for the regression inputs: band position and the log odds of
# each score (columns as named when the log odds were computed).
x = question3_df_float.index.codes.astype(float)
y1 = question3_df_float['log_odds_s1']
y2 = question3_df_float['log_odds_s2']

# Hand-computed least-squares coefficients, printed per score
def calculate_betas(x, y, score):
    """Print the ordinary-least-squares intercept and slope of ``y`` on ``x``.

    The slope is the sum of cross-deviations divided by the sum of squared
    x-deviations; the intercept follows from the two means.

    Parameters
    ----------
    x : sequence of independent values.
    y : sequence of dependent values, read position by position alongside ``x``.
    score : heading printed (underlined with ANSI codes) before the values.

    Prints β0 and β1 rounded to two decimals, then a blank separator.
    Returns None.
    """
    x_bar = sum(x) / len(x)
    y_bar = sum(y) / len(y)

    x_offsets = [value - x_bar for value in x]
    y_offsets = [value - y_bar for value in y]

    # Walk the x deviations and pick the matching y deviation by position.
    cross_sum = sum(dx * y_offsets[k] for k, dx in enumerate(x_offsets))
    square_sum = sum(dx**2 for dx in x_offsets)

    beta1 = cross_sum / square_sum
    beta0 = y_bar - beta1 * x_bar

    heading = "\033[4m" + score + "\033[0m"
    print(heading, ": β0 =", round(beta0, 2), "and β1 =", round(beta1, 2))
    print("\n")

# Print the hand-computed coefficients for each score's log odds.
calculate_betas(x, y1, "Score 1")
calculate_betas(x, y2, "Score 2")