In [45]:
import plotly.express as px
import pandas as pd
import math

In [31]:
def split_data(df: pd.DataFrame, seed: int = 1):
    training_data = df.sample(frac=0.7, random_state=seed)
    testing_data = df.drop(training_data.index)
    return (training_data, testing_data)

In [39]:
def find_linear_regression_line(df: pd.DataFrame, x_column: str, y_column: str):
    numerator = 0
    denominator = 0
    x_mean = df[x_column].mean()
    y_mean = df[y_column].mean()
    for i in df.index:
        numerator += (df[x_column][i]-x_mean)*(df[y_column][i]-y_mean)
        denominator += (df[x_column][i]-x_mean)**2
    slope = numerator/denominator
    intercept = (0, y_mean-slope*x_mean)
    return (intercept, slope)

In [56]:
def apply_logistic_regression(df: pd.DataFrame, x_column: str, y_column: str, slope: float, intercept: float):
    predicted_values = []
    for i in df.index:
        value_odds = 1/(1+math.e**-(intercept[0]+slope*df[x_column][i]))
        predicted_values.append(round(value_odds))
    df["predicted_values"] = predicted_values
    return df

In [66]:
def create_confusion_matrix(df: pd.DataFrame, y_column: str):
    true_positives = []
    true_negatives = []
    false_positives = []
    false_negatives = []
    for i in df.index:
        if df[y_column][i] == df["predicted_values"][i] == 1:
            true_positives.append(i)
        elif df[y_column][i] == df["predicted_values"][i] == 0:
            true_negatives.append(i)
        elif df[y_column][i] != df["predicted_values"][i] == 1:
            false_positives.append(i)
        elif df[y_column][i] != df["predicted_values"][i] == 0:
            false_negatives.append(i)
    confusion_matrix = {"true_positives": true_positives, "true_negatives":true_negatives, "false_positives":false_positives, "false_negatives":false_negatives}
    return confusion_matrix

In [25]:
df = pd.read_csv('insurance.csv')
df.head()

In [33]:
df_train, df_test = split_data(df)
df_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
85,45,1,22.895,2,1,1,21098.55405
191,36,0,26.2,0,0,3,4883.866
107,26,1,30.875,2,0,1,3877.30425
105,20,1,28.025,1,1,1,17560.37975
123,44,1,31.35,1,1,0,39556.4945


In [40]:
intercept, slope = find_linear_regression_line(df, "charges", "smoker")
print(f"Intercept: {intercept}  Slope: {slope}")

Intercept: (0, -0.14201355360844872)  Slope: 2.6738069066713134e-05


In [64]:
n_df = apply_logistic_regression(df, "charges", "smoker", slope, intercept)
n_df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,predicted_values
0,19,0,27.9,0,1,3,16884.924,1
1,18,1,33.77,1,0,2,1725.5523,1
2,28,1,33.0,3,0,2,4449.462,1
3,33,1,22.705,0,0,1,21984.47061,1
4,32,1,28.88,0,0,1,3866.8552,1


In [67]:
m = create_confusion_matrix(n_df, "smoker")
m

{'true_positives': [0,
  11,
  14,
  19,
  23,
  29,
  30,
  34,
  38,
  39,
  49,
  52,
  53,
  55,
  57,
  58,
  64,
  69,
  70,
  82,
  84,
  85,
  86,
  92,
  94,
  98,
  99,
  103,
  105,
  109,
  117,
  123,
  126,
  128,
  142,
  144,
  146,
  153,
  156,
  157,
  158,
  160,
  161,
  175,
  185,
  203,
  207,
  223,
  224,
  235,
  238,
  240,
  244,
  250,
  251,
  252,
  254,
  256,
  259,
  261,
  262,
  263,
  265,
  266,
  271,
  280,
  281,
  288,
  292,
  296,
  297,
  298,
  301,
  312,
  314,
  322,
  327,
  328,
  330,
  331,
  338],
 'true_negatives': [],
 'false_positives': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  12,
  13,
  15,
  16,
  17,
  18,
  20,
  21,
  22,
  24,
  25,
  26,
  27,
  28,
  31,
  32,
  33,
  35,
  36,
  37,
  40,
  41,
  42,
  43,
  44,
  45,
  46,
  47,
  48,
  50,
  51,
  54,
  56,
  59,
  60,
  61,
  62,
  63,
  65,
  66,
  67,
  68,
  71,
  72,
  73,
  74,
  75,
  76,
  77,
  78,
  79,
  80,
  81,
  83,
  87,
  88,
  89,
  90,
 

In [29]:
fig = px.scatter(df, x="charges", y="smoker")
fig.show()

In [70]:
from numpy import random
random.uniform(0,1)

0.6911976791055456