# Linear Algebra Project – Sure Tomorrow Insurance
This notebook uses the **uploaded dataset** located at `/datasets/insurance_us.csv`.

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import sklearn.linear_model
import sklearn.metrics
import sklearn.neighbors
import sklearn.preprocessing
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, mean_squared_error, r2_score
from IPython.display import display

## Load Uploaded Data

In [5]:
# Mapping from the (whitespace-trimmed) CSV headers to the snake_case names
# used throughout the rest of the notebook.
COLUMN_RENAMES = {
    'Gender': 'gender',
    'Age': 'age',
    'Salary': 'income',
    'Family members': 'family_members',
    'Insurance benefits': 'insurance_benefits',
}

# Load, trim stray whitespace from the headers, then apply the rename map.
df = (
    pd.read_csv('/datasets/insurance_us.csv')
    .rename(columns=str.strip)
    .rename(columns=COLUMN_RENAMES)
)

# Confirm the result: header names, first rows, dtypes and null counts.
print("Column names after rename:", list(df.columns))
display(df.head())
df.info()

Column names after rename: ['gender', 'age', 'income', 'family_members', 'insurance_benefits']


Unnamed: 0,gender,age,income,family_members,insurance_benefits
0,1,41.0,49600.0,1,0
1,0,46.0,38000.0,1,1
2,0,29.0,21000.0,0,0
3,0,21.0,41700.0,2,0
4,1,28.0,26100.0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   gender              5000 non-null   int64  
 1   age                 5000 non-null   float64
 2   income              5000 non-null   float64
 3   family_members      5000 non-null   int64  
 4   insurance_benefits  5000 non-null   int64  
dtypes: float64(2), int64(3)
memory usage: 195.4 KB


In [6]:
# Summary statistics for every column; include='all' asks pandas to describe
# non-numeric columns as well, should any be present.
df.describe(include='all')

Unnamed: 0,gender,age,income,family_members,insurance_benefits
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,0.499,30.9528,39916.36,1.1942,0.148
std,0.500049,8.440807,9900.083569,1.091387,0.463183
min,0.0,18.0,5300.0,0.0,0.0
25%,0.0,24.0,33300.0,0.0,0.0
50%,0.0,30.0,40200.0,1.0,0.0
75%,1.0,37.0,46600.0,2.0,0.0
max,1.0,65.0,79000.0,6.0,5.0


## Task 1 — kNN Similarity Search

In [7]:
# Columns used as customer features; insurance_benefits is not in this list.
feature_names = ['gender', 'age', 'income', 'family_members']

# Encode gender numerically only when it was loaded as text labels.
gender_is_text = df['gender'].dtype == 'object'
if gender_is_text:
    df['gender'] = df['gender'].map({'female': 0, 'male': 1})

# MaxAbsScaler divides each feature by its largest absolute value, so every
# scaled feature ends up in [-1, 1].
scaler = sklearn.preprocessing.MaxAbsScaler()
scaler.fit(df[feature_names])

# Work on a copy so the unscaled frame stays available for comparison.
df_scaled = df.copy()
df_scaled[feature_names] = scaler.transform(df[feature_names])

display(df_scaled.head())

Unnamed: 0,gender,age,income,family_members,insurance_benefits
0,1.0,0.630769,0.627848,0.166667,0
1,0.0,0.707692,0.481013,0.166667,1
2,0.0,0.446154,0.265823,0.0,0
3,0.0,0.323077,0.527848,0.333333,0
4,1.0,0.430769,0.33038,0.0,0


In [8]:
def get_knn(df_local, n, k, metric):
    """Return the k nearest neighbours of one row, with their distances.

    Parameters
    ----------
    df_local : pandas.DataFrame
        Frame containing at least the columns listed in the notebook-level
        ``feature_names``; neighbours are searched among its rows.
    n : int
        Position (not label) of the query row in ``df_local``.
    k : int
        Number of neighbours to return. The query row is part of the fitted
        data, so it normally comes back as its own nearest neighbour.
    metric : str
        Distance metric name passed to ``NearestNeighbors``.

    Returns
    -------
    pandas.DataFrame
        The ``k`` neighbour rows of ``df_local`` (original index labels kept)
        with an extra ``distance`` column, ordered as returned by
        ``kneighbors``.
    """
    features = df_local[feature_names]
    nbrs = sklearn.neighbors.NearestNeighbors(
        n_neighbors=k,
        metric=metric
    ).fit(features)

    # Query with a one-row DataFrame (double brackets) so the query carries the
    # same column names the model was fitted with, instead of a list wrapping
    # a Series.
    distances, idx = nbrs.kneighbors(features.iloc[[n]], return_distance=True)

    # idx holds row *positions*. Attaching the distances positionally via
    # assign() avoids index alignment, which would produce NaN rows whenever
    # df_local's index is not the default 0..N-1 range.
    return df_local.iloc[idx[0]].assign(distance=distances[0])

In [9]:
# Query row position and neighbourhood size for the demonstration.
n = 0
k = 5

# Compare every combination of (unscaled / scaled data) x (distance metric).
datasets = {'unscaled': df, 'scaled': df_scaled}
for name, data in datasets.items():
    for metric in ('euclidean', 'manhattan'):
        print(f'=== {name} | {metric} ===')
        neighbours = get_knn(data, n, k, metric)
        display(neighbours)
        print()

=== unscaled | euclidean ===


Unnamed: 0,gender,age,income,family_members,insurance_benefits,distance
0,1,41.0,49600.0,1,0,0.0
2022,1,41.0,49600.0,0,0,1.0
1225,0,42.0,49600.0,0,1,1.732051
4031,1,44.0,49600.0,2,1,3.162278
3424,0,38.0,49600.0,0,0,3.316625



=== unscaled | manhattan ===


Unnamed: 0,gender,age,income,family_members,insurance_benefits,distance
0,1,41.0,49600.0,1,0,0.0
2022,1,41.0,49600.0,0,0,1.0
1225,0,42.0,49600.0,0,1,3.0
4031,1,44.0,49600.0,2,1,4.0
3424,0,38.0,49600.0,0,0,5.0



=== scaled | euclidean ===


Unnamed: 0,gender,age,income,family_members,insurance_benefits,distance
0,1.0,0.630769,0.627848,0.166667,0,0.0
2689,1.0,0.630769,0.634177,0.166667,0,0.006329
133,1.0,0.615385,0.636709,0.166667,0,0.017754
4869,1.0,0.646154,0.637975,0.166667,1,0.018418
3275,1.0,0.646154,0.651899,0.166667,1,0.02855



=== scaled | manhattan ===


Unnamed: 0,gender,age,income,family_members,insurance_benefits,distance
0,1.0,0.630769,0.627848,0.166667,0,0.0
2689,1.0,0.630769,0.634177,0.166667,0,0.006329
133,1.0,0.615385,0.636709,0.166667,0,0.024245
4869,1.0,0.646154,0.637975,0.166667,1,0.025511
3365,1.0,0.630769,0.596203,0.166667,0,0.031646





## Task 2 — Classification (Predict Any Benefits)

In [10]:
# Binary target: 1 when the customer has a positive insurance_benefits value.
received_any = df['insurance_benefits'].gt(0)
df['insurance_benefits_received'] = received_any.astype(int)

# Show how many customers fall into each class.
display(df['insurance_benefits_received'].value_counts())


0    4436
1     564
Name: insurance_benefits_received, dtype: int64

In [11]:
def eval_classifier(y_true, y_pred):
    """Print the F1 score and display the confusion matrix of a binary classifier.

    Parameters
    ----------
    y_true : array-like of 0/1
        Ground-truth labels.
    y_pred : array-like of 0/1
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    None
        Results are printed / displayed, not returned.
    """
    print('F1:', f1_score(y_true, y_pred))

    # Pin the label set so the matrix is always 2x2. Without it,
    # confusion_matrix infers the labels from the data and returns a 1x1
    # matrix when only one class occurs, which cannot take the fixed
    # row/column names below.
    matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    display(pd.DataFrame(matrix,
                         index=['true_0', 'true_1'],
                         columns=['pred_0', 'pred_1']))

def rnd_model_predict(P, size, seed=42):
    """Draw random 0/1 predictions where each entry is 1 with probability ``P``.

    A fresh generator is created from ``seed`` on every call, so repeated
    calls with the same arguments return the same array.
    """
    generator = np.random.default_rng(seed)
    uniform_draws = generator.random(size)
    # Draws lie in [0, 1): P=0 gives all zeros, P=1 gives all ones.
    is_positive = uniform_draws < P
    return is_positive.astype(int)

In [12]:
# Share of customers who received benefits; used as one of the dummy rates.
P_actual = df['insurance_benefits_received'].mean()

target_all = df['insurance_benefits_received']
n_rows = len(df)

# Baselines: never predict 1, predict 1 at the observed rate, coin flip,
# always predict 1.
for P in (0, P_actual, 0.5, 1):
    print('=== Dummy Model P=', P, '===')
    pred = rnd_model_predict(P, n_rows)
    eval_classifier(target_all, pred)
    print()

=== Dummy Model P= 0 ===
F1: 0.0


Unnamed: 0,pred_0,pred_1
true_0,4436,0
true_1,564,0



=== Dummy Model P= 0.1128 ===
F1: 0.11367673179396091


Unnamed: 0,pred_0,pred_1
true_0,3938,498
true_1,500,64



=== Dummy Model P= 0.5 ===
F1: 0.1704728208427147


Unnamed: 0,pred_0,pred_1
true_0,2156,2280
true_1,299,265



=== Dummy Model P= 1 ===
F1: 0.20273184759166066


Unnamed: 0,pred_0,pred_1
true_0,0,4436
true_1,0,564





In [13]:
# Feature matrix and binary target as plain NumPy arrays.
X = df[feature_names].to_numpy()
y = df['insurance_benefits_received'].to_numpy()

# 70/30 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    random_state=12345,
    stratify=y,
)

# Scaling factors are learned from the training rows only and then reused
# for the test rows.
scaler_c = sklearn.preprocessing.MaxAbsScaler()
X_train_s = scaler_c.fit_transform(X_train)
X_test_s = scaler_c.transform(X_test)

# Evaluate the classifier for several neighbourhood sizes.
for k in (3, 5, 7, 9):
    clf = KNeighborsClassifier(n_neighbors=k).fit(X_train_s, y_train)
    pred = clf.predict(X_test_s)
    print('k=', k)
    eval_classifier(y_test, pred)
    print()

k= 3
F1: 0.9520958083832336


Unnamed: 0,pred_0,pred_1
true_0,1325,6
true_1,10,159



k= 5
F1: 0.9113149847094801


Unnamed: 0,pred_0,pred_1
true_0,1322,9
true_1,20,149



k= 7
F1: 0.9090909090909091


Unnamed: 0,pred_0,pred_1
true_0,1326,5
true_1,24,145



k= 9
F1: 0.9062500000000001


Unnamed: 0,pred_0,pred_1
true_0,1325,6
true_1,24,145





## Task 3 — Linear Regression

In [14]:
class MyLinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    After ``fit``, ``weights`` holds the intercept first, followed by one
    coefficient per feature column.
    """

    def __init__(self):
        # None until fit() has been called.
        self.weights = None

    @staticmethod
    def _with_intercept(X):
        """Return ``X`` as a float array with a leading column of ones."""
        X = np.asarray(X, dtype=float)
        return np.hstack([np.ones((len(X), 1)), X])

    def fit(self, X, y):
        """Estimate the weights from features ``X`` (n_samples, n_features) and targets ``y``."""
        X2 = self._with_intercept(X)
        # Solve the least-squares problem directly instead of forming
        # inv(X2.T @ X2): the explicit inverse squares the condition number,
        # loses precision when features live on very different scales, and
        # breaks on collinear columns. lstsq returns the minimum-norm
        # solution in the rank-deficient case.
        self.weights = np.linalg.lstsq(X2, np.asarray(y, dtype=float), rcond=None)[0]

    def predict(self, X):
        """Return predictions for ``X`` using the fitted weights."""
        X2 = self._with_intercept(X)
        return X2 @ self.weights

In [15]:
def eval_regressor(y_true, y_pred):
    """Print RMSE and R^2 for a set of regression predictions.

    Nothing is returned; both metrics are written to standard output.
    """
    # RMSE is the square root of the mean squared error.
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    print('RMSE:', root_mse)

    determination = r2_score(y_true, y_pred)
    print('R^2:', determination)

In [16]:
# The regression target is the numeric insurance_benefits column, not the
# binary flag used for classification.
Xr = df[feature_names].to_numpy()
yr = df['insurance_benefits'].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(
    Xr, yr, test_size=0.3, random_state=12345)

# Hand-written least-squares model.
mylr = MyLinearRegression()
mylr.fit(X_train, y_train)
pred_my = mylr.predict(X_test)
print('Custom Regression:')
eval_regressor(y_test, pred_my)

# Reference implementation on the same split.
print('\nSklearn Regression:')
lr = sklearn.linear_model.LinearRegression().fit(X_train, y_train)
pred_sk = lr.predict(X_test)
eval_regressor(y_test, pred_sk)

Custom Regression:
RMSE: 0.3435565089137969
R^2: 0.4305278542485148

Sklearn Regression:
RMSE: 0.3435565089137964
R^2: 0.4305278542485165


## Task 4 — Data Obfuscation Using Matrix P

In [18]:
# Feature matrix to obfuscate: one row per customer, one column per entry of
# feature_names.
X_pn = df[feature_names].to_numpy()

# Random square matrix sized from the data rather than a hard-coded 4, so it
# stays valid if the feature list changes.
rng = np.random.default_rng(seed=42)
n_features = X_pn.shape[1]
P = rng.random((n_features, n_features))
print('det(P)=', np.linalg.det(P))

# The obfuscation can only be undone if P is invertible, so stop here rather
# than fail later in np.linalg.inv.
if np.linalg.matrix_rank(P) != n_features:
    raise ValueError('P is not invertible; draw a different random matrix')

det(P)= 0.24339135998015463


In [19]:
# Obfuscate by right-multiplying with P: every output column is a linear mix
# of all original features, so the column labels below are positional only.
X_obf = np.matmul(X_pn, P)
df_obf = pd.DataFrame(data=X_obf, columns=feature_names)
display(df_obf.head())

Unnamed: 0,gender,age,income,family_members
0,6359.715273,22380.404676,18424.090742,46000.69669
1,4873.294065,17160.36703,14125.780761,35253.455773
2,2693.117429,9486.397744,7808.83156,19484.860631
3,5345.603937,18803.227203,15479.148373,38663.061863
4,3347.176735,11782.829283,9699.998942,24211.273378


In [20]:
# Undo the obfuscation with the inverse of P; values come back up to
# floating-point round-off.
P_inv = np.linalg.inv(P)
X_rec = np.matmul(X_obf, P_inv)
df_rec = pd.DataFrame(X_rec, columns=feature_names)
display(df_rec.head())

Unnamed: 0,gender,age,income,family_members
0,1.0,41.0,49600.0,1.0
1,1.679528e-12,46.0,38000.0,1.0
2,-6.230214e-13,29.0,21000.0,-2.030327e-13
3,-1.089078e-12,21.0,41700.0,2.0
4,1.0,28.0,26100.0,-1.524242e-12


In [21]:
y_num = df['insurance_benefits'].to_numpy()
X_obf_all = X_pn @ P

# Both splits use the same settings, so the original and obfuscated runs are
# trained and tested on the same rows.
split_kwargs = dict(test_size=0.3, random_state=12345)
X_train_o, X_test_o, y_train_o, y_test_o = train_test_split(
    X_pn, y_num, **split_kwargs)
X_train_obf, X_test_obf, y_train_obf, y_test_obf = train_test_split(
    X_obf_all, y_num, **split_kwargs)

# Model trained on the original features.
lr_orig = MyLinearRegression()
lr_orig.fit(X_train_o, y_train_o)
pred_o = lr_orig.predict(X_test_o)
print('Original Features:')
eval_regressor(y_test_o, pred_o)

# Model trained on the obfuscated features.
lr_obf = MyLinearRegression()
lr_obf.fit(X_train_obf, y_train_obf)
pred_x = lr_obf.predict(X_test_obf)
print('\nObfuscated Features:')
eval_regressor(y_test_obf, pred_x)

Original Features:
RMSE: 0.3435565089137969
R^2: 0.4305278542485148

Obfuscated Features:
RMSE: 0.34355650873044885
R^2: 0.4305278548563426


## Conclusions
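- kNN similarity search is far more meaningful on scaled data: without scaling, income dominates the distance (all unscaled neighbours of customer 0 share the same income of 49,600), whereas after MaxAbs scaling every feature contributes.
- The kNN classifier on scaled features clearly beats the dummy baselines: F1 ≈ 0.95 at k=3 (≈ 0.91 for k=5–9) versus at most ≈ 0.20 for the best dummy model (P=1).
- The custom linear regression matches sklearn: both give RMSE ≈ 0.3436 and R² ≈ 0.4305 on the test set.
- Obfuscating the features with an invertible matrix P hides the raw values from anyone who does not know P, while model quality stays the same (RMSE and R² agree with the un-obfuscated model up to floating-point round-off).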