In [94]:

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split


# Ερώτημα 1

In [95]:
df = pd.read_csv('diabetes.csv')
input_df = df[['Pregnancies', 'BloodPressure', 'BMI', 'Age']]
targets_df = df['Glucose']

# Διόρθωση ακυρων τιμών
input_df = input_df.copy()
input_df['BloodPressure'] = input_df['BloodPressure'].replace(0, np.nan)
input_df['BMI'] = input_df['BMI'].replace(0, np.nan)
imputer = SimpleImputer(strategy='median')
input_df[['BloodPressure', 'BMI']] = imputer.fit_transform(input_df[['BloodPressure', 'BMI']])

# Κανονικοποιηση
input_df_normalized = (input_df - input_df.min()) / (input_df.max() - input_df.min())

x_normalized = input_df_normalized.to_numpy()
y = targets_df.to_numpy()

seed = 0
test_size = 0.3

x_train, x_test, y_train, y_test = train_test_split(x_normalized, y, test_size=test_size, random_state=seed)


# Ερώτημα 2

In [96]:
model = LinearRegression()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = model.score(x_test, y_test)

print("Mean Squared Error: ", mse)
print("Mean Absolute Error: ", mae)

Mean Squared Error:  918.5937680595416
Mean Absolute Error:  23.20608415647075


Η καταλληλοτερη μετρική για την συγκεκριμένη εφαρμογή θεωρώ πως είναι το μέσο τετραγωνικό σφάλμα (MSE), καθώς επηρεάζεται περισσότερο από μεγάλα σφάλματα, που σε μια ιατρική εφαρμογή όπως αυτή θα μπορούσαν να έχουν σοβαρές συνέπειες.

# Ερώτημα 3

In [97]:
lamda_mse_dict = {
    0.2: None,
    0.4: None,
    0.6: None,
    0.8: None,
    1.0: None
}
for l in lamda_mse_dict.keys():
    model3 = Lasso(alpha=l, random_state=seed)
    model3.fit(x_train, y_train)
    y_pred3 = model3.predict(x_test)
    mse3 = mean_squared_error(y_test, y_pred3)
    lamda_mse_dict[l] = mse3

results_df = pd.DataFrame.from_dict(lamda_mse_dict, orient='index', columns=['MSE'])
results_df.reset_index(inplace=True)
results_df.columns = ['λ', 'MSE']
results_df

Unnamed: 0,λ,MSE
0,0.2,913.821983
1,0.4,916.679843
2,0.6,918.462257
3,0.8,926.142164
4,1.0,939.719566


# Ερώτημα 4

In [98]:
coef_df = pd.DataFrame({
    'Feature': input_df.columns,
    'Coefficient': model.coef_
})
coef_df['Absolute Coefficient'] = coef_df['Coefficient'].abs()
coef_df_sorted = coef_df.sort_values(by='Absolute Coefficient', ascending=True)
least_important_feature = coef_df_sorted.iloc[0]['Feature']
print("Least important feature: ", least_important_feature)

input_df_without_pregnancies = input_df_normalized.drop(columns=['Pregnancies'])
x_without_pregnancies = input_df_without_pregnancies.to_numpy()

x_train4, x_test4, y_train4, y_test4 = train_test_split(x_without_pregnancies, y, test_size=test_size,
                                                        random_state=seed)
model4 = LinearRegression()
model4.fit(x_train4, y_train4)
y_pred4 = model4.predict(x_test4)
mse4 = mean_squared_error(y_test4, y_pred4)
print("MSE without 'Pregnancies' feature: ", mse4)
print("MSE with 'Pregnancies' feature: ", mse)

Least important feature:  Pregnancies
MSE without 'Pregnancies' feature:  919.6372349598458
MSE with 'Pregnancies' feature:  918.5937680595416
