In [41]:
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error, classification_report

In [None]:
labelEncoder = LabelEncoder()
df = pl.read_csv("../data/data.csv")
adequated_diagnosis = pl.Series("diagnosis_int",labelEncoder.fit_transform(df.select('diagnosis')))
df.insert_column(2, adequated_diagnosis)
df.head()

In [None]:
df_cut = df["diagnosis_int","radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean","compactness_mean","concavity_mean","concave points_mean","symmetry_mean","fractal_dimension_mean"]
correlation = df_cut.corr()
fig, ax = plt.subplots(figsize=(12,12)) 
sns.heatmap(correlation, annot=True, ax=ax)

In [None]:
sns.pairplot(df.to_pandas(),vars=["radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean","compactness_mean","concavity_mean","concave points_mean","symmetry_mean","fractal_dimension_mean"],hue="diagnosis")

In [47]:
y = df["id", "diagnosis", "diagnosis_int"]
X = df["radius_mean","texture_mean","perimeter_mean","area_mean","smoothness_mean","compactness_mean","concavity_mean","concave points_mean","symmetry_mean","fractal_dimension_mean"]

X_train, X_test, y_train, y_test = train_test_split(X, y.get_column("diagnosis_int"), test_size=0.2, random_state=42)

In [48]:
rlog_model = make_pipeline(MinMaxScaler(), LogisticRegression())
rlog_model.fit(X_train, y_train)

y_pred_rlog = rlog_model.predict(X_test)
rlog_model.score(X_test, y_test)

0.956140350877193

In [49]:
# Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred_rlog)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.96


In [56]:
# Calculate Mean Absolute Error
mae = mean_absolute_error(y_test, y_pred_rlog)
print(f"Mean Absolute Error (MAE): {mae}")

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred_rlog)
print(f"Mean Squared Error (MSE): {mse}")

# Calculate Mean Absolute Percentage Error
mape = mean_absolute_percentage_error(y_test, y_pred_rlog)
print(f"Mean Absolute Percentage Error (MAPE): {mape}")

# Calculate R-squared (coefficient of determination)
r2 = r2_score(y_test, y_pred_rlog)
print(f"R-squared (R2): {r2}")

Mean Absolute Error (MAE): 0.043859649122807015
Mean Squared Error (MSE): 0.043859649122807015
Mean Absolute Percentage Error (MAPE): 0.043859649122807015
R-squared (R2): 0.8132983950212905


In [54]:
rf_model = make_pipeline(MinMaxScaler(), RandomForestRegressor(n_estimators=100, random_state=42, criterion='absolute_error'))
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_model.score(X_test, y_test)

0.8379019325253849

In [55]:
# Calculate evaluation metrics for the RandomForest model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
print(f"Random Forest Mean Absolute Error (MAE): {mae_rf}")

mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error (MSE): {mse_rf}")

mape_rf = mean_absolute_percentage_error(y_test, y_pred_rf)
print(f"Random Forest Mean Absolute Percentage Error (MAPE): {mape_rf}")

r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest R-squared (R2): {r2_rf}")


Random Forest Mean Absolute Error (MAE): 0.10324561403508771
Random Forest Mean Squared Error (MSE): 0.038079824561403505
Random Forest Mean Absolute Percentage Error (MAPE): 282462608207886.44
Random Forest R-squared (R2): 0.8379019325253849


In [53]:
# Classification Report
print(classification_report(y_test, y_pred_rlog))

              precision    recall  f1-score   support

           0       0.93      1.00      0.97        71
           1       1.00      0.88      0.94        43

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114

