In [6]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.datasets import make_friedman1
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
from joblib import dump
wine_train = pd.read_csv("https://cs307.org/lab-04/data/wine-train.csv")
X_train = wine_train.drop("quality", axis=1)
y_train = wine_train["quality"]

In [6]:
# wine_train.describe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
count,4992.0,5197.0,4929.0,4923.0,5197.0,5167.0,5197.0,5197.0,4909.0,4920.0,4953.0,5197.0
mean,7.198978,0.33722,0.320097,5.489793,0.055824,30.584672,115.713392,0.994691,3.216712,0.529799,10.488608,5.814508
std,1.285476,0.16218,0.144776,4.80639,0.03483,17.432838,56.141996,0.003009,0.159396,0.148568,1.192619,0.876648
min,3.8,0.08,0.0,0.6,0.009,1.0,6.0,0.98711,2.74,0.22,8.0,3.0
25%,6.4,0.23,0.25,1.8,0.038,17.0,78.0,0.99234,3.11,0.43,9.5,5.0
50%,7.0,0.29,0.31,3.1,0.047,29.0,118.0,0.9949,3.21,0.51,10.3,6.0
75%,7.6,0.4,0.39,8.2,0.064,41.0,156.0,0.99699,3.32,0.6,11.3,6.0
max,15.6,1.58,1.23,65.8,0.611,146.5,344.0,1.03898,4.01,2.0,14.2,9.0


In [3]:
# Handling categorical and numerical columns separately
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Creating transformers for numerical and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combining transformers into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Creating a KNN regressor pipeline
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', KNeighborsRegressor())
])

param_grid = {
    'regressor__n_neighbors': [5, 7, 9, 11, 13, 15, 17,19, 21, 23, 25, 27, 29, 31],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__metric': ['euclidean', 'manhattan','minkowski', 'chebyshev'],  # Exploring more metrics
    'regressor__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)

# Splitting data for cross-validation
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_score = np.sqrt(-grid_search.best_score_)
y_pred = grid_search.predict(X_test)
test_mae = mean_absolute_error(y_test,y_pred)

# Saving Results
model_filename = 'wine-quality.joblib'
dump(grid_search, model_filename)
best_params, best_score, test_mae

Fitting 5 folds for each of 448 candidates, totalling 2240 fits


({'regressor__algorithm': 'auto',
  'regressor__metric': 'manhattan',
  'regressor__n_neighbors': 15,
  'regressor__weights': 'distance'},
 0.6556372005367318,
 0.49010254787277263)

In [10]:
dt_regressor_adjusted = DecisionTreeRegressor(max_depth=10, min_samples_leaf=5, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit the adjusted model to the training data
dt_regressor_adjusted.fit(X_train, y_train)

# Predict on the testing set with the adjusted model
y_pred_adjusted = dt_regressor_adjusted.predict(X_test)

# Calculate the Mean Absolute Error (MAE) with the adjusted model
mae_adjusted = mean_absolute_error(y_test, y_pred_adjusted)

model_filename = 'wine-quality.joblib'
dump(grid_search, model_filename)

mae_adjusted

0.23347988259513022