In [6]:
from google.colab import files
import io
# Prompt for a file upload through google.colab's `files` helper. The result is
# later indexed by file name (uploaded['WineQT.csv']) and wrapped in io.BytesIO,
# so each entry is expected to hold that file's raw bytes.
uploaded = files.upload()

Saving WineQT.csv to WineQT.csv


In [21]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingRegressor


# Parse the uploaded WineQT.csv payload into a DataFrame via an in-memory buffer
wine_csv_bytes = uploaded['WineQT.csv']
data = pd.read_csv(io.BytesIO(wine_csv_bytes))

In [8]:
# Report the number of null entries found in each column
print(data.isna().sum())


fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
Id                      0
dtype: int64


In [10]:
# Mean-impute the numeric predictor columns and rebuild `data` as
# [imputed predictors..., untouched non-numeric columns..., target].
# The target is never imputed, and the identifier column is dropped.
TARGET_COLUMN = "quality"
ID_COLUMN = "Id"  # presumably a row identifier in WineQT.csv; not used as a predictor

# Define an Imputer object with mean strategy
imputer = SimpleImputer(strategy="mean")

# Select numeric predictor columns BY NAME. Slicing with [:-1] is wrong for this
# file: its last column is "Id", so the slice would keep "quality" as a feature.
numerical_features = [
    column
    for column in data.select_dtypes(include="number").columns
    if column not in (TARGET_COLUMN, ID_COLUMN)
]

# Create a ColumnTransformer for imputation
transformer = ColumnTransformer(transformers=[("imputer", imputer, numerical_features)])

# Apply imputation; the result is a bare array holding only the selected columns
imputed_values = transformer.fit_transform(data)

# Convert back to a DataFrame. The labels are passed as a plain list:
# `Index + ["quality"]` concatenates strings element-wise instead of appending.
imputed_features = pd.DataFrame(imputed_values, columns=numerical_features, index=data.index)

# Non-numeric columns are carried through unchanged so a later encoding step can
# still see them; the target is re-attached last with its original values.
passthrough_columns = [
    column
    for column in data.columns
    if column not in numerical_features and column not in (TARGET_COLUMN, ID_COLUMN)
]
data = pd.concat(
    [imputed_features, data[passthrough_columns], data[[TARGET_COLUMN]]],
    axis=1,
)


In [11]:
# One-hot encode every text (object-dtype) predictor column; does nothing when
# there are none. Convention used here: the LAST column of `data` is the target,
# which is never encoded and is kept as the last column of the result.
target_column = data.columns[-1]

# Check for categorical features (object data type), excluding the target
categorical_features = data.select_dtypes(include=["object"]).drop(
    columns=[target_column], errors="ignore"
)

# One-Hot Encoding for categorical features (if present)
if not categorical_features.empty:
    categorical_columns = list(categorical_features.columns)

    # Use the encoder's default output and densify it afterwards. This avoids the
    # `sparse=` keyword, which newer scikit-learn releases no longer accept.
    encoder = OneHotEncoder()
    encoded_values = encoder.fit_transform(categorical_features)
    if hasattr(encoded_values, "toarray"):
        encoded_values = encoded_values.toarray()

    # One label per (column, category) pair, using each column's OWN category
    # list; categories_[0] only describes the first encoded column.
    new_column_names = [
        f"{col}_{category}"
        for col, categories in zip(categorical_columns, encoder.categories_)
        for category in categories
    ]
    encoded_frame = pd.DataFrame(encoded_values, columns=new_column_names, index=data.index)

    # Reassemble: untouched predictors, encoded predictors, then the target last.
    # Encoding only the categorical slice keeps the other columns from being dropped.
    untouched_predictors = data.drop(columns=categorical_columns + [target_column])
    data = pd.concat([untouched_predictors, encoded_frame, data[[target_column]]], axis=1)


In [12]:
# Standardise every predictor column and rebuild `data` as
# [scaled predictors..., target], leaving the target values unscaled.
# NOTE(review): the scaler is fit on all rows, before the train/test split that
# follows, so held-out rows influence the scaling statistics. Consider fitting
# it on the training split only (e.g. inside a Pipeline).

# Define a StandardScaler object
scaler = StandardScaler()

# Select all features except quality (target), which is the last column
features = data.columns[:-1]
target_column = data.columns[-1]
feature_columns = list(features)

# Create a ColumnTransformer for scaling
transformer = ColumnTransformer(transformers=[("scaler", scaler, feature_columns)])

# Apply scaling; the result is a bare array holding only the feature columns
scaled_values = transformer.fit_transform(data)

# Convert back to a DataFrame and re-attach the target. The transformer drops
# every column it was not given, and `features + ["quality"]` would concatenate
# strings element-wise, so the labels are passed as a list and the target is
# appended explicitly.
scaled_features = pd.DataFrame(scaled_values, columns=feature_columns, index=data.index)
data = pd.concat([scaled_features, data[[target_column]]], axis=1)


In [16]:
# Separate the target from the predictor columns
y = data["quality"]
X = data.drop(columns="quality")

# Hold out 20% of the rows for evaluation (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Support vector regressor: fit, predict on the hold-out rows, score with MSE
svr_model = SVR().fit(X_train, y_train)
svr_predictions = svr_model.predict(X_test)
svr_mse = mean_squared_error(y_test, svr_predictions)

# Random forest regressor (100 trees, fixed seed): fit, predict, score
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)

# k-nearest-neighbours regressor with k = 5: fit, predict, score
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
knn_predictions = knn_model.predict(X_test)
knn_mse = mean_squared_error(y_test, knn_predictions)

# Report the hold-out errors (KNN first, then SVR, then random forest)
for label, mse in (
    ("KNN Mean Squared Error:", knn_mse),
    ("SVR Mean Squared Error:", svr_mse),
    ("Random Forest Regression Mean Squared Error:", rf_mse),
):
    print(label, mse)

KNN Mean Squared Error: 0.5503930131004366
SVR Mean Squared Error: 0.5154133634409084
Random Forest Regression Mean Squared Error: 0.2979729257641922


In [20]:
# Tune SVR over C and kernel with 5-fold cross-validation, then score the best
# configuration on the hold-out set.

# Define hyperparameter grid for SVR (3 values of C x 2 kernels = 6 candidates)
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
}

# Create a GridSearchCV object with SVR model and MSE scoring.
# n_jobs=-1 spreads the 6 x 5 = 30 fits over all available cores; run serially,
# this cell was slow enough to be interrupted. It does not change the results.
grid_search = GridSearchCV(
    svr_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit the grid search to training data
grid_search.fit(X_train, y_train)

# Get the best model and best parameters
best_svr_model = grid_search.best_estimator_
best_svr_params = grid_search.best_params_

# Make predictions with the best SVR model
best_svr_predictions = best_svr_model.predict(X_test)

# Evaluate performance using MSE
best_svr_mse = mean_squared_error(y_test, best_svr_predictions)

print("Best SVR MSE:", best_svr_mse)
print("Best SVR Parameters:", best_svr_params)

KeyboardInterrupt: 

In [None]:
# Tune the random forest with a randomized search (5-fold CV), then score the
# best configuration on the hold-out set.

# Define hyperparameter distribution for Random Forest Regression.
# Candidates are drawn from these ranges: 15 tree counts x 7 depths = 105 combinations.
param_dist = {
    'n_estimators': range(50, 200, 10),  # Number of trees: 50, 60, ..., 190
    'max_depth': range(3, 10),  # Maximum depth: 3 through 9
}

# Create a RandomizedSearchCV object with Random Forest model and MSE scoring.
# random_state pins which candidates get sampled, so the reported best parameters
# are reproducible across runs (the forest itself is already seeded).
# n_jobs=-1 runs the 100 x 5 fits on all available cores without changing results.
random_search = RandomizedSearchCV(
    rf_model,
    param_dist,
    scoring='neg_mean_squared_error',
    cv=5,
    n_iter=100,
    random_state=42,
    n_jobs=-1,
)

# Fit the randomized search to training data
random_search.fit(X_train, y_train)

# Get the best model and best parameters
best_rf_model = random_search.best_estimator_
best_rf_params = random_search.best_params_

# Make predictions with the best Random Forest model
best_rf_predictions = best_rf_model.predict(X_test)

# Evaluate performance using MSE
best_rf_mse = mean_squared_error(y_test, best_rf_predictions)

print("Best Random Forest MSE:", best_rf_mse)
print("Best Random Forest Parameters:", best_rf_params)


In [None]:
# Chain feature standardisation and a default SVR into one estimator
pipeline_steps = [('scaler', StandardScaler()), ('svr', SVR())]
pipeline = Pipeline(pipeline_steps)

# Train on the training split, then predict the hold-out rows
pipeline_predictions = pipeline.fit(X_train, y_train).predict(X_test)

# Score the hold-out predictions with mean squared error and report it
pipeline_mse = mean_squared_error(y_test, pipeline_predictions)
print("Pipeline MSE (SVR with scaling):", pipeline_mse)

In [None]:

# Refit the baseline random forest (100 trees, seed 42) on the training split
rf_model = RandomForestRegressor(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict the hold-out rows and score them with mean squared error
rf_predictions = rf_model.predict(X_test)
rf_mse = mean_squared_error(y_true=y_test, y_pred=rf_predictions)

print("Random Forest Regression MSE:", rf_mse)


In [None]:
# Stack a default SVR and the random forest as base models
base_estimators = [('svr', SVR()), ('rf', rf_model)]
stacking_regressor = StackingRegressor(estimators=base_estimators)

# Train the ensemble on the training split
stacking_regressor.fit(X_train, y_train)

# Predict the hold-out rows, score them with mean squared error, and report
stacking_predictions = stacking_regressor.predict(X_test)
stacking_mse = mean_squared_error(y_test, stacking_predictions)
print("Stacking MSE:", stacking_mse)