In [99]:
#pip install imbalanced-learn


In [110]:
# Importing necessary stuff for Linear Regression

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.metrics import accuracy_score
from numpy import array 

In [101]:
red = pd.read_csv('winequality-red.csv', delimiter=";")
white = pd.read_csv('winequality-white.csv', delimiter=";")
wine = pd.concat([red, white], ignore_index=True) 

Testing Multiple Linear Regression (Baseline)

In [108]:
#### TESTING WITH ALL VARIABLES

# Resplit the Train/test
x_train2, x_test2, y_train2, y_test2 = train_test_split(wine.drop("quality", axis=1), wine["quality"], test_size = 0.25, random_state = 1)

lin_reg = LinearRegression()

# Fitting the model on training data
lin_reg.fit(x_train2, y_train2)

# Making predictions
y_pred2 = lin_reg.predict(x_test2)

# # Making predictions (training set)
# y_pred2_train = lin_reg.predict(x_train2)


# Calculate the R-squared value (without cross validation)
r_squared = r2_score(y_test2, y_pred2)
print('R-squared:', r_squared)
result_1 = r_squared

# # Cross-validation on the training data
# cv_scores = cross_val_score(lin_reg, x_train2, y_train2, cv=5)

# # Calculate the mean R-squared value across all folds
# cv_mean_r_squared = cv_scores.mean()
# print('CV Mean R-squared:', cv_mean_r_squared)

# Calculate the RMSE across all folds
RMSE = (np.sqrt(metrics.mean_squared_error(y_test2, y_pred2)))
print("RMSE:", RMSE )
result_2 = RMSE

R-squared: 0.28289382549149344
RMSE: 0.725458107754197


In [111]:
# print('Accuracy on training set: %.2f' % accuracy_score(y_train2, y_pred2_train))
# print('Accuracy on test set: %.2f' % accuracy_score(y_test2, y_pred2))

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

Testing Multiple Linear Regression (With Cross Validation)

In [103]:
#### TESTING WITH ALL VARIABLES

# Resplit the Train/test
x_train2, x_test2, y_train2, y_test2 = train_test_split(wine.drop("quality", axis=1), wine["quality"], test_size = 0.25, random_state = 1)

lin_reg = LinearRegression()

# Fitting the model on training data
lin_reg.fit(x_train2, y_train2)

# Making predictions
y_pred2 = lin_reg.predict(x_test2)

# Cross-validation on the training data
cv_scores = cross_val_score(lin_reg, x_train2, y_train2, cv=5, scoring='r2')

# Calculate the mean R-squared value across all folds
cv_mean_r_squared = cv_scores.mean()
print('CV Mean R-squared:', cv_mean_r_squared)
result_3 = cv_mean_r_squared

# Cross-validation on the training data
cv_scores = cross_val_score(lin_reg, x_train2, y_train2, cv=5, scoring='neg_root_mean_squared_error')

# Calculate the mean R-squared value across all folds
cv_mean_r_squared = cv_scores.mean()
print('CV Mean RMSE:', abs(cv_mean_r_squared))
result_4 = abs(cv_mean_r_squared)

CV Mean R-squared: 0.2874356675185207
CV Mean RMSE: 0.7407030724735125


Testing Multiple Linear Regression (Limited Features using 'SelectKBest')(CV Implemented)

In [104]:
# Identifying the Best Features

X = wine[['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar','chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density','pH', 'sulphates', 'alcohol']]
Y = wine['quality']

select = SelectKBest(score_func=f_regression, k=5)

z = select.fit_transform(X, Y)

filter = select.get_support()
features = array(X.columns)

print("All features:")
print(features)
print("")

print("Selected best 5:")
print(features[filter])



All features:
['fixed acidity' 'volatile acidity' 'citric acid' 'residual sugar'
 'chlorides' 'free sulfur dioxide' 'total sulfur dioxide' 'density' 'pH'
 'sulphates' 'alcohol']

Selected best 5:
['volatile acidity' 'citric acid' 'chlorides' 'density' 'alcohol']


In [105]:
#### TESTING WITH LIMITED VARIABLES

X = wine[['volatile acidity', 'citric acid', 'chlorides', 'density', 'alcohol']]
Y = wine['quality']

# Resplit the Train/test
x_train2, x_test2, y_train2, y_test2 = train_test_split(X, Y, test_size = 0.25, random_state = 1)

lin_reg = LinearRegression()

# Fitting the model on training data
lin_reg.fit(x_train2, y_train2)

# Making predictions
y_pred2 = lin_reg.predict(x_test2)

# Cross-validation on the training data
cv_scores = cross_val_score(lin_reg, x_train2, y_train2, cv=5, scoring='r2')

# Calculate the mean R-squared value across all folds
cv_mean_r_squared = cv_scores.mean()
print('CV Mean R-squared:', cv_mean_r_squared)
result_5 = cv_mean_r_squared

# Cross-validation on the training data
cv_scores = cross_val_score(lin_reg, x_train2, y_train2, cv=5, scoring='neg_root_mean_squared_error')

# Calculate the mean R-squared value across all folds
cv_mean_r_squared = cv_scores.mean()
print('CV Mean RMSE:', abs(cv_mean_r_squared))
result_6 = abs(cv_mean_r_squared)

CV Mean R-squared: 0.26766395806959153
CV Mean RMSE: 0.7509750007152824


Testing Multiple Linear Regression (SMOTE Implemented)(Limited Features using 'SelectKBest')(CV Implemented)

In [106]:
#### TESTING WITH LIMITED VARIABLES

X = wine[['volatile acidity', 'citric acid', 'chlorides', 'density', 'alcohol']]
Y = wine['quality']

smote = SMOTE(k_neighbors=3, random_state=42)
x_smote, y_smote = smote.fit_resample(X, Y)

# Resplit the Train/test
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_smote, y_smote, test_size = 0.25, random_state = 1)

lin_reg = LinearRegression()

# Fitting the model on training data
lin_reg.fit(x_train2, y_train2)

# Making predictions
y_pred2 = lin_reg.predict(x_test2)

# Cross-validation on the training data
cv_scores = cross_val_score(lin_reg, x_train2, y_train2, cv=5, scoring='r2')

# Calculate the mean R-squared value across all folds
cv_mean_r_squared = cv_scores.mean()
print('CV Mean R-squared:', cv_mean_r_squared)
result_7 = cv_mean_r_squared

# Cross-validation on the training data
cv_scores = cross_val_score(lin_reg, x_train2, y_train2, cv=5, scoring='neg_root_mean_squared_error')

# Calculate the mean R-squared value across all folds
cv_mean_r_squared = cv_scores.mean()
print('CV Mean RMSE:', abs(cv_mean_r_squared))
result_8 = abs(cv_mean_r_squared)

CV Mean R-squared: 0.45591928547874233
CV Mean RMSE: 1.4803079134060204


Results

In [107]:
print('Test 1: Baseline')
print('R-squared:', result_1)
print('RMSE:', result_2)
print('')

print('Test 2: w/Cross Validation')
print('R-squared:', result_3)
print('RMSE:', result_4)
print('')

print('Test 3: w/Cross Validation & Feature Selection')
print('R-squared:', result_5)
print('RMSE:', result_6)
print('')

print('Test 4: w/Cross Validation & Feature Selection & SMOTE')
print('R-squared:', result_7)
print('RMSE:', result_8)
print('')

Test 1: Baseline
R-squared: 0.28289382549149344
RMSE: 0.725458107754197

Test 2: w/Cross Validation
R-squared: 0.2874356675185207
RMSE: 0.7407030724735125

Test 3: w/Cross Validation & Feature Selection
R-squared: 0.26766395806959153
RMSE: 0.7509750007152824

Test 4: w/Cross Validation & Feature Selection & SMOTE
R-squared: 0.45591928547874233
RMSE: 1.4803079134060204

