# Linear Regression on Wine Quality Dataset

This notebook demonstrates a simple linear regression model on the red-wine subset of the Wine Quality dataset, loaded from the UCI Machine Learning Repository.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Fetch the red-wine quality CSV and read it into a DataFrame.
# The file is semicolon-delimited, hence the explicit separator.
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'wine-quality/winequality-red.csv'
)
df = pd.read_csv(url, sep=';')
# Show the first rows as a quick sanity check of the parsed columns.
df.head()

In [None]:
# Separate predictors from the target.
# 'quality' is the value the model should predict, so it becomes y and every
# remaining column becomes a feature in X.
y = df['quality']
X = df.drop(columns='quality')

# Hold out 20% of the rows for evaluation; the rest is used for fitting.
# A fixed random_state makes the shuffle (and therefore every downstream
# metric) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Distribution of wine quality scores
# Quality is an integer score, so the histogram uses discrete=True: one
# unit-width bar centred on each score. A fixed bin count over an integer range
# produces fractional bin edges, which leaves spurious empty bins and shifts
# bars away from the scores they represent.
plt.figure(figsize=(10, 6))
sns.histplot(df['quality'], discrete=True, kde=True)
plt.title('Distribution of Wine Quality Scores')
plt.xlabel('Quality Score')
plt.ylabel('Frequency')
plt.show()

# Correlation heatmap
# df.corr() computes the pairwise Pearson correlation between all columns.
# The coefficient measures the strength and direction of the *linear*
# relationship between two variables and ranges from -1 to 1:
#    1: perfect positive correlation (as one increases, the other increases)
#    0: no linear relationship
#   -1: perfect negative correlation (as one increases, the other decreases)
# The diagonal is always 1 since each feature perfectly correlates with itself.
#
# This helps identify:
# 1. Which features might be most important for predicting wine quality
# 2. Potential multicollinearity (highly correlated features that could affect model performance)
# 3. Redundant features that could potentially be removed

plt.figure(figsize=(10, 6))
# vmin/vmax pin the colour scale to the coefficient's full [-1, 1] range so the
# neutral midpoint of the diverging 'coolwarm' palette falls exactly on 0
# (otherwise the scale is fitted to the data's min/max and the midpoint drifts).
# fmt='.2f' renders every annotation with two decimals.
sns.heatmap(df.corr(), annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Feature Correlation Heatmap')
plt.show()


In [None]:
# Fit an ordinary least-squares linear regression on the training split.
# Fitting learns one coefficient per feature plus an intercept by minimising
# the sum of squared residuals between predicted and actual quality.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out wines with the fitted coefficients; these predictions are
# what the evaluation and plotting cells below work with.
y_pred = model.predict(X_test)

In [None]:
# Evaluate the model
# y_pred already holds the test-set predictions computed right after training,
# so it is reused here instead of calling model.predict(X_test) a second time.
# MSE: mean of the squared differences between actual and predicted quality.
# R^2: coefficient of determination for the same pair of vectors.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

In [None]:
# Side-by-side table of actual and predicted quality for the test wines.
# The frame inherits y_test's index; 'Difference' is actual minus predicted,
# derived from the two columns once they are in place.
comparison_df = pd.DataFrame(
    {'Actual Quality': y_test, 'Predicted Quality': y_pred}
).assign(
    Difference=lambda frame: frame['Actual Quality'] - frame['Predicted Quality']
)

# Show the first 10 rows, rounded to two decimals for readability.
print("Sample Comparison of Actual vs Predicted Wine Quality:")
print(comparison_df.head(10).round(2))


In [None]:
# Overlay predicted and actual quality for every test sample, using the
# sample's position in the test set as the x coordinate. Different markers
# distinguish the two series.
fig, ax = plt.subplots(figsize=(12, 6))
ax.scatter(range(len(y_pred)), y_pred, alpha=0.6, label='Predicted Quality', marker='o')
ax.scatter(range(len(y_test)), y_test, alpha=0.6, label='Actual Quality', marker='^')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Wine Quality')
ax.set_title('Actual vs Predicted Wine Quality per Sample')
ax.legend()
plt.show()


In [None]:
# Visualize model performance with several diagnostic plots:
# 1. Actual vs predicted values with the perfect-prediction line
# 2. Residuals vs predicted values to check for homoscedasticity
# 3. Distribution of residuals to check for normality
# 4. Feature importance based on model coefficients
# 5. Absolute prediction error per test sample
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.5)
# Dashed diagonal y = x: points on this line are predicted exactly.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
plt.xlabel('Actual Wine Quality')
plt.ylabel('Predicted Wine Quality')
plt.title('Actual vs Predicted Wine Quality')

# Add text box with metrics.
# Coordinates are in axes fractions (transAxes). verticalalignment='top' hangs
# the two-line box downward from y=0.95; with the default baseline alignment
# the box would grow upward and spill over the top edge of the axes.
plt.text(0.05, 0.95, f'MSE: {mse:.2f}\nR²: {r2:.2f}',
         transform=plt.gca().transAxes,
         verticalalignment='top',
         bbox=dict(facecolor='white', alpha=0.8))

plt.show()

# Residuals vs. predicted values.
# A residual is actual minus predicted quality; the dashed line at zero marks
# a perfect prediction. `residuals` is reused by the plots that follow.
residuals = y_test - y_pred
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_pred, residuals, alpha=0.5)
ax.axhline(y=0, color='r', linestyle='--')
ax.set(
    xlabel='Predicted Wine Quality',
    ylabel='Residuals',
    title='Residual Plot',
)
plt.show()
# Histogram of the residuals with a reference line at zero, to see how the
# prediction errors are spread around a perfect prediction.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(residuals, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax.axvline(x=0, color='r', linestyle='--', label='Zero Residual')
ax.set(
    xlabel='Residual Value',
    ylabel='Frequency',
    title='Distribution of Residuals',
)
ax.legend()
plt.show()
# Plot feature importance
# The model is fitted on unscaled features, so each raw coefficient is expressed
# "per one unit of that feature" and its magnitude depends on the feature's
# measurement scale. Comparing raw |coef| values across features is therefore
# misleading. Multiplying each coefficient by the feature's standard deviation
# on the training data gives the change in predicted quality per
# one-standard-deviation change of that feature, which is comparable.
standardized_coef = np.abs(model.coef_) * X_train.std().to_numpy()
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': standardized_coef
})
# Ascending sort so the most influential feature ends up at the top of the
# horizontal bar chart.
feature_importance = feature_importance.sort_values('Importance', ascending=True)

plt.figure(figsize=(10, 6))
plt.barh(feature_importance['Feature'], feature_importance['Importance'])
plt.xlabel('Absolute Standardized Coefficient (|coef| × feature std)')
plt.ylabel('Feature')
plt.title('Feature Importance Based on Standardized Model Coefficients')
plt.show()
# Absolute prediction error for each test sample, in test-set order, with the
# mean absolute error drawn as a dashed reference line.
abs_errors = residuals.abs()
mean_abs_error = abs_errors.mean()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(range(len(y_test)), abs_errors, alpha=0.7)
ax.axhline(y=mean_abs_error, color='r', linestyle='--',
           label=f'Mean Error: {mean_abs_error:.2f}')
ax.set(
    xlabel='Sample Index',
    ylabel='Absolute Prediction Error',
    title='Prediction Error Over Samples',
)
ax.legend()
plt.show()

