In [77]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Import Data

In [58]:
# Relative path to the red wine quality CSV.
red_wine_data = 'Resources/winequality-red.csv'

# The file uses ';' as its field delimiter instead of the default ','.
red_wine_df = pd.read_csv(filepath_or_buffer=red_wine_data, delimiter=';')

# Preview the first five rows to confirm the columns parsed correctly.
red_wine_df.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## Clean and Process Data

**Check for Null Values**

In [59]:
# Non-null entries per column; a column with missing values would report
# fewer entries than the DataFrame has rows.
red_wine_df.count()

fixed acidity           1599
volatile acidity        1599
citric acid             1599
residual sugar          1599
chlorides               1599
free sulfur dioxide     1599
total sulfur dioxide    1599
density                 1599
pH                      1599
sulphates               1599
alcohol                 1599
quality                 1599
dtype: int64

In [60]:
# Count the non-null entries per column from the boolean mask.
# The mask must be summed: calling count() on it would report the total number
# of rows for every column even when values are missing, because both True and
# False are themselves non-null.
red_wine_df.notna().sum()

fixed acidity           1599
volatile acidity        1599
citric acid             1599
residual sugar          1599
chlorides               1599
free sulfur dioxide     1599
total sulfur dioxide    1599
density                 1599
pH                      1599
sulphates               1599
alcohol                 1599
quality                 1599
dtype: int64

In [61]:
# (rows, columns) of the loaded DataFrame, for comparison with the per-column counts.
red_wine_df.shape

(1599, 12)

**Choose Target and Feature Variables**

In [62]:
# Features: every column except the target, as a 2-D NumPy array.
X = red_wine_df.drop(columns=['quality']).values

# Target: the 'quality' column, as a 1-D NumPy array.
y = red_wine_df['quality'].values

In [63]:
# Sanity check: the feature matrix should have one column fewer than the DataFrame.
X.shape

(1599, 11)

In [64]:
# Sanity check: one target value per row of X.
len(y)

1599

## Split Training and Testing Data

In [65]:
# Hold out a test set using train_test_split's default proportions; the fixed
# random_state makes the shuffle reproducible from run to run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=1)

## Standardize the Features

In [66]:
# Fit the scaler on the training split only, so that no statistics from the
# test split leak into preprocessing.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself

# Apply the training-set scaling to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [67]:
# Number of training labels; should equal the row count of X_train_scaled.
len(y_train)

1199

In [68]:
# Number of test labels; should equal the row count of X_test_scaled.
len(y_test)

400

In [69]:
# Scaling must not change the dimensions of the training feature matrix.
X_train_scaled.shape

(1199, 11)

In [70]:
# Scaling must not change the dimensions of the test feature matrix.
X_test_scaled.shape

(400, 11)

## Multiple Linear Regression

In [71]:
# Fit an ordinary least-squares model on the scaled training features.
# fit() returns the estimator, so the fitted model is bound directly to MLR.
MLR = LinearRegression().fit(X_train_scaled, y_train)

# Predict quality for the held-out, identically scaled test features.
y_pred = MLR.predict(X_test_scaled)


In [72]:
# Sanity check: one prediction per test row.
len(y_pred)

400

**Testing and Training Score**

In [76]:
# Score the fitted model on each split; comparing the two values shows how
# well performance on the training data carries over to unseen data.
MLR_training_score, MLR_testing_score = (
    MLR.score(features, labels)
    for features, labels in ((X_train_scaled, y_train), (X_test_scaled, y_test))
)

print(
    f'Multiple Linear Regression Training Score: {MLR_training_score}',
    f'Multiple Linear Regression Testing Score: {MLR_testing_score}',
    sep='\n',
)

Multiple Linear Regression Training Score: 0.3636257097476555
Multiple Linear Regression Testing Score: 0.3436370198437032


**R2 Score**

In [82]:
# R2 of the test-set predictions, computed directly from y_test and y_pred.
mlr_r2_score = r2_score(y_test, y_pred)
print(f'Multiple Linear Regression R2 Score : {mlr_r2_score}')

Multiple Linear Regression R2 Score : 0.3436370198437032


Multiple Linear Regression Analysis: 

The R2 Score which measures how predictable the model is has a value of about 34%.  Based on this score, I conclude that the regression model is not a a g