In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report,r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Import Data

In [3]:
white_wine_data = 'Resources/winequality-white.csv'

white_wine_df = pd.read_csv(white_wine_data, sep = ';')

white_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


## Clean and Process Data

**Check for Null Values**

In [6]:
white_wine_df.count()

fixed acidity           4898
volatile acidity        4898
citric acid             4898
residual sugar          4898
chlorides               4898
free sulfur dioxide     4898
total sulfur dioxide    4898
density                 4898
pH                      4898
sulphates               4898
alcohol                 4898
quality                 4898
dtype: int64

In [7]:
white_wine_df.notna().count()

fixed acidity           4898
volatile acidity        4898
citric acid             4898
residual sugar          4898
chlorides               4898
free sulfur dioxide     4898
total sulfur dioxide    4898
density                 4898
pH                      4898
sulphates               4898
alcohol                 4898
quality                 4898
dtype: int64

In [8]:
white_wine_df.shape

(4898, 12)

**Choose Target and Feature Variables**

In [9]:
y = white_wine_df['quality'].values
X = white_wine_df.drop(columns = 'quality', axis = 1).values

In [10]:
X.shape

(4898, 11)

In [11]:
len(y)

4898

## Split Training and Testing Data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

## Standardize the Features

In [13]:
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
len(y_train)

3673

In [15]:
len(y_test)

1225

In [16]:
X_train_scaled.shape

(3673, 11)

In [17]:
X_test_scaled.shape

(1225, 11)

## Multiple Linear Regression

In [18]:
MLR = LinearRegression()

MLR.fit(X_train_scaled, y_train)

y_pred = MLR.predict(X_test_scaled)


In [19]:
len(y_pred)

1225

**Testing and Training Score**

In [20]:
MLR_training_score = MLR.score(X_train_scaled, y_train)

MLR_testing_score = MLR.score(X_test_scaled, y_test)

print(f'Multiple Linear Regression Training Score: {MLR_training_score}')
print(f'Multiple Linear Regression Testing Score: {MLR_testing_score}')

Multiple Linear Regression Training Score: 0.2794269587927002
Multiple Linear Regression Testing Score: 0.2876435954262121


**R2 Score**

In [21]:
print(f'Multiple Linear Regression R2 Score : {r2_score(y_test, y_pred)}')

Multiple Linear Regression R2 Score : 0.2876435954262121


**Multiple Linear Regression Analysis:**

The R2 Score which measures how predictable the model is has a value of almost 29%.  Based on this score, I conclude that the multiple linear regression model is not a a good predictor for the Red Wine Analysis.

## Decision Tree

In [22]:
DTC = DecisionTreeClassifier()

DTC.fit(X_train_scaled, y_train)

y_pred = DTC.predict(X_test_scaled)

**Testing and Training Score**

In [23]:
DTC_training_score = DTC.score(X_train_scaled, y_train)

DTC_training_score = DTC.score(X_test_scaled, y_test)

print(f'Decision Tree Training Score: {DTC_training_score}')
print(f'Decision Tree Regression Testing Score: {DTC_training_score}')

Decision Tree Training Score: 0.593469387755102
Decision Tree Regression Testing Score: 0.593469387755102


**Accuracy Score**

In [24]:
print(f'Decision Tree Accuracy Score: {accuracy_score(y_test, y_pred)}')

Decision Tree Accuracy Score: 0.593469387755102


**Classification Report**

In [25]:
print('Decision Tree Classification Report:')

print(classification_report(y_test, y_pred))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.27      0.19      0.22        37
           5       0.61      0.59      0.60       368
           6       0.62      0.67      0.65       544
           7       0.56      0.53      0.55       233
           8       0.47      0.39      0.43        41

    accuracy                           0.59      1225
   macro avg       0.42      0.39      0.41      1225
weighted avg       0.59      0.59      0.59      1225



**Decision Tree Classification Analysis:**

The accuracy score for this model is about 59%.  This is about 30% better than the multiple linear regression model. 

## Random Forest

In [26]:
RFC = RandomForestClassifier()

RFC.fit(X_train_scaled, y_train)

y_pred = RFC.predict(X_test_scaled)

**Training and Testing Score**

In [27]:
RFC_training_score = RFC.score(X_train_scaled, y_train)

RFC_training_score = RFC.score(X_test_scaled, y_test)

print(f'Random Forest Training Score: {RFC_training_score}')
print(f'Random Forest Regression Testing Score: {RFC_training_score}')

Random Forest Training Score: 0.6669387755102041
Random Forest Regression Testing Score: 0.6669387755102041


**Accuracy Score**

In [28]:
print(f'Random Forest Accuracy Score: {accuracy_score(y_test, y_pred)}')

Random Forest Accuracy Score: 0.6669387755102041


**Classification Report**

In [29]:
print('Random Forest Classification Report:')

print(classification_report(y_test, y_pred))

Random Forest Classification Report:
              precision    recall  f1-score   support

           3       0.00      0.00      0.00         2
           4       0.54      0.19      0.28        37
           5       0.69      0.64      0.66       368
           6       0.63      0.80      0.71       544
           7       0.75      0.54      0.63       233
           8       0.88      0.34      0.49        41

    accuracy                           0.67      1225
   macro avg       0.58      0.42      0.46      1225
weighted avg       0.68      0.67      0.66      1225



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Random Forest Classification Analysis:**

This model's accuracy score is 67%.  