# Intro to Regression: White Wine Quality

## Import and understand the data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn

In [None]:
# Load the semicolon-delimited CSV into a DataFrame.
# NOTE(review): presumably the UCI white-wine quality data - confirm the
# provenance of white.csv.
df = pd.read_csv("white.csv", sep=';')
# Last expression of the cell, so the notebook renders the first five rows.
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Print column names, dtypes and non-null counts to spot missing values.
df.info()

## Data Visualization

In [None]:
# Pairwise correlation between all columns (pandas default method).
corr = df.corr()

# Draw the matrix as a heat map on an explicitly sized figure.
f = plt.figure(figsize=(12, 10))
plt.matshow(corr, fignum=f.number)

# Label both axes with the column names; x labels are tilted to avoid overlap.
tick_positions = range(len(corr.columns))
plt.xticks(tick_positions, corr.columns, fontsize=12, rotation=45)
plt.yticks(tick_positions, corr.columns, fontsize=12)

# Colour scale legend with slightly smaller tick labels.
cb = plt.colorbar()
cb.ax.tick_params(labelsize=10)

plt.title('Correlation Matrix', fontsize=16)

In [None]:
# One histogram per column, laid out on a 6x2 grid.

# The font size must be set BEFORE the figure is created: rcParams are read
# when the text objects are built, so updating them after plotting (as this
# cell used to do) leaves the current figure at the previous font size.
plt.rcParams.update({'font.size': 60})

# Columns in plotting order; the grid is filled row by row.
hist_columns = [
    'fixed acidity', 'volatile acidity',
    'citric acid', 'residual sugar',
    'chlorides', 'free sulfur dioxide',
    'total sulfur dioxide', 'density',
    'pH', 'sulphates',
    'alcohol', 'quality',
]

figure, axis = plt.subplots(6, 2, figsize=(90, 90))
for ax, column in zip(axis.flat, hist_columns):
    sns.histplot(x=column, data=df, ax=ax)

# Run the layout pass after plotting so it accounts for the axis labels and
# tick labels that the histograms just added.
figure.tight_layout(pad=10, w_pad=10, h_pad=10)

In [None]:
# Scatter plot of every feature against the target "quality", stacked
# vertically (one row per feature).

# The font size must be set BEFORE the figure is created: rcParams are read
# when the text objects are built, so updating them after plotting (as this
# cell used to do) leaves the current figure at the previous font size.
plt.rcParams.update({'font.size': 100})

# Features in plotting order, top to bottom.
scatter_features = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

figure, axis = plt.subplots(11, 1, figsize=(90, 300))
for ax, feature in zip(axis, scatter_features):
    sns.scatterplot(x=feature, y="quality", data=df, ax=ax, s=200)

# Run the layout pass after plotting so it accounts for the axis labels and
# tick labels that the scatter plots just added.
figure.tight_layout(pad=4, w_pad=4, h_pad=4)

## Preparing the data

In [None]:
# Target vector: the "quality" column.
y = df["quality"]
# Feature matrix: every remaining column.
X = df.drop(columns='quality')

# Preview the first rows of both to confirm the split.
print(y.head(), X.head(), sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. A fixed seed makes the split, and
# therefore every error reported below, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

## Testing the models

In [None]:
from math import sqrt

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Baseline: linear regression with default settings, fitted on the training
# split (fit() returns the estimator itself, so the call can be chained).
lr = LinearRegression().fit(X_train, y_train)
lrPred = lr.predict(X_test)

# Report the root-mean-squared error on the held-out rows.
lr_mse = mean_squared_error(y_test, lrPred)
print(sqrt(lr_mse))

In [None]:
# Imported here as well so the cell runs on its own, without relying on the
# linear-regression cell having been executed first.
from math import sqrt

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest with default hyperparameters. The forest is randomized, so a
# fixed seed is needed for the reported score to be reproducible.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rfPred = rf.predict(X_test)

# Report the root-mean-squared error on the held-out rows.
print(sqrt(mean_squared_error(y_test, rfPred)))