In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split 

In [3]:
data = pd.read_csv('/home/vyshnav/winequality-red.csv')

In [4]:
data.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
correlations = data.corr()['quality'].sort_values(ascending=False)
print(correlations)

quality                 1.000000
alcohol                 0.476166
sulphates               0.251397
citric acid             0.226373
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64


In [6]:

data.drop("residual sugar",axis=1)
data.drop("free sulfur dioxide",axis=1)
data.drop("pH",axis=1)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,0.71,10.2,5


In [7]:
data.drop_duplicates(subset=None,inplace=True)

In [8]:
X = data.iloc[:, :-1].values
y = data.iloc[:, -1].values

In [9]:
X_norm = (X-X.mean())/X.std()
y_norm = (y-y.mean())/y.std()

In [10]:
num_training = int(0.8 * len(data))
X_train, y_train = X_norm[:num_training], y_norm[:num_training]
X_test, y_test = X_norm[num_training:], y_norm[num_training:]

In [11]:
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

In [12]:
theta = np.zeros(X_train.shape[1])

In [13]:
def hypothesis(X, theta):
    return np.dot(X, theta)

In [14]:
def cost_function(X, y, theta):
    m = y.shape[0]
    J = (1/(2*m)) * np.sum((hypothesis(X, theta) - y)**2)
    return J

In [15]:
def gradient_descent(X, y, theta, alpha, num_iters):
    m = y.shape[0]
    J_history = []
    
    for i in range(num_iters):
        theta = theta - (alpha/m) * np.dot(X.T, (hypothesis(X, theta) - y))
        J_history.append(cost_function(X, y, theta))
        
    return theta, J_history

In [16]:
alpha = 0.17
num_iters = 440000
theta, J_history = gradient_descent(X_train, y_train, theta, alpha, num_iters)

In [17]:
y_pred = hypothesis(X_test, theta)

In [18]:
mse = np.mean((y_pred - y_test)**2)
print('Mean squared error on the testing set: ', mse)

Mean squared error on the testing set:  0.7232129504801207


In [19]:
def r2_score(y_pred,y):
    rss=np.sum((y_pred-y)**2)
    tss=np.sum((y-y.mean())**2)
    r2=1-(rss/tss)
    return r2

In [20]:

r2_score(y_pred,y_test)

0.25831335551989665