In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, RepeatedKFold

from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, RidgeCV

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Getting data from scraped csv file

In [2]:
df = pd.read_csv("WineData.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Id,Name,Price,Rating,NumRatings,Type,StyleName,Year,Body,Acidity,Vintage,Nat,Region,Country,Size,Winery
0,0,1416855,Bota Box Redvolution 2011,5.02,3.5,1957,1,Californian Red Blend,2011,4,3,0,0,California,United States,500,Bota Box
1,1,88766858,Crane Lake Cabernet Sauvignon 2016,5.49,3.2,3101,1,Californian Cabernet Sauvignon,2016,4,3,0,0,California,United States,750,Crane Lake
2,2,141258254,Crane Lake Pinot Noir 2016,5.49,3.2,1466,1,Californian Pinot Noir,2016,3,3,0,0,California,United States,750,Crane Lake
3,3,157294384,Oro Bello Rosé Gold 2017,5.25,3.8,50,4,Californian Rosé,2017,3,2,0,0,California,United States,750,Oro Bello
4,4,1395988,Sutter Home White Zinfandel 2012,6.67,3.6,6112,4,NONE,2012,-1,-1,0,0,St. Helena,United States,750,Sutter Home


# Cleaning the data

In [3]:
df.columns

Index(['Unnamed: 0', 'Id', 'Name', 'Price', 'Rating', 'NumRatings', 'Type',
       'StyleName', 'Year', 'Body', 'Acidity', 'Vintage', 'Nat', 'Region',
       'Country', 'Size', 'Winery'],
      dtype='object')

In [4]:
#Drop all categorical/useless columns
numeric = df.drop(columns=['Unnamed: 0', 'Id', 'Name', 'StyleName', 'Region', 'Winery'])
#Drop all records with body or acidity = -1
numeric = numeric.loc[~((numeric["Body"] == -1) | (numeric["Acidity"] == -1) | (numeric["Year"] == 0))]
numeric.head()

Unnamed: 0,Price,Rating,NumRatings,Type,Year,Body,Acidity,Vintage,Nat,Country,Size
0,5.02,3.5,1957,1,2011,4,3,0,0,United States,500
1,5.49,3.2,3101,1,2016,4,3,0,0,United States,750
2,5.49,3.2,1466,1,2016,3,3,0,0,United States,750
3,5.25,3.8,50,4,2017,3,2,0,0,United States,750
6,7.08,3.7,7880,4,2018,3,2,0,0,United States,750


In [5]:
#Making dummy columns for each country
countries = numeric.Country.unique()

for country in countries:
    numeric[country] = np.where(numeric["Country"] == country, 1, 0)

In [6]:
#Remove country column and make final cleaned data
data = numeric.drop(columns=["Country"])
data.reset_index(inplace=True,drop=True)
data.head()

Unnamed: 0,Price,Rating,NumRatings,Type,Year,Body,Acidity,Vintage,Nat,Size,...,NONE,Greece,Uruguay,Mexico,New Zealand,Germany,Hungary,Austria,Brazil,United Kingdom
0,5.02,3.5,1957,1,2011,4,3,0,0,500,...,0,0,0,0,0,0,0,0,0,0
1,5.49,3.2,3101,1,2016,4,3,0,0,750,...,0,0,0,0,0,0,0,0,0,0
2,5.49,3.2,1466,1,2016,3,3,0,0,750,...,0,0,0,0,0,0,0,0,0,0
3,5.25,3.8,50,4,2017,3,2,0,0,750,...,0,0,0,0,0,0,0,0,0,0
4,7.08,3.7,7880,4,2018,3,2,0,0,750,...,0,0,0,0,0,0,0,0,0,0


# TODO: Remove Outliers

# Linear Regression on log price

In [7]:
#Create X and y
#Create log price column
data['log_price'] = np.log(data["Price"])

#X is every column but log price, y is only log price
X = data.drop(columns=["log_price","Price"])
y = data[["log_price"]]
X.head()

Unnamed: 0,Rating,NumRatings,Type,Year,Body,Acidity,Vintage,Nat,Size,United States,...,NONE,Greece,Uruguay,Mexico,New Zealand,Germany,Hungary,Austria,Brazil,United Kingdom
0,3.5,1957,1,2011,4,3,0,0,500,1,...,0,0,0,0,0,0,0,0,0,0
1,3.2,3101,1,2016,4,3,0,0,750,1,...,0,0,0,0,0,0,0,0,0,0
2,3.2,1466,1,2016,3,3,0,0,750,1,...,0,0,0,0,0,0,0,0,0,0
3,3.8,50,4,2017,3,2,0,0,750,1,...,0,0,0,0,0,0,0,0,0,0
4,3.7,7880,4,2018,3,2,0,0,750,1,...,0,0,0,0,0,0,0,0,0,0


# Linear Regression with scaled data

In [8]:
#Scale X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

reg = LinearRegression().fit(X_scaled, y)
reg.score(X_scaled, y)

0.6799687472224125

In [11]:
np.exp(reg.predict(X_scaled))

array([[  7.05338031],
       [  5.3534978 ],
       [  6.26881321],
       ...,
       [122.50125023],
       [147.41339706],
       [ 40.36335267]])

# PCA Linear Regression with scaling

In [24]:
#Scale X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

#Transform X
pca_scaled = PCA(n_components=len(list(X_scaled.columns)))
X_transformed = pca_scaled.fit_transform(X_scaled)
X_transformed = X_transformed[:, :15]

#Train on X scaled and transformed
reg = LinearRegression().fit(X_transformed, y)
reg.score(X_transformed, y)

0.5371966901283736

In [25]:
np.exp(reg.predict(X_transformed[:,:]))

array([[ 8.42996422],
       [ 7.70594391],
       [ 8.08955904],
       ...,
       [66.45593336],
       [66.94631131],
       [26.27288416]])

# Straight Gradient Boosting

In [14]:
reg = GradientBoostingRegressor().fit(X, y)
reg.score(X, y)

  return f(*args, **kwargs)


0.9194966787625258

In [15]:
np.exp(reg.predict(X))

array([  7.88316275,   9.05999147,   9.13010406, ..., 436.81829434,
       300.53741959,  22.69975446])

# GradientBoosting regression with scaling

In [17]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

reg = GradientBoostingRegressor().fit(X_scaled, y)
reg.score(X_scaled, y)

  return f(*args, **kwargs)


0.9194966787625258

In [18]:
np.exp(reg.predict(X_scaled))

array([  7.88316275,   9.05999147,   9.13010406, ..., 436.81829434,
       300.53741959,  22.69975446])

# PCA GradientBoosting regression with scaling

In [28]:
#Scale X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

#Transform X
pca_scaled = PCA(n_components=len(list(X_scaled.columns)))
X_transformed = pca_scaled.fit_transform(X_scaled)
X_transformed = X_transformed[:, :10]

#Train on X scaled and transformed
reg = GradientBoostingRegressor().fit(X_transformed, y)
reg.score(X_transformed, y)

  return f(*args, **kwargs)


0.9074566551141494

In [29]:
np.exp(reg.predict(X_transformed[:,:10]))

array([ 10.27943425,   9.9137894 ,  10.33711909, ..., 395.87033649,
       329.6152157 ,  21.04016726])

# PCA RandomForest regression with scaling

In [30]:
#Scale X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

#Transform X
pca_scaled = PCA(n_components=len(list(X_scaled.columns)))
X_transformed = pca_scaled.fit_transform(X_scaled)
X_transformed = X_transformed[:, :10]

#Train on X scaled and transformed
reg = RandomForestRegressor().fit(X_transformed, y)
reg.score(X_transformed, y)

  reg = RandomForestRegressor().fit(X_transformed, y)


0.9847216876868712

In [31]:
np.exp(reg.predict(X_transformed[:,:10]))

array([  6.48731026,   6.11766532,   7.45736293, ..., 419.55939984,
       316.8144252 ,  15.29349059])

# PCA Bagging regression with scaling

In [34]:
#Scale X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

#Transform X
pca_scaled = PCA(n_components=len(list(X_scaled.columns)))
X_transformed = pca_scaled.fit_transform(X_scaled)
X_transformed = X_transformed[:, :10]

#Train on X scaled and transformed
reg = BaggingRegressor().fit(X_transformed, y)
reg.score(X_transformed, y)

  return f(*args, **kwargs)


0.9798869787483762

In [35]:
np.exp(reg.predict(X_transformed[:,:10]))

array([  5.86397126,   6.57907979,   6.56226587, ..., 416.40017769,
       429.67444959,  16.68069912])

In [21]:
data

Unnamed: 0,Price,Rating,NumRatings,Type,Year,Body,Acidity,Vintage,Nat,Size,...,Greece,Uruguay,Mexico,New Zealand,Germany,Hungary,Austria,Brazil,United Kingdom,log_price
0,5.02,3.5,1957,1,2011,4,3,0,0,500,...,0,0,0,0,0,0,0,0,0,1.613430
1,5.49,3.2,3101,1,2016,4,3,0,0,750,...,0,0,0,0,0,0,0,0,0,1.702928
2,5.49,3.2,1466,1,2016,3,3,0,0,750,...,0,0,0,0,0,0,0,0,0,1.702928
3,5.25,3.8,50,4,2017,3,2,0,0,750,...,0,0,0,0,0,0,0,0,0,1.658228
4,7.08,3.7,7880,4,2018,3,2,0,0,750,...,0,0,0,0,0,0,0,0,0,1.957274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4924,29.00,3.9,744,3,2017,1,3,0,0,750,...,0,0,0,0,0,0,0,0,0,3.367296
4925,89.99,4.1,2208,3,2007,3,3,0,0,750,...,0,0,0,0,0,0,0,0,0,4.499699
4926,385.00,4.6,37809,3,2000,3,3,0,0,750,...,0,0,0,0,0,0,0,0,0,5.953243
4927,175.00,4.7,92,3,2008,3,3,0,0,750,...,0,0,0,0,0,0,0,0,0,5.164786


# Linear Regression Observations
Straight linear regression 

# KMeans Clustering

In [None]:
index = len(data)*(2/3)
X_train, X_test = data.loc[:index,:], data.loc[index:,:]

In [None]:
X_train, X_test

In [None]:
#Kmeans
kmeans = KMeans(n_clusters=10, random_state=0).fit(X_train)

In [None]:
cluster_map = pd.DataFrame()
cluster_map['data_index'] = X_train.index.values
cluster_map['cluster'] = kmeans.labels_

cluster_indexs = {}
for cluster in range(10):
    cluster_indexs[cluster] = cluster_map[cluster_map.cluster == cluster].data_index.values
cluster_indexs