In [12]:
# Import Dependencies
%matplotlib inline
from matplotlib import pyplot as plt
from path import Path
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score, mean_squared_error

In [2]:
# Read the cleaned wine reviews CSV into a DataFrame and preview the first rows
file_path = Path("clean_wine_data.csv")
df_wine = pd.read_csv(filepath_or_buffer=file_path)
df_wine.head(n=5)

Unnamed: 0,country,points,price,province,region_1,taster_name,title,variety,winery,year,type,points_category
0,Portugal,87,15.0,Douro,Douro,Roger Voss,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,2011,red,average
1,US,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,2013,white,average
2,US,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,2013,white,average
3,US,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,2012,red,average
4,Spain,87,15.0,Northern Spain,Navarra,Michael Schachner,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem,2011,red,average


## Preprocess the Data

In [5]:
# Count how many times each wine title occurs
df_wine["title"].value_counts()

Domaine Vacheron 2015  Sancerre                                                                    4
Vignerons des Pierres Dorées 2015 Salamandre d'Or  (Coteaux Bourguignons)                          4
Château Lestage Simon 2012  Haut-Médoc                                                             4
Domaine Dubost 2014 Cuvée Tracot  (Beaujolais-Villages)                                            3
Rivetto 2012 Cè Vanin  (Barbaresco)                                                                3
                                                                                                  ..
Alma del Sur 2009 Cabernet Sauvignon (Mendoza)                                                     1
Casa Agricola Alexandre Relvas 2014 Herdade de São Miguel Escolha dos Enólogos Red (Alentejano)    1
Alliance Loire 2014 Azuré Sur Lie  (Muscadet Sèvre et Maine)                                       1
Feudi di San Gregorio 2012 Pietracalda  (Fiano di Avellino)                                

In [7]:
# Count how many wines belong to each grape variety
df_wine["variety"].value_counts()

Pinot Noir                  12479
Chardonnay                  10660
Cabernet Sauvignon           9092
Red Blend                    8190
Bordeaux-style Red Blend     5276
                            ...  
Riesling-Chardonnay             1
Teroldego Rotaliano             1
Tinta Amarela                   1
Frankovka                       1
Mavrokalavryta                  1
Name: variety, Length: 670, dtype: int64

In [8]:
# Count how many wines come from each winery
df_wine["winery"].value_counts()

Testarossa               217
Williams Selyem          211
DFJ Vinhos               209
Wines & Winemakers       206
Chateau Ste. Michelle    191
                        ... 
Château de Brague          1
Château Grand Pontet       1
Charles Fournier           1
Sea Breeze                 1
J. de Telmont              1
Name: winery, Length: 15250, dtype: int64

In [9]:
# Remove the high-cardinality text columns (title, winery) before encoding
df_wine = df_wine.drop(columns=["title", "winery"])
df_wine.head(n=5)

Unnamed: 0,country,points,price,province,region_1,taster_name,variety,year,type,points_category
0,Portugal,87,15.0,Douro,Douro,Roger Voss,Portuguese Red,2011,red,average
1,US,87,14.0,Oregon,Willamette Valley,Paul Gregutt,Pinot Gris,2013,white,average
2,US,87,13.0,Michigan,Lake Michigan Shore,Alexander Peartree,Riesling,2013,white,average
3,US,87,65.0,Oregon,Willamette Valley,Paul Gregutt,Pinot Noir,2012,red,average
4,Spain,87,15.0,Northern Spain,Navarra,Michael Schachner,Tempranillo-Merlot,2011,red,average
...,...,...,...,...,...,...,...,...,...,...
114824,Germany,90,28.0,Mosel,Mosel,Anna Lee C. Iijima,Riesling,2013,white,good
114825,US,90,75.0,Oregon,Oregon,Paul Gregutt,Pinot Noir,2004,red,good
114826,France,90,30.0,Alsace,Alsace,Roger Voss,Gewürztraminer,2013,white,good
114827,France,90,32.0,Alsace,Alsace,Roger Voss,Pinot Gris,2012,white,good


In [13]:
# Generate our categorical variable lists
application_cat = df_wine.dtypes[df_wine.dtypes == "object"].index.tolist()

# Create a OneHotEncoder instance.
# The dense/sparse switch was renamed across scikit-learn releases
# (`sparse` -> `sparse_output`), so rely on the default sparse output and
# densify explicitly instead of passing either keyword.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(df_wine[application_cat]).toarray()

# `get_feature_names` was replaced by `get_feature_names_out`; prefer the new
# accessor and fall back to the old one on older scikit-learn installs.
get_encoded_names = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names

# Build the encoded dataframe with the encoded variable names.
# Reusing df_wine's index keeps the rows aligned when the frames are later
# combined on their indexes, even if df_wine does not have a 0..n-1 index.
encode_df = pd.DataFrame(
    encoded_values,
    columns=get_encoded_names(application_cat),
    index=df_wine.index,
)
encode_df.head()

Unnamed: 0,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,country_Canada,country_Chile,country_China,...,variety_Zweigelt,variety_Çalkarası,variety_Žilavka,type_red,type_white,points_category_average,points_category_below average,points_category_excellent,points_category_good,points_category_very good
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
# Merge one-hot encoded features (aligned on the row index) and drop the originals.
df_wine = df_wine.merge(encode_df, left_index=True, right_index=True)
# Name the axis via `columns=`: pandas 2.x no longer accepts the axis as a
# second positional argument to `drop`.
df_wine = df_wine.drop(columns=application_cat)
df_wine.head()

Unnamed: 0,points,price,year,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Bosnia and Herzegovina,country_Brazil,country_Bulgaria,...,variety_Zweigelt,variety_Çalkarası,variety_Žilavka,type_red,type_white,points_category_average,points_category_below average,points_category_excellent,points_category_good,points_category_very good
0,87,15.0,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
1,87,14.0,2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
2,87,13.0,2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
3,87,65.0,2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,87,15.0,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [15]:
# Create our features for the model.
# NOTE(review): the one-hot `points_category_*` columns appear to be bins of
# the target `points`; leaving them in X would leak the target into the
# features, so they are excluded here. Confirm how points_category was built.
leakage_cols = [
    col for col in df_wine.columns if str(col).startswith("points_category_")
]
X = df_wine.drop(columns=["points"] + leakage_cols)

# Create the target
y = df_wine["points"]

In [4]:
# Define the outputs as a 1-D NumPy array.
# `Series.ravel()` is deprecated in recent pandas; `to_numpy()` is the
# supported way to get the underlying array.
y = df_wine["points"].to_numpy()
y[:5]

array([87, 87, 87, 87, 87], dtype=int64)

In [5]:
# Partition features and target into train/test subsets with a fixed seed.
split_parts = train_test_split(X, y, random_state=27)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Build a StandardScaler and learn its statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train)
# `X_scaler` refers to the same fitted scaler object.
X_scaler = scaler

In [None]:
# Apply the fitted scaler to the train and test features.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Fit the Random Forest Model

In [None]:
# Create a random forest regression.
# n_jobs=-1 builds the 500 trees in parallel on all available cores; the
# trees are still seeded from random_state, so the fit stays reproducible.
rf_model = RandomForestRegressor(n_estimators=500, random_state=27, n_jobs=-1)

In [None]:
# Train the forest on the scaled training features and their targets.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

## Make Predictions

In [None]:
# Predict points for the scaled held-out test features.
predictions = rf_model.predict(X=X_test_scaled)

## Evaluate the Model

In [None]:
# Calculate the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

In [None]:
# Calculate the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [None]:
# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

## Rank Importance of Features

In [None]:
# Read the per-feature importance scores from the fitted random forest
# (`rf_model` must already have been fitted) and display the raw array.
importances = rf_model.feature_importances_
importances

In [None]:
# Sort the features by their importance (highest first) and show only the
# top entries; the one-hot encoded feature matrix is too wide to list in full.
TOP_N_FEATURES = 20
feature_ranking = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_ranking.sort_values(ascending=False).head(TOP_N_FEATURES)