In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import pandas_profiling

import ppscore as pps

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier #Import Random Forest classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

In [2]:
df_train_1 = pd.read_csv('train.csv')
df_test_1 = pd.read_csv('test.csv')

In [3]:
df_train = df_train_1.copy()  #creating a copy to avoid modifying original dataset
df_test = df_test_1.copy()

In [4]:
df_train.shape

(82657, 12)

In [5]:
df_test.shape

(20665, 11)

# Train Data cleaning 

In [7]:
df_train.isnull().sum()

user_name             19393
country                  35
review_title              0
review_description        0
designation           23647
points                    0
price                  5569
province                 35
region_1              12754
region_2              46708
winery                    0
variety                   0
dtype: int64

In [8]:
df_train.isna().sum()

user_name             19393
country                  35
review_title              0
review_description        0
designation           23647
points                    0
price                  5569
province                 35
region_1              12754
region_2              46708
winery                    0
variety                   0
dtype: int64

In [9]:
cols = ['winery','country','region_1','province']  #feature cols
rev = ['user_name','review_title','review_description','points']  #review_cols

### Feature selection as per train notebook

In [10]:
df_train = df_train.drop(rev,axis=1) #dropping review cols
df_test = df_test.drop(rev,axis=1)

In [11]:
df_train = df_train.drop('region_2',axis=1) #dropping region_2 (feature selection)
df_test = df_test.drop('region_2',axis=1) 

In [12]:
df_train.shape

(82657, 7)

In [13]:
df_test.shape

(20665, 6)

In [14]:
df_train = df_train.dropna()  #removing missing values

In [15]:
df_train.shape

(45259, 7)

In [16]:
df_test.shape

(20665, 6)

In [17]:
df_train = df_train.reset_index()
df_train

Unnamed: 0,index,country,designation,price,province,region_1,winery,variety
0,0,Australia,Peace Family Vineyard,10.0,Australia Other,South Eastern Australia,Andrew Peace,Chardonnay
1,2,Italy,Conca,80.0,Piedmont,Barolo,Renato Ratti,Nebbiolo
2,3,France,L'Abbaye,22.0,Southwest France,Bergerac Sec,Domaine l'Ancienne Cure,Bordeaux-style White Blend
3,4,France,Le Cèdre Vintage,33.0,France Other,Vin de Liqueur,Château du Cèdre,Malbec
4,5,Argentina,Finca La Escondida Reserva,13.0,Other,San Juan,Andean,Cabernet Sauvignon
...,...,...,...,...,...,...,...,...
45254,82646,France,Notre Dame des Champs,65.0,Southwest France,Cahors,Domaine de Cause,Malbec
45255,82652,Spain,Crianza,12.0,Northern Spain,Rioja,Montecillo,Tempranillo
45256,82653,US,Single Vineyard,21.0,New Mexico,New Mexico,Vivác Winery,Cabernet Sauvignon
45257,82654,France,Nouveau,14.0,Beaujolais,Beaujolais-Villages,Domaine de la Madone,Gamay


In [18]:
df1 = pd.get_dummies(df_train[cols])  # encode categorical variables

In [19]:
df1

Unnamed: 0,winery_1+1=3,winery_100 Percent Wine,winery_1000 Stories,winery_12 Linajes,winery_12C Wines,winery_14 Hands,winery_1752 Signature Wines,winery_18401 Cellars,winery_2 Lads,winery_29 & Oak Wines,...,province_Spain Other,province_Spanish Islands,province_Tasmania,province_Texas,province_Tuscany,province_Veneto,province_Victoria,province_Virginia,province_Washington,province_Western Australia
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45255,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45256,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45257,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
df1 = df1.reset_index()

In [21]:
df1

Unnamed: 0,index,winery_1+1=3,winery_100 Percent Wine,winery_1000 Stories,winery_12 Linajes,winery_12C Wines,winery_14 Hands,winery_1752 Signature Wines,winery_18401 Cellars,winery_2 Lads,...,province_Spain Other,province_Spanish Islands,province_Tasmania,province_Texas,province_Tuscany,province_Veneto,province_Victoria,province_Virginia,province_Washington,province_Western Australia
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45254,45254,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45255,45255,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45256,45256,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45257,45257,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
#concatenate encoded features(categorical) and original features(categorical)
result = pd.concat([df_train,df1],axis=1) 

In [23]:
result.drop(cols,axis=1,inplace=True) #Drop orginal categorical feature and keep encoded ones

In [25]:
result

Unnamed: 0,index,designation,price,variety,index.1,winery_1+1=3,winery_100 Percent Wine,winery_1000 Stories,winery_12 Linajes,winery_12C Wines,...,province_Spain Other,province_Spanish Islands,province_Tasmania,province_Texas,province_Tuscany,province_Veneto,province_Victoria,province_Virginia,province_Washington,province_Western Australia
0,0,Peace Family Vineyard,10.0,Chardonnay,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Conca,80.0,Nebbiolo,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,L'Abbaye,22.0,Bordeaux-style White Blend,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Le Cèdre Vintage,33.0,Malbec,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Finca La Escondida Reserva,13.0,Cabernet Sauvignon,4,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45254,82646,Notre Dame des Champs,65.0,Malbec,45254,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45255,82652,Crianza,12.0,Tempranillo,45255,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45256,82653,Single Vineyard,21.0,Cabernet Sauvignon,45256,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45257,82654,Nouveau,14.0,Gamay,45257,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
result.drop('index',axis=1,inplace=True)

In [27]:
result = result.drop('designation',axis=1) #Feature Selection(Refer Train notebook source code)

In [28]:
result.shape

(45259, 9123)

In [29]:
result

Unnamed: 0,price,variety,winery_1+1=3,winery_100 Percent Wine,winery_1000 Stories,winery_12 Linajes,winery_12C Wines,winery_14 Hands,winery_1752 Signature Wines,winery_18401 Cellars,...,province_Spain Other,province_Spanish Islands,province_Tasmania,province_Texas,province_Tuscany,province_Veneto,province_Victoria,province_Virginia,province_Washington,province_Western Australia
0,10.0,Chardonnay,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,80.0,Nebbiolo,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22.0,Bordeaux-style White Blend,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33.0,Malbec,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13.0,Cabernet Sauvignon,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45254,65.0,Malbec,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45255,12.0,Tempranillo,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45256,21.0,Cabernet Sauvignon,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45257,14.0,Gamay,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
X_train = result.drop('variety',axis=1)  #all columns except variety 

In [31]:
y_train = result['variety']  #only target variable(variety)

In [32]:
X_train

Unnamed: 0,price,winery_1+1=3,winery_100 Percent Wine,winery_1000 Stories,winery_12 Linajes,winery_12C Wines,winery_14 Hands,winery_1752 Signature Wines,winery_18401 Cellars,winery_2 Lads,...,province_Spain Other,province_Spanish Islands,province_Tasmania,province_Texas,province_Tuscany,province_Veneto,province_Victoria,province_Virginia,province_Washington,province_Western Australia
0,10.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,80.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45254,65.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45255,12.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45256,21.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45257,14.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
y_train

0                        Chardonnay
1                          Nebbiolo
2        Bordeaux-style White Blend
3                            Malbec
4                Cabernet Sauvignon
                    ...            
45254                        Malbec
45255                   Tempranillo
45256            Cabernet Sauvignon
45257                         Gamay
45258            Cabernet Sauvignon
Name: variety, Length: 45259, dtype: object

# Test data feature encoding for model

In [35]:
df2 = pd.get_dummies(df_test[cols]) #encode categorical cols of test data

In [36]:
df2 = df2.reset_index()

In [37]:
df2

Unnamed: 0,index,winery_1+1=3,winery_10 Knots,winery_10Span,winery_13 Celsius,winery_14 Hands,winery_16X20,winery_1848 Winery,winery_1850,winery_1912 Winemakers,...,province_Weinviertel,province_Wellington,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20660,20660,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20661,20661,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20662,20662,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20663,20663,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
result2 = pd.concat([df_test,df2],axis=1) #concatenate original cols and encoded ones

In [39]:
result2.drop(cols,axis=1,inplace=True) #drop origial cols

In [40]:
result2

Unnamed: 0,designation,price,index,winery_1+1=3,winery_10 Knots,winery_10Span,winery_13 Celsius,winery_14 Hands,winery_16X20,winery_1848 Winery,...,province_Weinviertel,province_Wellington,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa
0,Athena,35.0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Gran Reserva by Richard Bonvin,60.0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,,38.0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Chento Vineyard Selection,20.0,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,,49.0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20660,Rennie Vineyard,34.0,20660,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20661,Senhal d'Aric,20.0,20661,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20662,Reserve,85.0,20662,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20663,Tempranillo,9.0,20663,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
result2.drop('index',axis=1,inplace=True) #Feature selection for model

In [42]:
result2 = result2.drop('designation',axis=1) #Feature selection for model 

In [43]:
result2.shape

(20665, 8991)

In [44]:
result2

Unnamed: 0,price,winery_1+1=3,winery_10 Knots,winery_10Span,winery_13 Celsius,winery_14 Hands,winery_16X20,winery_1848 Winery,winery_1850,winery_1912 Winemakers,...,province_Weinviertel,province_Wellington,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa
0,35.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20660,34.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20661,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20662,85.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20663,9.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
X_test = result2  #Initialize X_test for model

In [46]:
X_test

Unnamed: 0,price,winery_1+1=3,winery_10 Knots,winery_10Span,winery_13 Celsius,winery_14 Hands,winery_16X20,winery_1848 Winery,winery_1850,winery_1912 Winemakers,...,province_Weinviertel,province_Wellington,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa
0,35.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20660,34.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20661,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20662,85.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20663,9.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [47]:
X_train, X_test = X_train.align(X_test, join='inner', axis=1)  
#Ensure features of both train and test are consistent

In [48]:
X_train

Unnamed: 0,price,winery_1+1=3,winery_14 Hands,winery_2 Lads,winery_2Hawk,winery_3 Horse Ranch Vineyards,winery_3 Steves Winery,winery_:Nota Bene,winery_A Blooming Hill Vineyard,winery_A to Z,...,province_Spain Other,province_Spanish Islands,province_Tasmania,province_Texas,province_Tuscany,province_Veneto,province_Victoria,province_Virginia,province_Washington,province_Western Australia
0,10.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,80.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,33.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,13.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45254,65.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45255,12.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45256,21.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
45257,14.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
X_test

Unnamed: 0,price,winery_1+1=3,winery_14 Hands,winery_2 Lads,winery_2Hawk,winery_3 Horse Ranch Vineyards,winery_3 Steves Winery,winery_:Nota Bene,winery_A Blooming Hill Vineyard,winery_A to Z,...,province_Spain Other,province_Spanish Islands,province_Tasmania,province_Texas,province_Tuscany,province_Veneto,province_Victoria,province_Virginia,province_Washington,province_Western Australia
0,35.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20660,34.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20661,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20662,85.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20663,9.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
rf = RandomForestClassifier()  #Random Forest Classifier for our model

In [51]:
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [52]:
#Nan values unacceptable for model 
X_test['price'] = X_test['price'].fillna(X_test['price'].mean())  

In [53]:
X_test['price'].isnull().sum()   

0

In [54]:
y_pred =rf.predict(X_test)   #Store predicted variety of test data

In [55]:
y_pred_to_df = pd.DataFrame(y_pred,columns=['variety']) #convert to df for concatenation purposes
y_pred_to_df

Unnamed: 0,variety
0,Pinot Noir
1,Malbec
2,Cabernet Sauvignon
3,Malbec
4,Sangiovese
...,...
20660,Malbec
20661,Rosé
20662,Cabernet Sauvignon
20663,Rosé


In [56]:
result2

Unnamed: 0,price,winery_1+1=3,winery_10 Knots,winery_10Span,winery_13 Celsius,winery_14 Hands,winery_16X20,winery_1848 Winery,winery_1850,winery_1912 Winemakers,...,province_Weinviertel,province_Wellington,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa
0,35.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20660,34.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20661,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20662,85.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20663,9.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
final_result  = pd.concat([result2,y_pred_to_df],axis=1)

In [58]:
final_result  #final result along with predicted variety

Unnamed: 0,price,winery_1+1=3,winery_10 Knots,winery_10Span,winery_13 Celsius,winery_14 Hands,winery_16X20,winery_1848 Winery,winery_1850,winery_1912 Winemakers,...,province_Wellington,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa,variety
0,35.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Pinot Noir
1,60.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Malbec
2,38.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Cabernet Sauvignon
3,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Malbec
4,49.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Sangiovese
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20660,34.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Malbec
20661,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Rosé
20662,85.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Cabernet Sauvignon
20663,9.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Rosé


# Reversal of OneHotEnocode

 We do this so that we obtain the original categorical columns

In [60]:
df_test[cols].columns #Only  feature columns 

Index(['winery', 'country', 'region_1', 'province'], dtype='object')

In [61]:
categorical_cols = df_test[cols].columns

In [62]:
def from_dummies(data, categories, prefix_sep='_'):
    out = data.copy()
    for l in categories:
        cols, labs = [[c.replace(x,"") for c in data.columns if l+prefix_sep in c] for x in ["", l+prefix_sep]]
        out[l] = pd.Categorical(np.array(labs)[np.argmax(data[cols].to_numpy(), axis=1)])
        out.drop(cols, axis=1, inplace=True)
    return out

In [63]:
result2

Unnamed: 0,price,winery_1+1=3,winery_10 Knots,winery_10Span,winery_13 Celsius,winery_14 Hands,winery_16X20,winery_1848 Winery,winery_1850,winery_1912 Winemakers,...,province_Weinviertel,province_Wellington,province_Western Australia,province_Western Cape,province_Wiener Gemischter Satz,province_Zenata,province_Österreichischer Perlwein,province_Österreichischer Sekt,province_Štajerska,province_Župa
0,35.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,60.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,38.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,49.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20660,34.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20661,20.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20662,85.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20663,9.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
final_result_to_df = from_dummies(final_result,categories=categorical_cols)

In [65]:
final_result_to_df  #final df without review columns

Unnamed: 0,price,variety,winery,country,region_1,province
0,35.0,Pinot Noir,Boedecker Cellars,US,Willamette Valley,Oregon
1,60.0,Malbec,Mendoza Vineyards,Argentina,Mendoza,Mendoza Province
2,38.0,Cabernet Sauvignon,Prime,US,Coombsville,California
3,20.0,Malbec,Bodega Cuarto Dominio,Argentina,Mendoza,Mendoza Province
4,49.0,Sangiovese,SassodiSole,Italy,Brunello di Montalcino,Tuscany
...,...,...,...,...,...,...
20660,34.0,Malbec,Yorkville Cellars,US,Yorkville Highlands,California
20661,20.0,Rosé,Château Ribaute,France,Corbières,Languedoc-Roussillon
20662,85.0,Cabernet Sauvignon,Daou,US,Paso Robles,California
20663,9.0,Rosé,Peñascal,Spain,Vino de la Tierra de Castilla y León,Northern Spain


#### Combining review features ,designation, region_2 which was dropped earlier with final_result

In [69]:
dropped_cols = ['user_name', 'review_title', 'review_description', 'points','designation','region_2']
test_df_review_cols = df_test_1[dropped_cols]
test_df_review_cols

Unnamed: 0,user_name,review_title,review_description,points,designation,region_2
0,@paulgwine,Boedecker Cellars 2011 Athena Pinot Noir (Will...,Nicely differentiated from the companion Stewa...,88,Athena,Willamette Valley
1,@wineschach,Mendoza Vineyards 2012 Gran Reserva by Richard...,"Charred, smoky, herbal aromas of blackberry tr...",90,Gran Reserva by Richard Bonvin,
2,@vboone,Prime 2013 Chardonnay (Coombsville),"Slightly sour and funky in earth, this is a re...",87,,Napa
3,@wineschach,Bodega Cuarto Dominio 2012 Chento Vineyard Sel...,"This concentrated, midnight-black Malbec deliv...",91,Chento Vineyard Selection,
4,@kerinokeefe,SassodiSole 2012 Brunello di Montalcino,"Earthy aromas suggesting grilled porcini, leat...",90,,
...,...,...,...,...,...,...
20660,@gordone_cellars,Yorkville Cellars 2013 Rennie Vineyard Caberne...,"Clearly focused and fruit-driven, this wine ha...",91,Rennie Vineyard,North Coast
20661,@laurbuzz,Château Ribaute 2015 Senhal d'Aric Rosé (Corbi...,Herbal tones of bay and rosemary are upfront o...,84,Senhal d'Aric,
20662,@mattkettmann,Daou 2014 Reserve Cabernet Sauvignon (Paso Rob...,"Mocha cream, pencil shaving and dried herb aro...",94,Reserve,Central Coast
20663,@wineschach,Peñascal 2011 Tempranillo Rosé (Vino de la Tie...,Loud citrus and berry aromas precede an overlo...,80,Tempranillo,


In [70]:
final_result_df_all = pd.concat([test_df_review_cols,final_result_to_df],axis=1)
final_result_df_all   #Final df with all feature including predicted variety

Unnamed: 0,user_name,review_title,review_description,points,designation,region_2,price,variety,winery,country,region_1,province
0,@paulgwine,Boedecker Cellars 2011 Athena Pinot Noir (Will...,Nicely differentiated from the companion Stewa...,88,Athena,Willamette Valley,35.0,Pinot Noir,Boedecker Cellars,US,Willamette Valley,Oregon
1,@wineschach,Mendoza Vineyards 2012 Gran Reserva by Richard...,"Charred, smoky, herbal aromas of blackberry tr...",90,Gran Reserva by Richard Bonvin,,60.0,Malbec,Mendoza Vineyards,Argentina,Mendoza,Mendoza Province
2,@vboone,Prime 2013 Chardonnay (Coombsville),"Slightly sour and funky in earth, this is a re...",87,,Napa,38.0,Cabernet Sauvignon,Prime,US,Coombsville,California
3,@wineschach,Bodega Cuarto Dominio 2012 Chento Vineyard Sel...,"This concentrated, midnight-black Malbec deliv...",91,Chento Vineyard Selection,,20.0,Malbec,Bodega Cuarto Dominio,Argentina,Mendoza,Mendoza Province
4,@kerinokeefe,SassodiSole 2012 Brunello di Montalcino,"Earthy aromas suggesting grilled porcini, leat...",90,,,49.0,Sangiovese,SassodiSole,Italy,Brunello di Montalcino,Tuscany
...,...,...,...,...,...,...,...,...,...,...,...,...
20660,@gordone_cellars,Yorkville Cellars 2013 Rennie Vineyard Caberne...,"Clearly focused and fruit-driven, this wine ha...",91,Rennie Vineyard,North Coast,34.0,Malbec,Yorkville Cellars,US,Yorkville Highlands,California
20661,@laurbuzz,Château Ribaute 2015 Senhal d'Aric Rosé (Corbi...,Herbal tones of bay and rosemary are upfront o...,84,Senhal d'Aric,,20.0,Rosé,Château Ribaute,France,Corbières,Languedoc-Roussillon
20662,@mattkettmann,Daou 2014 Reserve Cabernet Sauvignon (Paso Rob...,"Mocha cream, pencil shaving and dried herb aro...",94,Reserve,Central Coast,85.0,Cabernet Sauvignon,Daou,US,Paso Robles,California
20663,@wineschach,Peñascal 2011 Tempranillo Rosé (Vino de la Tie...,Loud citrus and berry aromas precede an overlo...,80,Tempranillo,,9.0,Rosé,Peñascal,Spain,Vino de la Tierra de Castilla y León,Northern Spain


In [72]:
df_test_1  #for reference with original test dataset

Unnamed: 0,user_name,country,review_title,review_description,designation,points,price,province,region_1,region_2,winery
0,@paulgwine,US,Boedecker Cellars 2011 Athena Pinot Noir (Will...,Nicely differentiated from the companion Stewa...,Athena,88,35.0,Oregon,Willamette Valley,Willamette Valley,Boedecker Cellars
1,@wineschach,Argentina,Mendoza Vineyards 2012 Gran Reserva by Richard...,"Charred, smoky, herbal aromas of blackberry tr...",Gran Reserva by Richard Bonvin,90,60.0,Mendoza Province,Mendoza,,Mendoza Vineyards
2,@vboone,US,Prime 2013 Chardonnay (Coombsville),"Slightly sour and funky in earth, this is a re...",,87,38.0,California,Coombsville,Napa,Prime
3,@wineschach,Argentina,Bodega Cuarto Dominio 2012 Chento Vineyard Sel...,"This concentrated, midnight-black Malbec deliv...",Chento Vineyard Selection,91,20.0,Mendoza Province,Mendoza,,Bodega Cuarto Dominio
4,@kerinokeefe,Italy,SassodiSole 2012 Brunello di Montalcino,"Earthy aromas suggesting grilled porcini, leat...",,90,49.0,Tuscany,Brunello di Montalcino,,SassodiSole
...,...,...,...,...,...,...,...,...,...,...,...
20660,@gordone_cellars,US,Yorkville Cellars 2013 Rennie Vineyard Caberne...,"Clearly focused and fruit-driven, this wine ha...",Rennie Vineyard,91,34.0,California,Yorkville Highlands,North Coast,Yorkville Cellars
20661,@laurbuzz,France,Château Ribaute 2015 Senhal d'Aric Rosé (Corbi...,Herbal tones of bay and rosemary are upfront o...,Senhal d'Aric,84,20.0,Languedoc-Roussillon,Corbières,,Château Ribaute
20662,@mattkettmann,US,Daou 2014 Reserve Cabernet Sauvignon (Paso Rob...,"Mocha cream, pencil shaving and dried herb aro...",Reserve,94,85.0,California,Paso Robles,Central Coast,Daou
20663,@wineschach,Spain,Peñascal 2011 Tempranillo Rosé (Vino de la Tie...,Loud citrus and berry aromas precede an overlo...,Tempranillo,80,9.0,Northern Spain,Vino de la Tierra de Castilla y León,,Peñascal


In [76]:
#Export result to csv
pred_csv = final_result_df_all.to_csv('test_data_with_prediction.csv')

--------------------------------------------------------------------------------
#### Author: Vishak G