# Wine reviews part 2

With the added columns linking the tasters with the reviews let's dive into that in this python notebook.

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.model_selection import train_test_split
import pydotplus
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

Since the added columns are only present in the update file we only need to read that one in. The older reviews would be dropped anyways.

In [3]:
df = pd.read_csv("../input/winemag-data-130k-v2.csv", index_col=0)

In [4]:
df.describe()

Unnamed: 0,points,price
count,129971.0,120975.0
mean,88.447138,35.363389
std,3.03973,41.022218
min,80.0,4.0
25%,86.0,17.0
50%,88.0,25.0
75%,91.0,42.0
max,100.0,3300.0


In [5]:
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [10]:
df_nop = df.drop('designation', 1)
df_nop = df_nop.drop('region_1', 1)
df_nop = df_nop.drop('region_2', 1)
df_nop = df_nop.drop('description', 1)
df_nop = df_nop.drop('taster_twitter_handle', 1)
df_nop = df_nop.drop('title', 1)

In [11]:
df_nop = df_nop.dropna(how='any') # dropping any rows that have na values

df_nop.head()

Unnamed: 0,country,points,price,province,taster_name,variety,winery
1,Portugal,87,15.0,Douro,Roger Voss,Portuguese Red,Quinta dos Avidagos
2,US,87,14.0,Oregon,Paul Gregutt,Pinot Gris,Rainstorm
3,US,87,13.0,Michigan,Alexander Peartree,Riesling,St. Julian
4,US,87,65.0,Oregon,Paul Gregutt,Pinot Noir,Sweet Cheeks
5,Spain,87,15.0,Northern Spain,Michael Schachner,Tempranillo-Merlot,Tandem


In [12]:
len(df['taster_name'].value_counts())

19

In [20]:
taster_grouped = df_nop.groupby(['taster_name']).mean()
taster_grouped['counts'] = df_nop.groupby(['taster_name'])['taster_name'].count()
taster_grouped = taster_grouped.sort_values(by=['counts'], ascending=False)
taster_grouped = taster_grouped.sort_values(by=['points'], ascending=False)
taster_grouped

Unnamed: 0_level_0,points,price,counts
taster_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Anne Krebiehl MW,90.61257,31.24491,3389
Matt Kettmann,90.033991,38.642136,6237
Virginie Boone,89.216998,46.621963,9507
Mike DeSimone,89.095528,28.132114,492
Paul Gregutt,89.087396,33.644835,9497
Kerin O’Keefe,88.90875,41.953413,9874
Sean P. Sullivan,88.756751,34.085888,4925
Jim Gordon,88.625989,26.935507,4171
Roger Voss,88.606137,38.64996,20172
Joe Czerwinski,88.540519,35.182036,5010


Let's clean up the data like in part 1

In [21]:
y_price = df_nop.points
df_nop = df_nop.drop('points', 1) # 0 is for row, 1 is for column

y_price.head()

1    87
2    87
3    87
4    87
5    87
Name: points, dtype: int64

In [25]:
df_cat = df_nop.copy()
df_cat['country'] = df_cat['country'].astype('category')
df_cat['province'] = df_cat['province'].astype('category')
df_cat['variety'] = df_cat['variety'].astype('category')
df_cat['taster_name'] = df_cat['taster_name'].astype('category')
df_cat['winery'] = df_cat['winery'].astype('category')
df_cat.dtypes

country        category
price           float64
province       category
taster_name    category
variety        category
winery         category
dtype: object

In [26]:
#To change the string values of the country, province, and variety into categorical values for calculation
cat_columns = df_cat.select_dtypes(['category']).columns
cat_columns
df_cat[cat_columns] = df_cat[cat_columns].apply(lambda x: x.cat.codes)

df_cat.head()

Unnamed: 0,country,price,province,taster_name,variety,winery
1,30,15.0,105,15,415,10745
2,39,14.0,263,14,401,10795
3,39,13.0,213,0,443,11895
4,39,65.0,263,14,405,12066
5,36,15.0,257,12,548,12134


In [None]:
#Import LabelEncoder
from sklearn import preprocessing
#creating labelEncoder
le = preprocessing.LabelEncoder()
#Converting string labels into numbers.
weather_encoded=le.fit_transform(weather)
print(weather_encoded)

In [57]:
X_train_cat, X_test_cat, yprice_train, yprice_test = train_test_split(df_cat, y_price, test_size=0.20)
X_train_cat.head()

Unnamed: 0,country,price,province,taster_name,variety,winery
67821,39,25.0,263,12,405,1002
31108,39,30.0,50,7,61,5456
7831,2,34.0,332,8,496,7132
117672,35,40.0,402,10,405,1372
45601,21,40.0,367,9,302,173


## Decision Tree

In [64]:
from sklearn import tree
from sklearn import metrics
model_clf = tree.DecisionTreeClassifier()

In [67]:
model_clf = model_clf.fit(X_train_cat, yprice_train)
#model_clf.score(X_train_cat, yprice_train)
y_pred = model_clf.predict(X_test_cat)

In [68]:
#print(model_clf.score(X_train_cat, yprice_train))
print("Accuracy:",metrics.accuracy_score(yprice_test, y_pred))

Accuracy: 0.2551337896701929


Not so good...

## Random Forests Classification

In [61]:
#random forest classifier
from sklearn.ensemble import RandomForestClassifier
# Gaussian classifier
rand_forest_clf = RandomForestClassifier(n_estimators=100)
# Model training
rand_forest_clf.fit(X_train_cat, yprice_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [62]:
# Prediction
y_pred = rand_forest_clf.predict(X_test_cat)

In [63]:
# Model Accuracy
print("Accuracy:",metrics.accuracy_score(yprice_test, y_pred))

Accuracy: 0.260371292263016


I thought applying random forests instead of a classification and decision tree would yield better results. This is not much better.

## Nearest Neighbours

In [83]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_cat, yprice_train)
y_pred = knn.predict(X_test_cat)

In [84]:
print("Accuracy:",metrics.accuracy_score(yprice_test, y_pred))

Accuracy: 0.18943165318398672


Even worse