In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from nltk.stem import *
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime, timedelta, date
%matplotlib inline
plt.style.use('ggplot')

import nltk
import string

from nltk.corpus import stopwords
pstemmer = nltk.PorterStemmer()

In [2]:
y_col = 'region_2'

In [3]:
df = pd.read_csv('data/winemag-data-130k-v2.csv')

In [4]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
63663,63663,France,"Produced from 10- to 20-year-old vines, this w...",Sur le Fort,88,30.0,Loire Valley,Sancerre,,Roger Voss,@vossroger,Domaine Fouassier 2015 Sur le Fort (Sancerre),Sauvignon Blanc,Domaine Fouassier
23737,23737,Portugal,This blend of Encruzado and Malvasia Fina is b...,Jardim da Estrela Branco,87,10.0,Dão,,,Roger Voss,@vossroger,Magnum Vinhos 2014 Jardim da Estrela Branco Wh...,Portuguese White,Magnum Vinhos
126232,126232,France,The wine has a fine smoky character lying over...,,88,20.0,Bordeaux,Bordeaux Supérieur,,Roger Voss,@vossroger,Château de Macard 2015 Bordeaux Supérieur,Bordeaux-style Red Blend,Château de Macard
124861,124861,US,This bone-dry high-acid wine puts flavors of g...,Pinot Noir,88,25.0,Oregon,Dundee Hills,Willamette Valley,Paul Gregutt,@paulgwine,Stoller 2016 Pinot Noir Rosé (Dundee Hills),Rosé,Stoller
105562,105562,France,91–93. Barrel sample. This is a densely tannic...,Barrel Sample,92,,Bordeaux,Haut-Médoc,,Roger Voss,@vossroger,Château Beaumont 2012 Barrel Sample (Haut-Médoc),Bordeaux-style Red Blend,Château Beaumont


In [5]:
df.description.sample(20).values

array([ 'Fortified dessert wines made from Malbec follow a tradition going back to the Middle Ages. This has a tawny-Port like quality, but is more structured and concentrated with a palate full of raisins and black-fruit flavors.',
       'Rich and structured, yet full and concentrated, this wine has both dark tannins and ripe plum and berry fruits. The acidity streaks through the wine, lending freshness and good aging potential. Drink from 2017.',
       "Charred, tarry, smoky aromas are gritty and outpace this boutique wine's fruit character. Hard and tannic, with citric acidity, this is steered by flavors of charred oak, licorice, chocolate and barrel resin. Toasty, spicy raspberry and blackcurrant flavors finish it up. Drink through 2026.",
       'This shows an evident layer of oak in the form of buttered toast and vanilla notes, but the underlying wine is strong in pineapple and golden mango flavors. Fine coastal acidity and a hint of mineral provide structural balance.',
      

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
Unnamed: 0               129971 non-null int64
country                  129908 non-null object
description              129971 non-null object
designation              92506 non-null object
points                   129971 non-null int64
price                    120975 non-null float64
province                 129908 non-null object
region_1                 108724 non-null object
region_2                 50511 non-null object
taster_name              103727 non-null object
taster_twitter_handle    98758 non-null object
title                    129971 non-null object
variety                  129970 non-null object
winery                   129971 non-null object
dtypes: float64(1), int64(2), object(11)
memory usage: 13.9+ MB


In [7]:
for col in df.columns:
    print(col, df[col].unique().size)

Unnamed: 0 129971
country 44
description 119955
designation 37980
points 21
price 391
province 426
region_1 1230
region_2 18
taster_name 20
taster_twitter_handle 16
title 118840
variety 708
winery 16757


In [8]:
df[y_col].fillna('none').value_counts(1)

none                 0.611367
Central Coast        0.085134
Sonoma               0.069462
Columbia Valley      0.062345
Napa                 0.052427
Willamette Valley    0.026337
California Other     0.020489
Finger Lakes         0.013672
Sierra Foothills     0.011249
Napa-Sonoma          0.008994
Central Valley       0.008171
Southern Oregon      0.007055
Oregon Other         0.005594
Long Island          0.005232
North Coast          0.004493
Washington Other     0.004109
South Coast          0.002093
New York Other       0.001777
Name: region_2, dtype: float64

In [9]:
df_nona = df[df[y_col].fillna('N') != 'N']

In [10]:
label_map = {val: idx for idx, val in enumerate(df[y_col].unique())}

In [11]:
y = df_nona[y_col].replace(label_map).values

In [12]:
reviews = df_nona.description

In [13]:
swords = set(stopwords.words('english'))

In [14]:
ps = PorterStemmer()

In [15]:
word_vecs = reviews\
    .apply(lambda x: [re.sub("[\W]", '', i).lower().strip() for i in x.split()])\
    .apply(lambda x: [ps.stem(i) for i in x if i not in swords and len(i) > 3])

In [16]:
word_vecs.sample(10)

34498     [much, go, except, sweet, toasti, oaklik, arom...
91925     [dark, fragrant, interest, syrah, open, quickl...
64222     [variet, come, hedg, estat, vineyard, allnew, ...
28017     [dri, plum, dri, strawberri, dri, raspberri, m...
64645     [sharp, acid, earn, higher, score, green, mint...
125672    [sweet, viscos, mire, wine, otherwis, tri, sin...
85111     [crisp, zingi, acid, dryness, make, wine, inst...
13723     [fresh, green, appl, herb, note, flit, danc, t...
91871     [highton, raspberri, blackberri, fruit, dark, ...
120083    [oaki, toast, cherri, meet, tangi, juici, acid...
Name: description, dtype: object

In [17]:
back2sent = word_vecs.apply(lambda x: ' '.join(x))

In [18]:
transform = TfidfVectorizer(lowercase=False, min_df=.05)

In [19]:
tf_idf_matrix = transform.fit_transform(back2sent.values)

In [20]:
## build train_test_split
test_train = np.random.random(word_vecs.shape[0]) < .8
Xtr = tf_idf_matrix[test_train]
Xte = tf_idf_matrix[~test_train]
ytr = y[test_train]
yte = y[~test_train]

In [21]:
from sklearn.naive_bayes import GaussianNB

In [28]:
start = datetime.now()
modelgb = GaussianNB()
modelgb.fit(Xtr.toarray(), ytr)
print(modelgb.score(Xte.toarray(), yte))
print((datetime.now() - start).seconds)

0.166948730712
0


In [23]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
start = datetime.now()
modelrf = RandomForestClassifier(n_estimators=250, n_jobs=-1)
modelrf.fit(Xtr.toarray(), ytr)
print(modelrf.score(Xte.toarray(), yte))
print((datetime.now() - start).seconds)

0.523046291687
8


In [26]:
## run this line if needed xgboost is not a part of Sklearn
## we could also use GradientBoostingClassifier, but since it does not support 
## parallel processing, and XGboost typically gives near the same performance
## we will use XGboost
#!pip install xgboost

In [30]:
import xgboost as xgb

In [34]:
start = datetime.now()
modelxg3 = xgb.XGBClassifier(max_depth=3, learning_rate=.05, n_estimators=250, n_jobs=-1)
modelxg3.fit(Xtr.toarray(), ytr)
print(modelxg3.score(Xte.toarray(), yte))
print((datetime.now() - start).seconds)

0.4606271777
97


In [35]:
start = datetime.now()
modelxg5 = xgb.XGBClassifier(max_depth=5, learning_rate=.05, n_estimators=250, n_jobs=-1)
modelxg5.fit(Xtr.toarray(), ytr)
print(modelxg5.score(Xte.toarray(), yte))
print((datetime.now() - start).seconds)

0.483225485316
160


In [36]:
start = datetime.now()
modelxg_lr1 = xgb.XGBClassifier(max_depth=5, learning_rate=.1, n_estimators=250, n_jobs=-1)
modelxg_lr1.fit(Xtr.toarray(), ytr)
print(modelxg_lr1.score(Xte.toarray(), yte))
print((datetime.now() - start).seconds)

0.493280238925
159


In [37]:
start = datetime.now()
modelxg_t1000 = xgb.XGBClassifier(max_depth=5, learning_rate=.1, n_estimators=250, n_jobs=-1)
modelxg_t1000.fit(Xtr.toarray(), ytr)
print(modelxg_t1000.score(Xte.toarray(), yte))
print((datetime.now() - start).seconds)

0.493280238925
160


In [None]:
# dense_matrix = pd.DataFrame(tf_idf_matrix.todense())

# ## these may be in order of count, not the orde of the columns so this is were i messed up
# dense_matrix.columns = [v for v in transform.vocabulary_.keys()]

# dense_matrix.columns

# dense_matrix.sample(10)

# country_dm = dense_matrix.copy()

# country_dm['y'] = df['country']

In [None]:
start = datetime.now()
#process things
print((datetime.now() - start).seconds)

In [None]:
%%timeit -n 1