In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

%matplotlib inline

In [2]:
df = pd.read_csv('/Users/marvin/testcases/datarevenue/data_root/raw/wine_dataset.csv')
df = df.drop(df.columns[0], axis=1)
df['country'] = df['country'].fillna('unk')
df['country'] = df['country'].astype('category')

In [25]:
stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')
title_tokenizer = RegexpTokenizer(r'[a-zA-Z0-9\']+')
stop_words = text.ENGLISH_STOP_WORDS

def tokenize(text):
    return [stemmer.stem(word) for word in title_tokenizer.tokenize(text.lower()) if word not in stop_words]

In [33]:
vectorizer = TfidfVectorizer(tokenizer = tokenize, max_features = 1000, ngram_range=(1,2))
descriptions = df['description'].values
vectorizer.fit(descriptions)
description_encodings = vectorizer.transform(descriptions)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [34]:
vectorizer.vocabulary_

{'pepper': 651,
 'flavor wine': 373,
 'rare': 712,
 'bodi wine': 139,
 'plump': 682,
 'bacon': 83,
 'grenach': 438,
 'evolv': 327,
 'purpl': 704,
 'medium': 555,
 'aroma flavor': 68,
 'mourv dre': 583,
 'easi drink': 309,
 'bite': 110,
 'silki textur': 799,
 'classic': 212,
 'green': 434,
 'select': 788,
 'warm': 939,
 'standard': 841,
 'ripe wine': 760,
 'yeasti': 988,
 '2019': 12,
 'compon': 230,
 'appl pear': 62,
 'domin': 280,
 'drink': 289,
 'hazelnut': 447,
 'dark berri': 251,
 'plum flavor': 680,
 'live acid': 534,
 'lead': 505,
 'red currant': 726,
 'point': 685,
 'delici': 262,
 'bottl': 144,
 'produc': 696,
 'true': 913,
 'new oak': 600,
 'product': 697,
 'good acid': 421,
 'petit': 659,
 'rich fruit': 746,
 'chocol': 203,
 'charm': 188,
 'viognier': 934,
 'palat show': 641,
 'work': 981,
 'polish tannin': 687,
 'deepli': 258,
 'cherri plum': 199,
 'sparkl': 829,
 'headi': 448,
 'meyer lemon': 565,
 'ginger': 413,
 'appl': 59,
 'shape': 792,
 'shine': 794,
 'extra': 335,
 'pe

In [36]:
((df['description'] + " " + df['title']) ).values

array(["Fragrances suggest hay, crushed tomato vine and exotic fruit. The bright but structured palate delivers peach, papaya, cantaloupe and energizing mineral notes alongside fresh acidity. It's nicely balanced with good length, Tiefenbrunner 2012 Kirchleiten Sauvignon (Alto Adige)",
       'Packed with fruit and crisp acidity, this is a bright, light and perfumed wine. Red-berry flavors are lifted by red currants and a light spice. Drink now for total freshness. Bernard Reverdy et Fils 2014 Rosé (Sancerre)',
       'This easy, ruby-red wine displays fresh berry flavors and a light, crisp mouthfeel. Pair this no-fuss wine with homemade pasta sauce or potato gnocchi and cheese. Dievole 2009  Chianti Classico',
       ...,
       'This perfumed, herbal blend of Syrah and Grenache is tight and very crisp. It has intense tangy acidity and a lively fruity aftertaste. This is a bone-dry rosé with a desire for food. Château Roquefeuille 2016 Rosé in the Air Rosé (Côtes de Provence)',
      

In [12]:
description_encodings.shape

(10000, 1000)

In [8]:
tokenize(df['description'][0])

['fragranc',
 'suggest',
 'hay',
 'crush',
 'tomato',
 'vine',
 'exot',
 'fruit',
 'bright',
 'structur',
 'palat',
 'deliv',
 'peach',
 'papaya',
 'cantaloup',
 'energ',
 'miner',
 'note',
 'alongsid',
 'fresh',
 'acid',
 'it',
 'nice',
 'balanc',
 'good',
 'length']

In [4]:
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Fragrances suggest hay, crushed tomato vine an...",Kirchleiten,90,30.0,Northeastern Italy,Alto Adige,,Kerin O’Keefe,@kerinokeefe,Tiefenbrunner 2012 Kirchleiten Sauvignon (Alto...,Sauvignon,Tiefenbrunner
1,France,"Packed with fruit and crisp acidity, this is a...",,87,22.0,Loire Valley,Sancerre,,Roger Voss,@vossroger,Bernard Reverdy et Fils 2014 Rosé (Sancerre),Rosé,Bernard Reverdy et Fils
2,Italy,"This easy, ruby-red wine displays fresh berry ...",,86,,Tuscany,Chianti Classico,,,,Dievole 2009 Chianti Classico,Sangiovese,Dievole
3,US,Pretty in violet and rose petals this is a low...,Horseshoe Bend Vineyard,92,50.0,California,Russian River Valley,Sonoma,Virginie Boone,@vboone,Davis Family 2012 Horseshoe Bend Vineyard Pino...,Pinot Noir,Davis Family
4,US,This golden wine confounds in a mix of wet sto...,Dutton Ranch,93,38.0,California,Russian River Valley,Sonoma,Virginie Boone,@vboone,Dutton-Goldfield 2013 Dutton Ranch Chardonnay ...,Chardonnay,Dutton-Goldfield


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 13 columns):
country                  10000 non-null category
description              10000 non-null object
designation              7171 non-null object
points                   10000 non-null int64
price                    9323 non-null float64
province                 9994 non-null object
region_1                 8336 non-null object
region_2                 3853 non-null object
taster_name              8015 non-null object
taster_twitter_handle    7644 non-null object
title                    10000 non-null object
variety                  10000 non-null object
winery                   10000 non-null object
dtypes: category(1), float64(1), int64(1), object(10)
memory usage: 947.6+ KB


In [5]:
vectorizer = TfidfVectorizer()
vectorizer.fit(df['description'])
X = vectorizer.transform(df['description'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [6]:
X.shape

(10000, 11804)

In [18]:
enc = OneHotEncoder(handle_unknown='ignore')

In [19]:
enc.fit(df['country'])

ValueError: could not convert string to float: 'Australia'

In [20]:
country_encodings = pd.get_dummies(df['country'], prefix = 'country')

In [25]:
df = pd.concat([df, dfDummies], axis=1)

NameError: name 'dfDummies' is not defined

In [None]:
train_small_with_dummies = pd.get_dummies(train_small, sparse=True)
