In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re
import pandas as pd

In [2]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Tony\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tony\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data = pd.read_csv('winemag-data-130k-v2.csv')

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118561 entries, 0 to 118560
Data columns (total 14 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             118561 non-null  int64  
 1   country                118506 non-null  object 
 2   description            118561 non-null  object 
 3   designation            84430 non-null   object 
 4   points                 118560 non-null  float64
 5   price                  110327 non-null  float64
 6   province               118505 non-null  object 
 7   region_1               99152 non-null   object 
 8   region_2               45976 non-null   object 
 9   taster_name            94544 non-null   object 
 10  taster_twitter_handle  90014 non-null   object 
 11  title                  118560 non-null  object 
 12  variety                118559 non-null  object 
 13  winery                 118560 non-null  object 
dtypes: float64(2), int64(1), object(11)


In [6]:
data['description'][0]

"Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."

In [7]:
data.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87.0,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87.0,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87.0,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87.0,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87.0,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


## Quitar stopwords

Esto le quita palabras que no van a servir para mucho a nivel de analisis

In [8]:
en_stopwords = stopwords.words('english')

In [9]:
data['description_no_stopwords'] = data['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

In [10]:
data['description_no_stopwords']

0         Aromas include tropical fruit, broom, brimston...
1         This ripe fruity, wine smooth still structured...
2         Tart snappy, flavors lime flesh rind dominate....
3         Pineapple rind, lemon pith orange blossom star...
4         Much like regular bottling 2012, comes across ...
                                ...                        
118556    Here's soft, rather one-dimensional Pinot Noir...
118557    Classic, crispy white berry- lime-flavored win...
118558    Young, clean, fresh Vinho Verde, ready drink, ...
118559    Rides Moscato wave sweet honeyed richness, car...
118560    A soft, round Cabernet that's got leather, ced...
Name: description_no_stopwords, Length: 118561, dtype: object

## Quitar punctiation

Aqui lo que estamos haciendo es dejar todo lo que no sea palabras o espacios en blanco por medio de expresiones regulares

In [11]:
data['description_no_stopwords_no_punct'] = data.apply(lambda x: re.sub(r"[^\w\s]", "", x["description_no_stopwords"]), axis = 1)

In [12]:
data['description_no_stopwords_no_punct']

0         Aromas include tropical fruit broom brimstone ...
1         This ripe fruity wine smooth still structured ...
2         Tart snappy flavors lime flesh rind dominate S...
3         Pineapple rind lemon pith orange blossom start...
4         Much like regular bottling 2012 comes across r...
                                ...                        
118556    Heres soft rather onedimensional Pinot Noir dr...
118557    Classic crispy white berry limeflavored wine I...
118558    Young clean fresh Vinho Verde ready drink refr...
118559    Rides Moscato wave sweet honeyed richness carr...
118560    A soft round Cabernet thats got leather cedar ...
Name: description_no_stopwords_no_punct, Length: 118561, dtype: object

## Tokenizacion

Aqui lo que sucede es que hacemos que todas las oraciones se conviertan en listas

In [13]:
data['tokenize'] = data.apply(lambda x: word_tokenize(x['description_no_stopwords_no_punct']), axis = 1)

In [14]:
data['tokenize'][0]

['Aromas',
 'include',
 'tropical',
 'fruit',
 'broom',
 'brimstone',
 'dried',
 'herb',
 'The',
 'palate',
 'overly',
 'expressive',
 'offering',
 'unripened',
 'apple',
 'citrus',
 'dried',
 'sage',
 'alongside',
 'brisk',
 'acidity']

## Stemmer(hacer este o Lemmatized)

Esto lo que hace es quitar prefijos o sufijos para intentar dejar las palabras como si fueran su palabra base. Se puede usar esto o Lemmatized que es igual, pero mas rapido

In [15]:
ps = PorterStemmer()

In [16]:
data['stemmed'] = data['tokenize'].apply(lambda tokens: [ps.stem(token) for token in tokens])

In [17]:
data['stemmed'][0]

['aroma',
 'includ',
 'tropic',
 'fruit',
 'broom',
 'brimston',
 'dri',
 'herb',
 'the',
 'palat',
 'overli',
 'express',
 'offer',
 'unripen',
 'appl',
 'citru',
 'dri',
 'sage',
 'alongsid',
 'brisk',
 'acid']

## Lemmatized

In [18]:
lemmatized = WordNetLemmatizer()

In [19]:
data['lemmatized'] = data['tokenize'].apply(lambda tokens:[lemmatized.lemmatize(token) for token in tokens])

In [20]:
data['lemmatized'][0]

['Aromas',
 'include',
 'tropical',
 'fruit',
 'broom',
 'brimstone',
 'dried',
 'herb',
 'The',
 'palate',
 'overly',
 'expressive',
 'offering',
 'unripened',
 'apple',
 'citrus',
 'dried',
 'sage',
 'alongside',
 'brisk',
 'acidity']

In [21]:
flattened = [token for doc in data['lemmatized'] for token in doc]

## N-Grams

Aqui lo que estamos haciendo es contar la cantidad de veces que se repiten las palabras en 1 y 2 cantidades

In [22]:
unigrams = (pd.Series(nltk.ngrams(flattened, 1)).value_counts())
print(unigrams)

(wine,)               75258
(flavor,)             60244
(fruit,)              53158
(The,)                47966
(This,)               37505
                      ...  
(Rubyviolet,)             1
(17thcentury,)            1
(contactdelivers,)        1
(fasttrack,)              1
(stilldense,)             1
Name: count, Length: 47699, dtype: int64


In [23]:
bigrams = (pd.Series(nltk.ngrams(flattened, 2)).value_counts())
print(bigrams)

(The, palate)      8478
(black, cherry)    7152
(The, wine)        6298
(fruit, flavor)    5460
(This, wine)       4809
                   ... 
(nicest, part)        1
(spice, nicest)       1
(time, olive)         1
(good, Super)         1
(rooty, The)          1
Name: count, Length: 859103, dtype: int64
