In [1]:
import numpy as np
import pandas as pd

import os
# Show which files are available in the local input folder.
input_files = os.listdir("./input")
print(input_files)

['winemag-data-130k-v2.csv', 'winemag-data-130k-v2.json', 'winemag-data_first150k.csv']


In [2]:
# Load the 130k-review wine dataset into a DataFrame.
data = pd.read_csv(filepath_or_buffer='./input/winemag-data-130k-v2.csv')

In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing data.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129971 entries, 0 to 129970
Data columns (total 14 columns):
Unnamed: 0               129971 non-null int64
country                  129908 non-null object
description              129971 non-null object
designation              92506 non-null object
points                   129971 non-null int64
price                    120975 non-null float64
province                 129908 non-null object
region_1                 108724 non-null object
region_2                 50511 non-null object
taster_name              103727 non-null object
taster_twitter_handle    98758 non-null object
title                    129971 non-null object
variety                  129970 non-null object
winery                   129971 non-null object
dtypes: float64(1), int64(2), object(11)
memory usage: 13.9+ MB


In [4]:
# Count missing values per column and express them as a percentage of all rows.
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data

Unnamed: 0,Total,Percent
region_2,79460,61.136715
designation,37465,28.825661
taster_twitter_handle,31213,24.015357
taster_name,26244,20.192197
region_1,21247,16.347493
price,8996,6.921544
province,63,0.048472
country,63,0.048472
variety,1,0.000769
winery,0,0.0


In [5]:

# Discard the rows that have no price recorded.
data = data.dropna(axis=0, subset=['price'])

In [6]:
# Report the dataset size and how many rows repeat an earlier (description, title) pair.
duplicate_mask = data.duplicated(['description', 'title'])
print("Total number of examples: ", data.shape[0])
print("Number of examples with the same title and description: ", data[duplicate_mask].shape[0])

Total number of examples:  120975
Number of examples with the same title and description:  9382


In [7]:
# Remove rows that repeat a (description, title) pair, then renumber the rows from 0.
data = (
    data
    .drop_duplicates(['description', 'title'])
    .reset_index(drop=True)
)

In [8]:
# Replace every remaining missing value, in every column, with the sentinel -1.
data = data.fillna(value=-1)

In [9]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import string
from wordcloud import WordCloud, STOPWORDS
import re

from nltk.tokenize import RegexpTokenizer


In [10]:
# Lower-case each description and replace every character that is not an
# ASCII letter with a space.
data['description'] = (
    data['description']
    .str.lower()
    .apply(lambda text: re.sub('[^a-zA-Z]', ' ', text))
)
data['description']

0         this is ripe and fruity  a wine that is smooth...
1         tart and snappy  the flavors of lime flesh and...
2         pineapple rind  lemon pith and orange blossom ...
3         much like the regular bottling from       this...
4         blackberry and raspberry aromas show a typical...
5         here s a bright  informal red that opens with ...
6         this dry and restrained wine offers spice in p...
7         savory dried thyme notes accent sunnier flavor...
8         this has great depth of flavor with its fresh ...
9         soft  supple plum envelopes an oaky structure ...
10        this is a dry wine  very spicy  with a tight  ...
11        slightly reduced  this wine offers a chalky  t...
12        building on     years and six generations of w...
13        zesty orange peels and apple notes abound in t...
14        baked plum  molasses  balsamic vinegar and che...
15        raw black cherry aromas are direct and simple ...
16        desiccated blackberry  leather

In [11]:
# Split each cleaned description into tokens matching the pattern \w+.
tokenizer = RegexpTokenizer(r'\w+')
words_descriptions = data['description'].map(tokenizer.tokenize)
words_descriptions.head()

0    [this, is, ripe, and, fruity, a, wine, that, i...
1    [tart, and, snappy, the, flavors, of, lime, fl...
2    [pineapple, rind, lemon, pith, and, orange, bl...
3    [much, like, the, regular, bottling, from, thi...
4    [blackberry, and, raspberry, aromas, show, a, ...
Name: description, dtype: object

In [12]:
# Flatten the token lists, store the token count of each description, and
# build the sorted vocabulary of distinct tokens.
all_words = []
for tokens in words_descriptions:
    all_words.extend(tokens)
data['description_lengths'] = list(map(len, words_descriptions))
VOCAB = sorted(set(all_words))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))

4624968 words total, with a vocabulary size of 29486


In [13]:
from collections import Counter
# Tally token frequencies and show the 100 most frequent tokens.
count_all_words = Counter(all_words)
count_all_words.most_common(n=100)

[('and', 302908),
 ('the', 190834),
 ('a', 154824),
 ('of', 149861),
 ('with', 104095),
 ('this', 98014),
 ('is', 81926),
 ('it', 74638),
 ('wine', 66708),
 ('flavors', 55626),
 ('in', 55172),
 ('to', 48455),
 ('s', 46898),
 ('fruit', 42627),
 ('on', 40239),
 ('that', 34359),
 ('aromas', 34293),
 ('palate', 33563),
 ('finish', 30983),
 ('acidity', 28935),
 ('from', 27774),
 ('but', 27565),
 ('tannins', 25883),
 ('drink', 25692),
 ('cherry', 25586),
 ('black', 24936),
 ('are', 22572),
 ('ripe', 22538),
 ('has', 20419),
 ('for', 19024),
 ('red', 18603),
 ('by', 17485),
 ('notes', 16619),
 ('spice', 16210),
 ('oak', 16022),
 ('an', 15673),
 ('as', 15504),
 ('its', 15195),
 ('dry', 15044),
 ('nose', 14962),
 ('now', 14954),
 ('rich', 14690),
 ('berry', 14530),
 ('fresh', 14506),
 ('full', 13629),
 ('plum', 13077),
 ('sweet', 11813),
 ('apple', 11652),
 ('blend', 11580),
 ('soft', 11563),
 ('blackberry', 11319),
 ('well', 11317),
 ('white', 11010),
 ('fruits', 10844),
 ('light', 10839),
 ('

In [14]:
# Drop English stop words, stem the remaining tokens with the Porter stemmer,
# and store the re-joined text in data['description_cleaned'].
stopword_list = stopwords.words('english')
# Membership tests run once per token; a set makes each lookup constant-time
# instead of scanning the whole stop-word list every time.
stopword_set = set(stopword_list)
ps = PorterStemmer()
# Filter on the raw token first, then stem it -- the same order as filtering
# the whole list and stemming afterwards, but in a single pass.
words_descriptions = words_descriptions.apply(
    lambda tokens: [ps.stem(token) for token in tokens if token not in stopword_set]
)
data['description_cleaned'] = words_descriptions.apply(lambda stems: ' '.join(stems))

In [15]:
# Recompute the corpus statistics on the current contents of words_descriptions.
all_words = [token for tokens in words_descriptions for token in tokens]
VOCAB = sorted(set(all_words))
count_all_words = Counter(all_words)
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))
count_all_words.most_common(100)

2822364 words total, with a vocabulary size of 21073


[('wine', 69125),
 ('flavor', 62686),
 ('fruit', 53836),
 ('finish', 35863),
 ('aroma', 35564),
 ('palat', 33674),
 ('acid', 33330),
 ('cherri', 29505),
 ('drink', 28905),
 ('tannin', 27717),
 ('black', 24963),
 ('ripe', 24037),
 ('dri', 22844),
 ('note', 21892),
 ('spice', 20040),
 ('red', 18821),
 ('rich', 18382),
 ('fresh', 18095),
 ('berri', 16569),
 ('oak', 16557),
 ('show', 15940),
 ('nose', 14976),
 ('plum', 14252),
 ('sweet', 13919),
 ('full', 13729),
 ('offer', 13698),
 ('blackberri', 13395),
 ('textur', 13370),
 ('blend', 13280),
 ('appl', 13155),
 ('balanc', 13005),
 ('bodi', 13003),
 ('soft', 12045),
 ('age', 11719),
 ('crisp', 11409),
 ('well', 11328),
 ('white', 11150),
 ('light', 11149),
 ('dark', 10653),
 ('structur', 10643),
 ('citru', 10109),
 ('raspberri', 9909),
 ('cabernet', 9858),
 ('vanilla', 9829),
 ('hint', 9750),
 ('herb', 9717),
 ('miner', 9669),
 ('fruiti', 9653),
 ('bright', 9380),
 ('give', 9222),
 ('pepper', 9131),
 ('touch', 8885),
 ('lemon', 8666),
 ('y

In [18]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostRegressor, cv

def prepare_dataframe(vect, data, features=True):
    """Build the model inputs from the wine DataFrame.

    Parameters
    ----------
    vect : object exposing ``fit_transform(texts)`` whose result has ``toarray()``
        Text vectorizer; it is fitted on ``data['description_cleaned']``.
    data : pandas.DataFrame
        Must contain ``'description_cleaned'`` and ``'points'``. When
        ``features`` is truthy it must also contain ``'Unnamed: 0'`` and
        ``'description'``, which are dropped.
    features : bool, default True
        If truthy, the remaining columns of ``data`` are placed in front of the
        vectorized text columns; otherwise only the vectorized text is used.

    Returns
    -------
    X : pandas.DataFrame
        Feature matrix with a fresh 0..n-1 row index.
    y : pandas.Series
        ``data['points']`` re-indexed 0..n-1 so its labels line up with ``X``.
    categorical_features_indices : list of int
        Positions (within ``X``) of the non-numeric columns; empty when
        ``features`` is falsy.
    """
    # NOTE(review): the vectorizer is fitted on every row passed in, so any
    # later train/test split has already seen the held-out text's vocabulary.
    vectorized = pd.DataFrame(vect.fit_transform(data['description_cleaned']).toarray())
    if features:
        X = data.drop(columns=['points', 'Unnamed: 0', 'description', 'description_cleaned'])
        X = X.fillna(-1)
        print(X.columns)
        # Derive the categorical positions from the dtypes instead of
        # hard-coding them, so a changed column layout cannot silently mark
        # the wrong columns. The columns keep these positions after the concat
        # because they are placed first.
        categorical_features_indices = [
            position
            for position, dtype in enumerate(X.dtypes)
            if not pd.api.types.is_numeric_dtype(dtype)
        ]
        X = pd.concat([X.reset_index(drop=True), vectorized], axis=1)
    else:
        X = vectorized
        categorical_features_indices = []
    # X always carries a 0..n-1 index; give y the same labels.
    y = data['points'].reset_index(drop=True)
    return X, y, categorical_features_indices

In [19]:
# Model definition and training.
def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between two equally long sequences."""
    residuals = np.asarray(y_true, dtype=float).ravel() - np.asarray(y_pred, dtype=float).ravel()
    return float(np.sqrt(np.mean(residuals ** 2)))


def perform_model(X_train, y_train,X_valid, y_valid,X_test, y_test,categorical_features_indices,name,model=None):
    """Fit a regressor and print its RMSE on the training and test data.

    Parameters
    ----------
    X_train, y_train : training features and targets passed to ``fit``.
    X_valid, y_valid : validation data passed to ``fit`` as ``eval_set``.
    X_test, y_test : held-out data used only for the printed test RMSE.
    categorical_features_indices : list of int
        Column positions passed to ``fit`` as ``cat_features``.
    name : str
        Label that prefixes both printed lines.
    model : optional
        Object with ``fit`` and ``predict``. When omitted, a
        ``CatBoostRegressor`` (seed 100, RMSE loss, 800 iterations) is created.

    Returns
    -------
    None
    """
    if model is None:
        model = CatBoostRegressor(
            random_seed = 100,
            loss_function = 'RMSE',
            iterations=800,
        )

    model.fit(
        X_train, y_train,
        cat_features = categorical_features_indices,
        verbose=False,
        eval_set=(X_valid, y_valid)
    )

    # Compute RMSE from the predictions rather than via model.score():
    # current CatBoost releases document CatBoostRegressor.score as R^2, so
    # relying on it would print a value that does not match the "RMSE" label.
    train_rmse = _rmse(y_train, model.predict(X_train))
    test_rmse = _rmse(y_test, model.predict(X_test))

    print(name+" technique RMSE on training data: "+ str(train_rmse))
    print(name+" technique RMSE on test data: "+ str(test_rmse))
    

In [20]:
def prepare_variable(vect, data, features_append=True):
    """Build the features via prepare_dataframe and split them three ways.

    Returns a 7-tuple ``(X_train, y_train, X_valid, y_valid, X_test, y_test,
    categorical_features_indices)`` in exactly that order.
    """
    X, y, categorical_features_indices = prepare_dataframe(vect, data, features_append)

    # First hold out 5% of all rows as the test set ...
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.05, random_state=42
    )
    # ... then set aside 20% of what is left for validation.
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_rest, y_rest, test_size=0.2, random_state=52
    )

    return (
        X_train, y_train,
        X_valid, y_valid,
        X_test, y_test,
        categorical_features_indices,
    )

In [21]:
# Bag-of-words counts (max_features=500) together with the metadata columns.
vect = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w+',
    max_features=500,
)
training_variable = prepare_variable(vect, data, features_append=True)
perform_model(*training_variable, name='Bag of Words Counts')

Index(['country', 'designation', 'price', 'province', 'region_1', 'region_2',
       'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery',
       'description_lengths'],
      dtype='object')
Bag of Words Counts technique RMSE on training data: 1.506708311870242
Bag of Words Counts technique RMSE on test data: 1.5843565629879326


In [22]:
# TF-IDF features (max_features=500) together with the metadata columns.
vect= TfidfVectorizer(analyzer='word', token_pattern=r'\w+',max_features=500)
training_variable=prepare_variable(vect, data)
# The stray leading whitespace before this call was removed: a top-level
# statement must start at column 0 or Python raises an IndentationError.
perform_model(*training_variable, 'TF-IDF')

Index(['country', 'designation', 'price', 'province', 'region_1', 'region_2',
       'taster_name', 'taster_twitter_handle', 'title', 'variety', 'winery',
       'description_lengths'],
      dtype='object')
TF-IDF technique RMSE on training data: 1.5099931287233164
TF-IDF technique RMSE on test data: 1.5853094671747783


In [23]:
# Bag-of-words counts (max_features=500) on their own, without the metadata columns.
vect = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w+',
    max_features=500,
)
training_variable = prepare_variable(vect, data, features_append=False)
perform_model(*training_variable, name='Bag of Words Counts')

Bag of Words Counts technique RMSE on training data: 2.06001693488053
Bag of Words Counts technique RMSE on test data: 2.0799062244442212


In [24]:
# TF-IDF features (max_features=500) on their own, without the metadata columns.
vect = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w+',
    max_features=500,
)
training_variable = prepare_variable(vect, data, features_append=False)
perform_model(*training_variable, name='TF-IDF')

TF-IDF technique RMSE on training data: 2.0218508680522858
TF-IDF technique RMSE on test data: 2.0412736801000078
