In [1]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

# Machine Learning Approach

In [2]:
# Read the wine dataset CSV (path is relative to the notebook's working
# directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/winemag-data-130k-v2.csv')

## Data Exploration

In [3]:
# Number of distinct values in the `variety` column; dropna=False keeps a
# missing value counted as its own entry, matching len(unique()).
df['variety'].nunique(dropna=False)

708

In [4]:
# Row count per wine variety, most frequent first.
df['variety'].value_counts()

Pinot Noir                        13272
Chardonnay                        11753
Cabernet Sauvignon                 9472
Red Blend                          8946
Bordeaux-style Red Blend           6915
Riesling                           5189
Sauvignon Blanc                    4967
Syrah                              4142
Rosé                               3564
Merlot                             3102
Nebbiolo                           2804
Zinfandel                          2714
Sangiovese                         2707
Malbec                             2652
Portuguese Red                     2466
White Blend                        2360
Sparkling Blend                    2153
Tempranillo                        1810
Rhône-style Red Blend              1471
Pinot Gris                         1455
Champagne Blend                    1396
Cabernet Franc                     1353
Grüner Veltliner                   1345
Portuguese White                   1159
Bordeaux-style White Blend         1066


## Preparing the data

In [5]:
# Keep only the 10 most frequent varieties.
# `top_10_varieties` maps each variety name to an integer label
# (0 = most common, ..., 9 = tenth most common).
counter = Counter(df['variety'].tolist())
top_10_varieties = {
    variety: label
    for label, (variety, _count) in enumerate(counter.most_common(10))
}

# Restrict the frame to rows whose variety is one of those ten.
df = df[df['variety'].isin(list(top_10_varieties))]

In [6]:
# Model inputs: the review text of every remaining row.
description_list = df['description'].tolist()

# Model targets: the integer label of each row's variety, as a NumPy array
# in the same row order as `description_list`.
varietal_list = np.array([top_10_varieties[variety] for variety in df['variety']])

In [6]:
# Split the raw documents BEFORE any feature extraction. Fitting the
# vocabulary and IDF weights on the full corpus would let statistics from the
# held-out documents leak into the training features and inflate the score.
# A fixed seed makes the split (and the reported accuracy) reproducible.
RANDOM_STATE = 42
train_docs, test_docs, train_y, test_y = train_test_split(
    description_list,
    varietal_list,
    test_size=0.3,
    random_state=RANDOM_STATE,
)

# Learn the vocabulary and the tf-idf weighting from the training documents only.
count_vect = CountVectorizer()
x_train_counts = count_vect.fit_transform(train_docs)
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_counts)
train_x = x_train_tfidf

# Project the held-out documents with the already-fitted transformers
# (transform only, never fit, on test data).
x_test_counts = count_vect.transform(test_docs)
test_x = tfidf_transformer.transform(x_test_counts)

## Training our ML model

In [9]:
# Fit a multinomial naive Bayes classifier on the training features.
# (`fit` returns the estimator itself, so `clf` is the fitted model.)
clf = MultinomialNB()
clf = clf.fit(train_x, train_y)
# Alternative kept for experimentation:
# clf = SVC(kernel='linear').fit(train_x, train_y)

## Calculating the evaluation metrics

In [10]:
# Predicted label for every held-out sample (despite the name, these are
# class predictions, not scores).
y_score = clf.predict(test_x)

# Count the predictions that match the true label with a single
# element-wise comparison.
n_right = int(np.count_nonzero(y_score == test_y))

print("Accuracy: %.2f%%" % ((n_right/float(len(test_y)) * 100)))