# What kind of wine?

In [1]:
# data in s3 bucket as .parquet, train and test sets

# CONSTRUCT CLASSIFIER --> LABS ~5-8
# FIND WORDS MODEL IS USING TO PREDICT LABELS (MODEL COEFFICIENTS, INTERPRETATION THROUGH LIME) --> LAB 7 (notebook 3,4)
# AMOUNT OF TEST EXAMPLES EXCLUDED FOR F1>.81 --> LAB 7 (notebook 3)
# IMPROVE ACCURACY BY LABEL CHANGES, USE CONFUSION MATRIX FOR NEW LABELING SCHEME --> LAB 7 (notebook 2)

In [36]:
# our necessary packages imported
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm
import multiprocessing as mp

tqdm.pandas()

In [37]:
# data loaded
# The train and test splits are read as parquet files from the ling583 S3 bucket;
# storage_options={"anon": True} asks for anonymous (credential-free) access.

train = pd.read_parquet(
    "s3://ling583/wine-train.parquet", storage_options={"anon": True}
)
test = pd.read_parquet("s3://ling583/wine-test.parquet", storage_options={"anon": True})

In [38]:
# Preview the training frame (review_text and wine_variant columns).
train

Unnamed: 0,review_text,wine_variant
0,Rich smoky dark cherry nose very intense fruit...,Pinot Noir
1,Had this at Corton Restaurant in NYC. First of...,Syrah
2,"Nose is very tart, with a layer of sweet fruit...",Pinot Noir
3,Beautiful golden color. Discrete perfumed nose...,Chardonnay
4,Please take the time to decant: you will not b...,Pinot Noir
...,...,...
130492,Brought this out at a dinner and it was quite ...,Zinfandel
130493,Nothing bad to say except that this is so ordi...,Pinot Noir
130494,Good wine. Dark fruit and buttery oak aromas o...,Cabernet Sauvignon
130495,AP #8. Medium-deep gold. Mature nose of petrol...,Riesling


In [39]:
import spacy

# Load the small English spaCy pipeline with the listed components excluded;
# tokenize() below only calls nlp.tokenizer, so none of them are needed here.
nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    """Split ``text`` into normalized tokens using only the spaCy tokenizer.

    Whitespace, punctuation and number-like tokens are dropped; every token
    that remains is represented by its ``norm_`` string.

    Returns a list of strings.
    """
    kept = []
    for token in nlp.tokenizer(text):
        if token.is_space or token.is_punct or token.like_num:
            continue
        kept.append(token.norm_)
    return kept

In [40]:
# tokenizing review data

# Tokenize both splits in parallel across worker processes. imap yields results
# in input order, so they match the rows positionally. The Series is built on
# the frame's own index: a bare pd.Series(...) would get a fresh 0..n-1 index,
# and column assignment aligns on index labels, which would silently produce
# NaN/misaligned tokens if a frame did not carry the default RangeIndex.
with mp.Pool() as p:
    train["tokens"] = pd.Series(
        p.imap(tokenize, tqdm(train["review_text"]), chunksize=500),
        index=train.index,
    )
    test["tokens"] = pd.Series(
        p.imap(tokenize, tqdm(test["review_text"]), chunksize=500),
        index=test.index,
    )

  0%|          | 0/130497 [00:00<?, ?it/s]

  0%|          | 0/32625 [00:00<?, ?it/s]

In [41]:
from warnings import simplefilter

# Suppress FutureWarning messages from here on so they do not clutter cell output.
simplefilter(action="ignore",category=FutureWarning)

In [42]:
# Number of training reviews per wine variant (shows the class imbalance).
train["wine_variant"].value_counts()

Pinot Noir            38471
Cabernet Sauvignon    30234
Chardonnay            19443
Syrah                 13704
Riesling               9683
Zinfandel              8327
Merlot                 5522
Sauvignon Blanc        5113
Name: wine_variant, dtype: int64

# Baseline Classifier 

In [44]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.pipeline import make_pipeline

In [45]:
# Baseline: raw token counts fed into an SGDClassifier with default settings.
# analyzer=identity makes the vectorizer consume the pre-tokenized lists as-is.
# NOTE(review): no random_state is set on SGDClassifier, so the scores may
# vary slightly from run to run.
baseline = make_pipeline(CountVectorizer(analyzer=identity), SGDClassifier())
baseline.fit(train["tokens"], train["wine_variant"])
# base_predicted is kept under its own name because later cells compare it
# against the tuned model's predictions.
base_predicted = baseline.predict(test["tokens"])
print(classification_report(test["wine_variant"], base_predicted))

                    precision    recall  f1-score   support

Cabernet Sauvignon       0.69      0.82      0.75      7558
        Chardonnay       0.83      0.84      0.84      4861
            Merlot       0.76      0.35      0.48      1381
        Pinot Noir       0.78      0.86      0.82      9618
          Riesling       0.84      0.77      0.80      2421
   Sauvignon Blanc       0.76      0.69      0.72      1278
             Syrah       0.71      0.55      0.62      3426
         Zinfandel       0.75      0.55      0.64      2082

          accuracy                           0.76     32625
         macro avg       0.77      0.68      0.71     32625
      weighted avg       0.76      0.76      0.75     32625



# Hyperparameter Search

In [47]:
import mlflow
from dask_ml.model_selection import RandomizedSearchCV
from logger import log_search
from scipy.stats.distributions import loguniform, randint, uniform

# used logger.py from previous labs

In [48]:
from dask.distributed import Client

# Connect to an already-running Dask scheduler.
# NOTE(review): the scheduler address (including the port) is hard-coded, so this
# cell presumably has to be edited whenever the scheduler is restarted — confirm.
client = Client("tcp://127.0.0.1:42369")
client

0,1
Client  Scheduler: tcp://127.0.0.1:42369  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [49]:
# Select the MLflow experiment for this project (created on first use).
mlflow.set_experiment("project-2")
# Untuned pipeline used as the estimator for the hyperparameter search:
# token counts -> tf-idf weighting -> linear SGD classifier.
sgd = make_pipeline(
    CountVectorizer(analyzer=identity), TfidfTransformer(), SGDClassifier()
)

INFO: 'project-2' does not exist. Creating a new experiment


In [50]:
%%time

# Randomized search over vectorizer, tf-idf and SGD settings: 50 sampled
# configurations, each scored with macro-averaged F1.
search = RandomizedSearchCV(
    sgd,
    {
        # scipy randint(low, high): integers from 1 to 19 (upper bound exclusive)
        "countvectorizer__min_df": randint(1, 20),
        # scipy uniform(loc, scale): floats in [0.5, 1.0]
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "tfidftransformer__use_idf": [True, False],
        # alpha sampled log-uniformly between 1e-6 and 1e-2
        "sgdclassifier__alpha": loguniform(1e-6, 1e-2),
    },
    n_iter=50,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["wine_variant"])
# log_search comes from the local logger module; presumably it records the
# search results in the active MLflow experiment — confirm against logger.py.
log_search(search)

CPU times: user 11.1 s, sys: 1.56 s, total: 12.6 s
Wall time: 6min 29s


In [None]:
# optimized alpha found by the search above: 1.507610100544865e-05

# SGDClassifier

In [67]:
# Refit the SGD pipeline with fixed hyperparameters and evaluate on the test set.
# min_df / max_df / use_idf are presumably taken from the search results above — confirm.
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=12, max_df=.9),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=1e-5),
)
sgd.fit(train["tokens"], train["wine_variant"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["wine_variant"], predicted))

# The best-performing alpha across the 50 search runs was 1.507610100544865e-05;
# it is rounded to 1e-5 for this fit.

                    precision    recall  f1-score   support

Cabernet Sauvignon       0.69      0.83      0.75      7558
        Chardonnay       0.82      0.85      0.84      4861
            Merlot       0.82      0.33      0.48      1381
        Pinot Noir       0.77      0.87      0.82      9618
          Riesling       0.80      0.79      0.80      2421
   Sauvignon Blanc       0.82      0.67      0.74      1278
             Syrah       0.75      0.54      0.63      3426
         Zinfandel       0.83      0.53      0.64      2082

          accuracy                           0.76     32625
         macro avg       0.79      0.68      0.71     32625
      weighted avg       0.77      0.76      0.75     32625



Because this macro-averaged F1 score seems suspiciously low to me, I'm going to try at least one more model (Multinomial Naive Bayes, for the sake of time and simplicity).

# MultinomialNB

In [56]:
from sklearn.naive_bayes import MultinomialNB

In [68]:
# Alternative model: Multinomial Naive Bayes on raw token counts (no tf-idf step),
# using the same min_df / max_df as the SGD pipeline.
# NOTE(review): alpha is passed to MultinomialNB here, not to SGDClassifier; the
# value mirrors the SGD cell — confirm it was chosen deliberately for NB.
mnb = make_pipeline(CountVectorizer(analyzer=identity, min_df=12,max_df=.9), MultinomialNB(alpha=1e-5))
mnb.fit(train["tokens"], train["wine_variant"])
# Overwrites `predicted` from the SGD cell with the Naive Bayes predictions.
predicted = mnb.predict(test["tokens"])
print(classification_report(test["wine_variant"], predicted))

                    precision    recall  f1-score   support

Cabernet Sauvignon       0.66      0.77      0.71      7558
        Chardonnay       0.84      0.80      0.82      4861
            Merlot       0.43      0.31      0.36      1381
        Pinot Noir       0.80      0.81      0.80      9618
          Riesling       0.75      0.77      0.76      2421
   Sauvignon Blanc       0.69      0.63      0.66      1278
             Syrah       0.63      0.54      0.58      3426
         Zinfandel       0.60      0.54      0.57      2082

          accuracy                           0.72     32625
         macro avg       0.67      0.65      0.66     32625
      weighted avg       0.72      0.72      0.72     32625



After experimenting with min_df, max_df, and alpha, I found no Multinomial NB configuration that outperformed the model using an SGD classifier, so I'm continuing with SGD. Instead, it's fair to assume I can improve my F1 score by adjusting the remaining parameters through another search with 50 additional runs (the result would then be my "optimized model").

In [71]:
%%time

# Second randomized search: alpha is pinned to the best value from the first
# search, so only min_df, max_df and use_idf vary across the 50 runs.
search = RandomizedSearchCV(
    sgd,
    {
        # scipy randint(low, high): integers from 1 to 19 (upper bound exclusive)
        "countvectorizer__min_df": randint(1, 20),
        # scipy uniform(loc, scale): floats in [0.5, 1.0]
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "tfidftransformer__use_idf": [True, False],
        "sgdclassifier__alpha": [1.507610100544865e-05],
    },
    n_iter=50,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["wine_variant"])
# log_search comes from the local logger module; presumably it records the
# search results in the active MLflow experiment — confirm against logger.py.
log_search(search)

CPU times: user 12.4 s, sys: 1.53 s, total: 13.9 s
Wall time: 7min 5s


In [73]:
# "Optimized" model: SGD pipeline refit with the searched alpha and new
# min_df / max_df values (presumably the best run of the second search — confirm).
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=2, max_df=.79),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=1.507610100544865e-05),
)
sgd.fit(train["tokens"], train["wine_variant"])
# `predicted` now holds the optimized SGD predictions; the comparison cells
# below rely on this being the most recent assignment.
predicted = sgd.predict(test["tokens"])
print(classification_report(test["wine_variant"], predicted))

                    precision    recall  f1-score   support

Cabernet Sauvignon       0.70      0.82      0.75      7558
        Chardonnay       0.81      0.86      0.84      4861
            Merlot       0.78      0.34      0.47      1381
        Pinot Noir       0.78      0.86      0.82      9618
          Riesling       0.80      0.79      0.79      2421
   Sauvignon Blanc       0.83      0.67      0.74      1278
             Syrah       0.72      0.55      0.63      3426
         Zinfandel       0.83      0.52      0.64      2082

          accuracy                           0.76     32625
         macro avg       0.78      0.68      0.71     32625
      weighted avg       0.77      0.76      0.76     32625



In [74]:
# Macro-averaged F1 on the test set for the baseline and for the current
# `predicted` (the optimized SGD model), computed in that order.
base_f1, sgd_f1 = (
    f1_score(test["wine_variant"], model_output, average="macro")
    for model_output in (base_predicted, predicted)
)

In [82]:
# (baseline macro F1, optimized macro F1, absolute improvement)
base_f1, sgd_f1, sgd_f1 - base_f1


(0.707814004224339, 0.7101144095589997, 0.0023004053346606934)

In [84]:
# error reduction: the share of the baseline's remaining gap to a perfect
# macro F1 of 1.0 that the optimized model closes
(sgd_f1 - base_f1) / (1 - base_f1)

0.007873085527435524

We have come about 0.8% of the way to perfection, meaning the optimized model closes about 0.8% of the gap between the baseline's macro F1 and a perfect score of 100%; however, this interpretation needs context and clarification, and the number is not meaningful on its own without that.

In [85]:
from scipy.stats import binom_test, wilcoxon

In [87]:
# Per-example comparison of the two classifiers on the test set:
#   +1 -> only `predicted` is correct
#   -1 -> only `base_predicted` is correct
#    0 -> both are right or both are wrong
diff = (predicted == test["wine_variant"]).astype(int) - (base_predicted == test["wine_variant"]).astype(int)
# Counts for +1, -1 and 0, in that order, as plain Python ints.
tuple(int((diff == outcome).sum()) for outcome in (1, -1, 0))

(786, 668, 31171)

This is a way to see if classifiers agree, and which ones are right or wrong. 

In this case, out of the entire test set, 31171 examples were either both right or both wrong, 786 were cases where the baseline was wrong and the optimized model was right, and 668 were cases where the baseline was right and the optimized model was wrong.

Further investigation with binomial sign and wilcoxon tests below.

In [92]:
# One-sided sign test on the disagreements only: are there more examples where
# just the optimized model is right (+1) than where just the baseline is right (-1)?
# NOTE(review): scipy.stats.binom_test is deprecated in newer SciPy releases in
# favour of scipy.stats.binomtest — switch when the environment is upgraded.
binom_test([sum(diff == 1), sum(diff == -1)], alternative="greater")


0.001070923144185815

In [94]:
# One-sided Wilcoxon signed-rank test on the per-example differences
# (optimized minus baseline correctness).
wilcoxon(diff, alternative="greater")

WilcoxonResult(statistic=571815.0, pvalue=0.000985504093431113)

Overall, this optimized model isn't noticeably better than the baseline, although it does perform better than the MultinomialNB classifier. It was optimized through hyperparameter searches, using the MLflow plots to compare runs.

# Saving the Model

In [95]:
import cloudpickle

In [96]:
# Rebuild the optimized pipeline so that it accepts raw review text: the
# vectorizer now calls tokenize() itself (preprocessor=identity passes the text
# through unchanged first), so the saved model does not depend on a
# precomputed "tokens" column.
sgd = make_pipeline(
    CountVectorizer(preprocessor=identity, tokenizer=tokenize, min_df=2, max_df=.79),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=1.507610100544865e-05),
)
# Fit and evaluate on the raw text columns rather than the token lists.
sgd.fit(train["review_text"], train["wine_variant"])
predicted = sgd.predict(test["review_text"])
print(classification_report(test["wine_variant"], predicted))

                    precision    recall  f1-score   support

Cabernet Sauvignon       0.69      0.82      0.75      7558
        Chardonnay       0.83      0.85      0.84      4861
            Merlot       0.84      0.33      0.48      1381
        Pinot Noir       0.76      0.88      0.81      9618
          Riesling       0.81      0.79      0.80      2421
   Sauvignon Blanc       0.85      0.66      0.74      1278
             Syrah       0.76      0.53      0.63      3426
         Zinfandel       0.87      0.51      0.64      2082

          accuracy                           0.76     32625
         macro avg       0.80      0.67      0.71     32625
      weighted avg       0.77      0.76      0.75     32625



In [97]:
# Serialize the fitted pipeline to disk. The context manager guarantees the
# file handle is flushed and closed even if dump() raises, instead of leaving
# an open handle behind for the garbage collector.
with open("sgd.model", "wb") as model_file:
    cloudpickle.dump(sgd, model_file)

In [None]:
# cloudpickle is essentially an easier way to move my fitted pipeline to another notebook