## Ale Classification - Exploration

In [1]:
# Standard DS imports
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Modeling
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier

# Display more dataframe rows if necessary
pd.options.display.max_rows = 500
pd.options.display.max_columns = 5_000

# Set random_seed
np.random.seed(14)

In [2]:
beer_data = pd.read_csv('./data/dmv_full_beer_data.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
beer_data.head(3)

Unnamed: 0,beer,user,date,rating,review_text,abv,address,brewery,country,date_added,google_map_link,link,num_ratings,postal_code,agg_rating,state,style,town,abv_range,category,year_added,ale,text_stemmed,text_lemmatized
0,3 Stars #ultrafresh,345552,2019-04-11,4.2,can from whole foods market fair lakes in fair...,0.083,"6400 Chillum Pl, NW",3 Stars Brewing Company,USA,6/3/2016,maps.google.com/maps?f=q&hl=en&geocode=&q=3+St...,www.ratebeer.com/beer/3-stars-ultrafresh/423469/,14,20011,3.47,DC,IIPA - Imperial / Double IPA,Washington,8% and more,IPA,2016,1,can from whole food market fair lake in fairfa...,can from whole food market fair lake in fairfa...
1,3 Stars #ultrafresh,44939,2019-01-25,3.7,canned clouded gold pour plenty of citrus slig...,0.083,"6400 Chillum Pl, NW",3 Stars Brewing Company,USA,6/3/2016,maps.google.com/maps?f=q&hl=en&geocode=&q=3+St...,www.ratebeer.com/beer/3-stars-ultrafresh/423469/,14,20011,3.47,DC,IIPA - Imperial / Double IPA,Washington,8% and more,IPA,2016,1,can cloud gold pour plenti of citru slightli b...,canned clouded gold pour plenty of citrus slig...
2,3 Stars #ultrafresh,87939,2018-09-20,2.9,draft at beer culture murky golden with a whit...,0.083,"6400 Chillum Pl, NW",3 Stars Brewing Company,USA,6/3/2016,maps.google.com/maps?f=q&hl=en&geocode=&q=3+St...,www.ratebeer.com/beer/3-stars-ultrafresh/423469/,14,20011,3.47,DC,IIPA - Imperial / Double IPA,Washington,8% and more,IPA,2016,1,draft at beer cultur murki golden with a white...,draft at beer culture murky golden with a whit...


In [4]:
X = beer_data['review_text']
y = beer_data['ale']

In [5]:
y.value_counts(normalize=True)

1    0.561588
0    0.438412
Name: ale, dtype: float64

In [6]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [7]:
# Instiating Multinomial Naive Bayes Model
nb = MultinomialNB()

# Setting up pipeline with TFIDF and Naive Bayes
pipe_naive_tf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB())
])

In [8]:
# Grid search parameters
params_nb = {
    'tfidf__max_features': [200_000],
    'tfidf__ngram_range': [(1,2)],
    'tfidf__stop_words': ['english', None],
    'nb__alpha': [.1, .25, .5]
}

In [9]:
# Instantiate and fit GridSearch on pipeline with TFIDF and Naive Bayes
grid_nb = GridSearchCV(pipe_naive_tf, params_nb, cv=3, verbose=1)
grid_nb.fit(X_train, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  3.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...True,
        vocabulary=None)), ('nb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__max_features': [200000], 'tfidf__ngram_range': [(1, 2)], 'tfidf__stop_words': ['english', None], 'nb__alpha': [0.1, 0.25, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [10]:
print(f"The best parameters are: {grid_nb.best_params_}")
print(f"The best cross-val score is: {grid_nb.best_score_:.4f}")

The best parameters are: {'nb__alpha': 0.1, 'tfidf__max_features': 200000, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
The best cross-val score is: 0.7970


In [11]:
print(f"The training score for the best parameters is: {grid_nb.score(X_train, y_train):.4f}")
print(f"The test score for best parameters is: {grid_nb.score(X_test, y_test):.4f}")

The training score for the best parameters is: 0.9048
The test score for best parameters is: 0.7997


Naive Bayes did decently well, handily beating the baseline but suffers from overfitting.

In [12]:
# Instiating Random Forest Model
rf = RandomForestClassifier()

# Setting up pipeline with TFIDF and Random Forest
pipe_rf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('rf', RandomForestClassifier())
])

In [13]:
# Grid search parameters
params_rf = {
    'tfidf__max_features': [20_000, 50_000],
    'tfidf__ngram_range': [(1,2)],
    'tfidf__stop_words': ['english'],
    'rf__max_depth': [5, 10, 20],
    'rf__n_estimators': [10, 50]
}

In [14]:
# Instantiate and fit GridSearch on pipeline with TFIDF and Naive Bayes
grid_rf = GridSearchCV(pipe_rf, params_rf, cv=3, verbose=1)
grid_rf.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:  6.2min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'tfidf__max_features': [20000, 50000], 'tfidf__ngram_range': [(1, 2)], 'tfidf__stop_words': ['english'], 'rf__max_depth': [5, 10, 20], 'rf__n_estimators': [10, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [15]:
print(f"The best parameters are: {grid_rf.best_params_}")
print(f"The best cross-val score is: {grid_rf.best_score_:.4f}")

The best parameters are: {'rf__max_depth': 20, 'rf__n_estimators': 50, 'tfidf__max_features': 20000, 'tfidf__ngram_range': (1, 2), 'tfidf__stop_words': 'english'}
The best cross-val score is: 0.7460


In [16]:
print(f"The training score for the best parameters is: {grid_rf.score(X_train, y_train):.4f}")
print(f"The test score for best parameters is: {grid_rf.score(X_test, y_test):.4f}")

The training score for the best parameters is: 0.7573
The test score for best parameters is: 0.7409


## Conclusions

This was not my main objective of this project, but it an interesting concept to look at classification of certain types of beers. I clearly need to tune hyperparameters due to variance/bias tradeoffs and explore additional classification modeling techniques, but it's another interesting beer top to investigate.