In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
os.chdir('../')

In [3]:
# nominate columns
col_tar = ['beer_style']
col_cat = ['brewery_name']
col_num = ['review_aroma', 'review_appearance', 'review_palate', 'review_taste']

In [4]:
filepath = 'data/processed/beer_review_cleaned.csv'

# load data from csv
df = pd.read_csv('data/raw/beer_reviews.csv')

# clean up dataset: drop unrelated columns and drop rows that contain NA 
df_cleaned = df.copy()
df_cleaned = df[col_cat + col_num + col_tar]
# df_cleaned = df[col_num + col_tar]

# store data
df_cleaned.dropna(inplace=True)
df_cleaned.to_csv('data/processed/beer_review_cleaned.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [5]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1586599 entries, 0 to 1586613
Data columns (total 6 columns):
brewery_name         1586599 non-null object
review_aroma         1586599 non-null float64
review_appearance    1586599 non-null float64
review_palate        1586599 non-null float64
review_taste         1586599 non-null float64
beer_style           1586599 non-null object
dtypes: float64(4), object(2)
memory usage: 84.7+ MB


In [6]:
dfx = df_cleaned.groupby(['brewery_name']).\
    agg({'review_palate':['count']}).\
    review_palate.\
    sort_values('count', ascending=False)

dfx.head(10)

Unnamed: 0_level_0,count
brewery_name,Unnamed: 1_level_1
Boston Beer Company (Samuel Adams),39444
Dogfish Head Brewery,33839
Stone Brewing Co.,33066
Sierra Nevada Brewing Co.,28751
"Bell's Brewery, Inc.",25191
Rogue Ales,24083
Founders Brewing Company,20004
Victory Brewing Company,19479
Lagunitas Brewing Company,16837
Avery Brewing Company,16107


In [7]:
from sklearn.model_selection import train_test_split

# remove brewery name as it's irrelevant
bco = df_cleaned.pop('brewery_name')

# extract target label column
target = df_cleaned.pop(col_tar[0])

In [8]:
# split data
X_train, X_test, y_train, y_test = train_test_split(df_cleaned, target, stratify=target, test_size=0.2, random_state=42)

In [9]:
print(f'Total Prediction Categories: {len(set(target))}')

Total Prediction Categories: 104


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

num_tfm = Pipeline(steps=[('scaler', StandardScaler())])
cat_tfm = Pipeline(steps=[('one_hot_encoder', OneHotEncoder(sparse=False))])

In [11]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('col_num', num_tfm, col_num)
    ]
)

In [12]:
from sklearn.ensemble import RandomForestClassifier

clr = RandomForestClassifier(
    n_jobs=-1, 
    criterion='entropy',
    max_depth=3,
    random_state=42
)

rf_pipe = Pipeline(
    steps = [
        ('preprocessor', preprocessor),
        ('classify', clr)
    ]
)

In [17]:
X_trn1, X_trn2, y_trn1, y_trn2 = train_test_split(X_train, y_train, stratify=y_train, test_size=0.2, random_state=42)

In [21]:
rf_pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('col_num',
                                                  Pipeline(memory=None,
                                                           steps=[('scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True))],
                                                           verbose=False),
                                                  ['review_aroma',
                                                   'review_appearance',
                                              

In [24]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
import seaborn as sns

# predict validation set
y_trn_pred = rf_pipe.predict(X_trn2)

# print evaluation metrics
accuracy_score(y_trn2, y_trn_pred)

0.08907018152023194

In [25]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
import seaborn as sns

# predict validation set
y_trn_pred = rf_pipe.predict(X_test)

# print evaluation metrics
accuracy_score(y_test, y_trn_pred)

0.0894302281608471

In [26]:
from joblib import dump

dump(rf_pipe, 'models/random_forest_base.joblib')

['models/random_forest_base.joblib']