In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
if 'google.colab' in str(get_ipython()):
  from google.colab import drive
  drive.mount('/gdrive')
  %cd ../gdrive/MyDrive/adsi_a2/
else:
  # set working directory
  os.chdir('/home/jovyan/work/')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/.shortcut-targets-by-id/1ejGfYDC-KnkyvYV7Tc23fm0JxjKhgVaI/adsi_a2


In [None]:
# Column roles shared by the cleaning, splitting and preprocessing cells.
col_tar = ['beer_style']      # label to predict
col_cat = ['brewery_name']    # categorical predictor
col_num = [                   # numeric review-score predictors
    'review_aroma',
    'review_appearance',
    'review_palate',
    'review_taste',
]

In [None]:
filepath = 'data/processed/beer_review_cleaned.csv'

# Reuse the cleaned dataset when an earlier run has already written it.
if os.path.exists(filepath):
    df_cleaned = pd.read_csv(filepath)
else:
    # load data from csv
    df = pd.read_csv('data/raw/beer_reviews.csv')

    # Keep only the nominated columns and drop rows that contain NA.
    # dropna() returns a new frame, so nothing is modified in place on a
    # slice of `df` (which would trigger SettingWithCopyWarning).
    # The index is reset so this branch yields the same RangeIndex that the
    # cached branch gets from read_csv.
    df_cleaned = (
        df[col_cat + col_num + col_tar]
        .dropna()
        .reset_index(drop=True)
    )

    # Store the result at the exact path the cache check above reads from,
    # creating the output directory on first use.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    df_cleaned.to_csv(filepath, index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label from the predictors without mutating `df_cleaned`.
# The previous `df_cleaned.pop(...)` removed the column in place, so running
# this cell a second time raised KeyError. `df_cleaned` now keeps the label
# column; the predictors passed to the split are unchanged.
target = df_cleaned[col_tar[0]]
features = df_cleaned.drop(columns=col_tar)

# Hold out 20% of the rows as the test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Count the distinct label values present in `target`.
n_categories = len(set(target))
print(f'Total Prediction Categories: {n_categories}')

Total Prediction Categories: 104


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric columns: standardise with StandardScaler.
num_tfm = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical columns: dense one-hot encoding.
# handle_unknown='ignore' makes a category that was not present when the
# encoder was fitted encode as an all-zero row instead of raising ValueError;
# without it, transforming data containing an unseen brewery fails.
# NOTE(review): `sparse` was renamed to `sparse_output` in newer scikit-learn
# releases; switch the keyword when the environment is upgraded.
cat_tfm = Pipeline(
    steps=[
        ('one_hot_encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ]
)

In [None]:
from sklearn.compose import ColumnTransformer

# Send each column group through its own transformer pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('col_num', num_tfm, col_num),  # numeric columns -> num_tfm
    ('col_cat', cat_tfm, col_cat),  # categorical columns -> cat_tfm
])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest limited to depth 3, split on entropy, with a fixed seed.
clr = RandomForestClassifier(
    criterion='entropy',
    max_depth=3,
    n_jobs=2,
    random_state=42,
)

# Full model: preprocessing followed by the classifier.
rf_pipe = Pipeline(steps=[('preprocessor', preprocessor), ('classify', clr)])

In [None]:
# Stratified split of the training data: X_trn1/y_trn1 receive 5% of the rows,
# X_trn2/y_trn2 the remaining 95%.
X_trn1, X_trn2, y_trn1, y_trn2 = train_test_split(
    X_train,
    y_train,
    test_size=0.95,
    stratify=y_train,
    random_state=42,
)

In [None]:
# Fit the preprocessing + classifier pipeline on the X_trn1 / y_trn1 subset.
rf_pipe.fit(X=X_trn1, y=y_trn1)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('col_num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['review_aroma',
                                                   'review_appearance',
                                                   'review_palate',
                                                   'review_taste']),
                                                 ('col_cat',
                                                  Pipeline(steps=[('one_hot_encoder',
                                                                   OneHotEncoder(sparse=False))]),
                                                  ['brewery_name'])])),
                ('classify',
                 RandomForestClassifier(criterion='entropy', max_depth=3,
                                        n_jo

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, balanced_accuracy_score
import seaborn as sns

# predict on X_trn1 -- the same subset the pipeline was fitted on, not a held-out validation set
y_trn_pred = rf_pipe.predict(X_trn1)

# training accuracy (last expression of the cell, so it is shown as the cell output)
accuracy_score(y_trn1, y_trn_pred)

0.07401162882309377