# UntappdML

A project that exports your personal beer history from Untappd and trains a machine learning model on it to predict whether you will like a new beer.

### import

In [None]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import style
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from joblib import dump, load

### load personal untappd beer history

You must be a supporter to be able to export beer history. Here we use the CSV format.

In [None]:
# Load the Untappd CSV export; the relative path means untappd.csv must be in
# the notebook's working directory.
history = pd.read_csv("untappd.csv")

In [None]:
# Preview the first three rows of the loaded export.
history.head(3)

### drop unwanted columns

Keep only the columns that are meaningful for the prediction and drop everything else.

In [None]:
# List every column name present in the export.
history.columns

In [None]:
# Keep the five feature columns plus the personal rating (the future label).
# The explicit copy makes `history` an independent frame, so the in-place
# `.loc` assignments in the following cells are unambiguous and cannot trigger
# pandas' SettingWithCopyWarning.
history = history[[
    "beer_type",
    "beer_abv",
    "beer_ibu",
    "rating_score",
    "brewery_country",
    "global_rating_score"
]].copy()

In [None]:
# Preview the first three rows after the column selection.
history.head(3)

### feature: beer type

In [None]:
# Count how often each distinct beer_type value occurs.
history[["beer_type"]].value_counts()

In [None]:
# Ordered (regex, code) pairs for the beer style groups. The first pattern that
# matches wins, e.g. a style name containing both "sour" and "fruit" is coded 6.
beer_type_codes = [
    ("ipa", 0),
    ("lager|pilsner", 1),
    ("stout", 2),
    ("pale ale", 3),
    ("wheat|hefe|weizen", 4),
    ("porter", 5),
    ("sour", 6),
    ("fruit", 7),
]
# Code for every style that matches none of the patterns (or is missing).
other_beer_type = 8

# Every pattern is evaluated against the untouched style names, so the result
# does not depend on earlier replacements and a style name that happens to
# contain a digit can no longer slip past the catch-all. `na=False` treats a
# missing beer_type as "no match" instead of producing a NaN mask.
raw_beer_type = history["beer_type"]
beer_type_matches = [
    raw_beer_type.str.contains(pattern, flags=re.IGNORECASE, na=False).to_numpy(dtype=bool)
    for pattern, _ in beer_type_codes
]
history["beer_type"] = np.select(
    beer_type_matches,
    [code for _, code in beer_type_codes],
    default=other_beer_type,
).astype(int)

In [None]:
# Histogram of the encoded beer_type codes.
sns.histplot(data=history, x="beer_type")

### feature: beer abv

In [None]:
# Histogram of the raw beer_abv values, using 20 bins.
sns.histplot(data=history, x="beer_abv", bins=20)

In [None]:
# Band edges for ABV. Band i covers the right-closed interval
# (abv_edges[i], abv_edges[i + 1]], giving codes 0 (<= 4.5) through 6 (> 8).
abv_edges = [-np.inf, 4.5, 5, 5.5, 6, 7, 8, np.inf]

# labels=False makes pd.cut return the integer band index directly.
history["beer_abv"] = pd.cut(
    history["beer_abv"],
    bins=abv_edges,
    labels=False,
    include_lowest=True,
)
history["beer_abv"] = history["beer_abv"].astype(int)

In [None]:
# Histogram of beer_abv after it has been replaced by its band code.
sns.histplot(data=history, x="beer_abv")

### feature: beer ibu

In [None]:
# Histogram of the raw beer_ibu values, using 40 bins.
sns.histplot(data=history, x="beer_ibu", bins=40)

In [None]:
# (lower, upper] limits of the bitterness bands; the band at position i gets
# code i + 1. An IBU of exactly 0 falls into no band and therefore stays 0.
ibu_bands = [(0, 20), (20, 30), (30, 40), (40, 60), (60, 80), (80, np.inf)]

raw_ibu = history["beer_ibu"]
in_band = [
    ((raw_ibu > lower) & (raw_ibu <= upper)).to_numpy()
    for lower, upper in ibu_bands
]
# Values outside every band keep their original value via `default`.
history["beer_ibu"] = np.select(
    in_band,
    list(range(1, len(ibu_bands) + 1)),
    default=raw_ibu.to_numpy(),
)
history["beer_ibu"] = history["beer_ibu"].astype(int)

In [None]:
# Histogram of beer_ibu after it has been replaced by its band code.
sns.histplot(data=history, x="beer_ibu")

### feature: brewery country

In [None]:
# Count how often each distinct brewery_country value occurs.
history[["brewery_country"]].value_counts()

In [None]:
# Ordered (regex, code) pairs grouping brewery countries into regions; the
# first pattern that matches wins.
country_codes = [
    ("united states|canada", 0),
    ("scotland|united kingdom|wales", 1),
    ("germany|austria", 2),
    ("finland|sweden|norway|denmark|iceland", 3),
    ("belgium|france|netherlands", 4),
    ("czech republic|slovakia", 5),
]
# Code for every country that matches none of the patterns (or is missing).
other_country = 6

# Every pattern is evaluated against the untouched country names, so the
# result does not depend on earlier replacements, and the catch-all no longer
# relies on searching the column for digit characters. `na=False` treats a
# missing brewery_country as "no match" instead of producing a NaN mask.
raw_country = history["brewery_country"]
country_matches = [
    raw_country.str.contains(pattern, flags=re.IGNORECASE, na=False).to_numpy(dtype=bool)
    for pattern, _ in country_codes
]
history["brewery_country"] = np.select(
    country_matches,
    [code for _, code in country_codes],
    default=other_country,
).astype(int)

In [None]:
# Histogram of the encoded brewery_country codes.
sns.histplot(data=history, x="brewery_country")

### drop nulls

In [None]:
# Drop every row that still has a missing value in any remaining column.
# NOTE(review): beer_type, beer_abv, beer_ibu and brewery_country were already
# cast to int in the cells above, so at this point only rating_score and
# global_rating_score can still contain NaN.
history = history.dropna()

### label: rating score

This is what we want to predict. Take the mean of all personal ratings, and apply a label of "like" if above or equal to the mean, and "not like" if below.

In [None]:
# Mean of all personal ratings; the labelling cell below uses it as the
# threshold between "like" and "not like".
m = history["rating_score"].mean(axis=0)

In [None]:
# Display the mean personal rating.
m

In [None]:
# Binary label: 1 ("like") when the personal rating is at or above the mean m,
# otherwise 0 ("not like"). A single comparison against the original ratings
# replaces the two in-place passes, whose second pass re-tested the zeros
# written by the first and was therefore only correct while m > 0.
history["rating_score"] = (history["rating_score"] >= m).astype(int)

In [None]:
# Histogram of the binary label, using two bins.
sns.histplot(data=history, x="rating_score", bins=2)

### train/test split

Split the entire history into a training and a testing data set.

In [None]:
# Features: every column except the label.
x = history.drop(columns="rating_score")
# Label as a 1-D Series. A one-column DataFrame makes scikit-learn emit a
# DataConversionWarning ("column-vector y was passed") on every fit.
y = history["rating_score"]
# Hold out 20 % of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### random forest

In [None]:
# Seed the forest (same seed as the train/test split) so the fitted and later
# persisted model is reproducible between runs.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
# np.ravel hands scikit-learn a 1-D target whether y_train is a Series or a
# one-column DataFrame, which avoids the column-vector DataConversionWarning.
random_forest.fit(x_train, np.ravel(y_train))

In [None]:
# Predict labels for the held-out test features.
# NOTE(review): y_pred is not referenced by any later cell shown here.
y_pred = random_forest.predict(x_test)

In [None]:
# Accuracy in percent (two decimals), measured on the held-out test set.
# Scoring on the data the forest was fitted on gives an over-optimistic number
# because the trees have already seen those rows.
acc_random_forest = round(random_forest.score(x_test, y_test) * 100, 2)

In [None]:
# Display the random forest accuracy computed in the previous cell.
acc_random_forest

### feature importance

In [None]:
# One row per feature with its importance rounded to three decimals, ordered
# from most to least important and indexed by feature name.
importances = (
    pd.DataFrame(
        {
            "feature": x_train.columns,
            "importance": random_forest.feature_importances_.round(3),
        }
    )
    .sort_values(by="importance", ascending=False)
    .set_index("feature")
)
# Show at most the 15 most important features.
importances.head(15)

In [None]:
# Bar chart of the feature importances in their sorted order.
importances.plot.bar()

### precision and recall

In [None]:
# Out-of-fold predictions for the training rows from 3-fold cross-validation.
predictions = cross_val_predict(
    estimator=random_forest,
    X=x_train,
    y=y_train,
    cv=3,
)
# Confusion matrix of the true training labels against those predictions.
confusion_matrix(y_true=y_train, y_pred=predictions)

In [None]:
# Report precision and recall of the cross-validated training predictions.
for metric_label, metric in (("Precision:", precision_score), ("Recall:", recall_score)):
    print(metric_label, metric(y_train, predictions))

### f1-score

In [None]:
# F1 score of the cross-validated predictions against the true training labels.
f1_score(y_train, predictions)

### persist model

In [None]:
# Persist the fitted forest to random_forest.joblib in the working directory.
dump(random_forest, "random_forest.joblib")