# Pipeline

1. Preprocessing
2. Cosine Similarity / Nearest Neighbors
3. Build / Test Models

## 1. Preprocessing

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

In [38]:
# Read the whole `user_extract` table from the local SQLite file into `df`.
db_path = '../data/beer.db'
query = "SELECT * FROM user_extract"

# NOTE(review): conn is not closed in the cells visible here -- confirm whether
# a later cell still needs it before adding conn.close().
conn = sqlite3.connect(db_path)
df = pd.read_sql(sql=query, con=conn)

In [39]:
# 1. remove duplicates
# Keeps the first occurrence of every fully identical row; the surviving rows
# keep their original index labels.
df = df.drop_duplicates()

In [40]:
# 2. one-hot encode categorical variables
# The source columns are kept; their indicator columns are appended to df.
# drop_first=True leaves out the indicator of each variable's first category.
# NOTE: re-running this cell appends the indicator columns a second time.
categorical_variables = ['beer_description', 'brewery']
for cat_var in categorical_variables:
    dummies = pd.get_dummies(df[cat_var], prefix=cat_var, drop_first=True)
    df = df.merge(dummies, left_index=True, right_index=True)

In [41]:
# 3. flag outliers
# Report-only step: for every feature, count the values lying outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR]. df itself is not modified.
features = ['ABV', 'global_rating', 'user_rating', 'IBU']
skipnas = True  # when True, missing values are excluded before any statistic

for feature in features:
    # Cast up front so that object-dtype columns (e.g. None mixed with numbers)
    # follow the same path as numeric ones instead of relying on a TypeError.
    feature_values = df[feature].astype(float)

    print("FEATURE {}".format(feature))
    if skipnas and feature_values.isna().any():
        # Comparisons against NaN evaluate to False, so a missing value left in
        # place would fail the "inside the fences" test and be counted as an
        # outlier. Drop them so both the count and the percentage are based on
        # observed values only.
        print("ANALYZING ALL NON-NA VALUES")
        feature_values = feature_values.dropna()

    if len(feature_values) == 0:
        # Nothing to analyze; avoids dividing by zero below.
        print("num of outliers = {:,d}".format(0))
        print("\n")
        continue

    q1 = feature_values.quantile(.25)
    q3 = feature_values.quantile(.75)
    iqr = q3 - q1
    # Both fences are inclusive.
    non_outlier_mask = (feature_values >= q1 - 1.5*iqr) & (feature_values <= q3 + 1.5*iqr)
    outliers = feature_values[~non_outlier_mask]

    print("num of outliers = {:,d}".format(len(outliers)))
    print("% of outliers = {:.2f}%".format(100*len(outliers)/len(feature_values)))
    print("\n")

FEATURE ABV
num of outliers = 3,421
% of outliers = 3.33%


FEATURE global_rating
num of outliers = 3,648
% of outliers = 3.56%


FEATURE user_rating
num of outliers = 11,267
% of outliers = 10.98%


FEATURE IBU
SKIPPING NA VALUES
num of outliers = 557
% of outliers = 1.00%




In [42]:
# 4. Impute missing values
# Replaces the missing entries of each feature with the mean of that feature's
# non-missing values (computed after casting them to float).
features = ['ABV', 'global_rating', 'user_rating', 'IBU']
impute_method = 'mean'

# Validate once, up front: an unsupported value used to skip the imputation
# silently and leave the missing values in df.
if impute_method != 'mean':
    raise ValueError("Please checkout 'impute_method' value")

for feature in features:
    non_nas = df[feature].dropna().astype(float)
    feature_mean = non_nas.mean()
    df[feature] = df[feature].fillna(feature_mean)

## 2. Cosine Similarity / Nearest Neighbors

In [128]:
# 1. Create User-Item Matrix
# Rows are users, columns are beers, cells hold the mean `user_rating` a user
# gave a beer. Cells for beers a user never rated are filled per `fill_method`:
#   'item_mean' -> mean rating of that beer (column mean)
#   'user_mean' -> mean rating given by that user (row mean)
#   0           -> zero
fill_method = 'item_mean'

data = df
values = 'user_rating'
index = 'username'
columns = 'beer_name'
agg_func = 'mean'

# Arguments shared by every variant of the pivot.
pivot_kwargs = dict(data=data, values=values, index=index,
                    columns=columns, aggfunc=agg_func)

if fill_method == 'item_mean':
    ui_matrix = pd.pivot_table(**pivot_kwargs)
    ui_matrix = ui_matrix.fillna(ui_matrix.mean(axis=0), axis=0)

elif fill_method == 'user_mean':
    ui_matrix = pd.pivot_table(**pivot_kwargs)
    # DataFrame.apply returns a new frame; the result has to be assigned back,
    # otherwise the matrix keeps all of its missing values.
    ui_matrix = ui_matrix.apply(lambda row: row.fillna(row.mean()), axis=1)

elif fill_method == 0:
    ui_matrix = pd.pivot_table(fill_value=0, **pivot_kwargs)

else:
    raise ValueError("Please checkout 'fill_method' value")

# Replace the named column index produced by the pivot with plain labels.
ui_matrix.columns = list(ui_matrix.columns)

In [129]:
# 2. Scale / Standardize
# 'standardize' -> replace ui_matrix with a StandardScaler-transformed copy
#                  that keeps the same user index and beer columns
# 'scale'       -> no-op, only reports that the ratings are already bounded
# None          -> no-op
# NOTE: this cell overwrites ui_matrix; re-running it without rebuilding the
# matrix standardizes data that has already been standardized.
scale_standardize = 'standardize'

if scale_standardize == 'scale':
    print('Data is already scaled from 0 to 5')
elif scale_standardize == 'standardize':
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    # fit_transform returns a bare array, so the labels are re-attached here.
    standardized_ui_matrix = pd.DataFrame(scaler.fit_transform(ui_matrix),
                                          index=ui_matrix.index,
                                          columns=ui_matrix.columns)
    ui_matrix = standardized_ui_matrix
elif scale_standardize is None:
    print("Skipping scaling / standardization")
else:
    raise ValueError("Please checkout 'scale_standardize' value")

In [133]:
# Calculate Cosine Similarity
# Scores every other user's row of ui_matrix against the reference user's row
# and collects the result in sim_df (columns: username, sim_score).
user_of_reference = 'tsharp93'

from sklearn.metrics.pairwise import cosine_similarity

is_reference = ui_matrix.index == user_of_reference
if not is_reference.any():
    # Without this check an unknown user yields an empty X and an opaque
    # "0 samples" ValueError from scikit-learn.
    raise ValueError(
        "Please checkout 'user_of_reference' value: {!r} is not in ui_matrix".format(
            user_of_reference))

X = ui_matrix[is_reference]   # the reference user's row
Y = ui_matrix[~is_reference]  # every other user

# cosine_similarity returns one row of scores per row of X; take the first.
sim = cosine_similarity(X, Y)[0].tolist()
names = Y.index

sim_df = pd.DataFrame({'username': names, 'sim_score': sim})

In [134]:
# Five most similar users, best match first.
(
    sim_df
    .sort_values('sim_score', ascending=False)
    .head(5)
)

Unnamed: 0,username,sim_score
18,Gmlman,0.857681
34,Mikeylga,0.85623
39,MythicMan57,0.845642
94,junana71,0.842739
98,kwitnes,0.84031
