In [1]:
# Imports
import pickle

import pandas as pd
from fuzzywuzzy import fuzz, process
from sklearn.metrics import pairwise_distances

In [2]:
# Import processed data from previous step.
# NOTE: pickle.load can execute arbitrary code from the file it reads; only load
# pickles that were produced by this project's own processing step.
# The `with` blocks close each file handle as soon as its object is loaded.
with open("../data/dataframe.p", "rb") as df_file:
    df = pickle.load(df_file)
with open("../data/ingredients.p", "rb") as ingredients_file:
    ingredients = pickle.load(ingredients_file)

In [3]:
# Preview the first rows of the cocktail table loaded above.
df.head()

Unnamed: 0,strDrink,dateModified,idDrink,strAlcoholic,strCategory,strDrinkThumb,strGlass,strIBA,strIngredient1,strIngredient10,...,strMeasure15,strMeasure2,strMeasure3,strMeasure4,strMeasure5,strMeasure6,strMeasure7,strMeasure8,strMeasure9,strVideo
0,'57 Chevy with a White License Plate,2016-07-18 22:49:04,14029,alcoholic,Cocktail,http://www.thecocktaildb.com/images/media/drin...,highball glass,,Creme de Cacao,,...,,1 oz,,,,,,,,
1,1-900-FUK-MEUP,2016-07-18 22:27:04,15395,alcoholic,Shot,http://www.thecocktaildb.com/images/media/drin...,old-fashioned glass,,Absolut Kurant,,...,,1/4 oz,1/4 oz,1/4 oz,1/4 oz,1/4 oz,1/2 oz,1/4 oz,,
2,110 in the shade,2016-02-03 14:51:57,15423,alcoholic,Beer,http://www.thecocktaildb.com/images/media/drin...,beer glass,,Lager,,...,,1.5 oz,,,,,,,,
3,151 Florida Bushwacker,2016-07-18 22:28:43,14588,alcoholic,Milk / Float / Shake,http://www.thecocktaildb.com/images/media/drin...,beer mug,,Malibu rum,,...,,1/2 oz,1/2 oz Bacardi,1 oz,1 oz,3 oz,1 oz,1 cup,,
4,155 Belmont,2016-10-05 12:36:28,15346,alcoholic,Cocktail,http://www.thecocktaildb.com/images/media/drin...,white wine glass,,Dark rum,,...,,2 shots,1 shot,1 shot,,,,,,


In [4]:
# Preview the first rows of the per-cocktail ingredient table loaded above.
ingredients.head()

Unnamed: 0_level_0,151 proof rum,7-up,absinthe,absolut citron,absolut kurant,absolut peppar,absolut vodka,advocaat,agave syrup,ale,...,white creme de menthe,white rum,wild turkey,wine,worcestershire sauce,wormwood,yellow chartreuse,yoghurt,yukon jack,zima
idDrink,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14029,,,,,,,,,,,...,,,,,,,,,,
15395,,,,,1.0,,,,,,...,,,,,,,,,,
15423,,,,,,,,,,,...,,,,,,,,,,
14588,1.0,,,,,,,,,,...,,,,,,,,,,
15346,,,,,,,,,,,...,,,,,,,,,,


## Recommendations based on title

This system is a good starting point for recommending cocktails to users based on their selection. It doesn't require any historical data beyond the one recipe. This type of system is content-based, because it depends only on the content (the cocktail title), not on user histories.

The approach compares all the cocktails in the database to the one input from the user. It returns the cocktails with the most similar names.

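The measure of similarity here is based on Levenshtein (edit) distance, which is a fast metric to compute. It requires only the title strings and measures the difference between two sequences. Fewer differences between sequences mean a lower distance. The fuzzy-matching scorer used below reports the result as a score from 0 to 100, where a higher score means a closer match.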

In [5]:
# The drink the user selected, and the list of all drink titles to match it against.
query = 'Gin and Tonic'
choices = df['strDrink'].values.tolist()

In [6]:
# Keep the 6 best-scoring titles; each result is a (title, score) tuple.
matches = process.extract(query, choices, limit=6, scorer=fuzz.token_set_ratio)
print(matches)

[('Gin And Tonic', 100), ('Vodka And Tonic', 82), ('Addington', 64), ('London Town', 58), ('Gin Daisy', 55), ('Gin Fizz', 55)]


In [7]:
# Skip the best match: it is expected to be the queried drink itself, which
# should not be recommended back to the user.
# A separate name is used so `choices` keeps meaning "list of all drink titles"
# and re-running the matching cell still works.
suggested_matches = matches[1:]
choices_names = [match[0] for match in suggested_matches]
choices_names

['Vodka And Tonic', 'Addington', 'London Town', 'Gin Daisy', 'Gin Fizz']

In [8]:
# Show the title-based suggestions, one drink per line.
print(f'If you like {query}, you should try these drinks:')
for drink_name in choices_names:
    print(drink_name)

If you like Gin and Tonic, you should try these drinks:
Vodka And Tonic
Addington
London Town
Gin Daisy
Gin Fizz


## Recommendations based on ingredients

This system is an upgrade for recommending cocktails to users based on their selection. It doesn't require any historical data beyond the one recipe. This type of system is content-based, because it depends only on the content (the cocktail ingredients), not on user histories.

The approach compares all the cocktails in the database to the one input from the user. It returns the cocktails with the most similar ingredients.

The measure of similarity here is cosine similarity, which is a fast metric to compute. It requires only the binary ingredient vectors and measures the angle between recipe vectors. A smaller angle between two vectors means higher similarity.

In [9]:
# Start again from the user's drink title and the list of all drink titles.
query = 'Gin and Tonic'
choices = df['strDrink'].values.tolist()

In [10]:
# Fuzzy-match the query against the titles; each result is a (title, score) tuple.
matches = process.extract(query, choices, limit=6, scorer=fuzz.token_set_ratio)
print(matches)

[('Gin And Tonic', 100), ('Vodka And Tonic', 82), ('Addington', 64), ('London Town', 58), ('Gin Daisy', 55), ('Gin Fizz', 55)]


In [11]:
# Resolve the best fuzzy match back to its cocktail id.
top_choice = matches[0]  # (title, score) tuple of the best-scoring title
# `choices` was built from df['strDrink'], so the position of the title in
# `choices` is also the positional row of that drink in df.
# list.index returns the first occurrence if a title appears more than once.
top_choice_idx = choices.index(top_choice[0])
top_choice_row = df.iloc[top_choice_idx, :]
cocktail_id = top_choice_row['idDrink']
cocktail_id

11403

In [12]:
def cos_sim_matrix(M):
    """Build a square cosine-similarity table for the rows of ``M``.

    M: dataframe whose rows are the vectors to compare with each other

    Returns a dataframe labelled on both axes with ``M.index``. Each cell is
    one minus the cosine distance between the two rows, and any missing value
    is replaced with 0.
    """
    labels = M.index
    distances = pairwise_distances(M.values, metric='cosine')
    similarities = pd.DataFrame(1 - distances, index=labels, columns=labels)
    return similarities.fillna(0)

In [13]:
# Treat a missing ingredient flag as 0.
# Rebinding the name (instead of inplace=True) leaves the frame object that was
# loaded and displayed earlier unmodified.
ingredients = ingredients.fillna(0)

In [14]:
# Pairwise similarity between all cocktails; object-dtype columns are excluded
# before the frame is handed to the similarity computation.
cos_scores = cos_sim_matrix(ingredients.select_dtypes(exclude=['O']))
cos_scores.shape

(546, 546)

In [15]:
def get_similar_cocktails(cocktail_id, top_n, cscores, cocktail_data):
    """ Return best matches to a cocktail based on cosine similarity

        cocktail_id: numeric cocktail id; must be a row/column label of cscores
        top_n: number of best matches to return
        cscores: cosine similarity matrix for cocktails as dataframe with cocktail id's
                 as both row and column labels
        cocktail_data: the dataframe of all available cocktails; must contain an
                       idDrink column

        Returns a tuple (top_cocktail_scores, similar_cocktails):
          top_cocktail_scores: series of the top_n highest scores, indexed by
                               cocktail id, best match first
          similar_cocktails: the rows of cocktail_data for those ids, ordered
                             best match first

        Raises KeyError if cocktail_id is not a label of cscores.
    """
    # Drop the cocktail's own column so it can never be recommended to itself.
    top_cocktail_scores = (
        cscores.drop(cocktail_id, axis=1).loc[cocktail_id, :].nlargest(top_n)
    )
    similar_cocktail_ids = top_cocktail_scores.index.values

    # isin() yields rows in cocktail_data order, which throws the ranking away,
    # so reorder the selected rows by each id's position in the ranking.
    rank_by_id = {cid: rank for rank, cid in enumerate(similar_cocktail_ids)}
    selected = cocktail_data[cocktail_data.idDrink.isin(similar_cocktail_ids)]
    ranks = selected.idDrink.map(rank_by_id).reset_index(drop=True)
    # A stable sort keeps the original relative order of rows sharing an id.
    row_order = ranks.sort_values(kind='mergesort').index
    similar_cocktails = selected.iloc[row_order]

    return top_cocktail_scores, similar_cocktails

In [16]:
# Optional: use a random cocktail instead of the title-matched one (uncomment the next line)
# cocktail_id = cos_scores.sample(1).index.values[0]

In [17]:
# Look up the 5 best matches for the selected cocktail in the similarity matrix.
scores, cocktails = get_similar_cocktails(cocktail_id, 5, cos_scores, df)

In [18]:
# Look up the display name of the selected cocktail, then list the suggestions.
last_cocktail = df[df.idDrink == cocktail_id]
last_cocktail_name = last_cocktail['strDrink'].values[0]

print(f"If you like {last_cocktail_name}, you should try these drinks:")
for drink_name in cocktails['strDrink']:
    print(drink_name)

If you like Gin And Tonic, you should try these drinks:
AT&T
Archbishop
Dragonfly
Gin Rickey
Long vodka
