# Cosine Similarity

The final decision of our recommender system: In this notebook, we will select which features of the coffee to compare, transform them, and implement different recommender systems.

In [1]:
import pandas as pd
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np

In [300]:
coffee_df = pd.read_csv('../data/coffee_clean.csv')
name_df = pd.read_csv('../data/coffee_id.csv')

In [345]:
def set_filter(input_slug, filter_on = None, coffees = coffee_df, names = names_df, ):
    try: 
        filtered_coffees = coffees[coffees[filter_on] == 1]
        filtered_names = names_df.merge(filtered_coffees,on='slug',right_index=True)[names_df.columns]
        filtered_slugs = coffees[coffees[filter_on] == 1]['slug']
        
        if input_slug not in list(filtered_slugs):
            filtered_coffees = filtered_coffees.append(coffees[coffees['slug'] == input_slug]).sort_index()
            filtered_names = filtered_names.append(names_df[names_df['slug'] == input_slug]).sort_index()
            filtered_slugs = filtered_slugs.append(coffees[coffees['slug'] == input_slug]['slug']).sort_index()
        
        return (filtered_coffees, filtered_names, filtered_slugs)

    except (KeyError, TypeError) as e:
        print(f"Sorry {e} is not a valid filter")
        pass

## Feature-based Recommender

In [346]:
def features_rec(coffees,slugs):

    ss = StandardScaler()
    ss_fitted = ss.fit_transform(coffees)
    
    #calculate cosine similarities and create dataframe of all similarities
    features_recommender = pairwise_distances(ss_fitted, metric='cosine')
    features_recommender_df = pd.DataFrame(features_recommender, index = slugs, columns = slugs)
    return (features_recommender_df, ss_fitted)

## Text/Description-Based Recommender

Latent Semantic Analysis:
   - Create TFIDF vectors of text data
   - Reduce dimensionality using TruncatedSVD 
   - Create a recommender based on their cosine similarity

In [348]:
def text_rec(coffees,slugs):

    tfidf = TfidfVectorizer(min_df=2, ngram_range=(2,4),max_features=10000)
    tfidf_fitted = tfidf.fit_transform(coffees)

    #TruncatedSVD transformation, number of components
    tsvd = TruncatedSVD(n_components=225,random_state=36)
    tsvd_fitted = tsvd.fit_transform(tfidf_fitted)

    #calculate cosine similarities and create dataframe of all similarities
    text_recommender = pairwise_distances(tsvd_fitted, metric='cosine')


    text_recommender_df = pd.DataFrame(text_recommender, index = slugs, columns = slugs)
    
    return (text_recommender_df, tsvd_fitted)

## Combination Recommender

We have 225 components from our text data and 25 categorical and numerical features.  
Our combination recommender's similarity is based 90% of text, and 10% of categorical and numerical features.

### Function to print recommendations:

In [349]:
def get_recommendations(input_slug, coffees, names, features,
                        pick_best = None, n_nearest = 10,
                        how = 'combination', filter_on = None):
    '''
    Prints coffee recommendation.

    input_slug: {string} Slug of coffee to make comparisons with
    rec_df: DataFrame of recommendations with cosine similarities
    names_df: DataFrame of coffee slugs, name, and roaster
    pick_best: {boolean} Picks the highest rated coffee of the 'n_nearest' most similar coffees.
    n_nearest: {int} Number of coffees to compare the coffees to (when pick_best = True)
    how: {string} 'text','features', or 'combination'
    set_filter: {string}, name of region, type, or roast
    '''
    
    input_name = names[names['slug'] == input_slug]['name'].to_string(index = False)
    input_roaster = names[names['slug'] == input_slug]['roaster'].to_string(index = False)
    slugs = names['slug']
    recommender = None
    
    if filter_on != None:
        coffees, names, slugs = set_filter(input_slug, filter_on = filter_on, coffees = coffees, names = names)

    if how == 'features':
        print("*Choosing recommendation based on ratings, roast, and type*")
        recommender = features_rec(coffees[features],slugs)[0]
    elif how == 'text':
        print("*Choosing recommendation based on text descriptions*")
        recommender = text_rec(coffees['clean_text'],slugs)[0]
    elif how == 'combination':
        print("*Choosing recommendation based on everything*")
        joined = np.concatenate((features_rec(coffees[features],slugs)[1],
                                 text_rec(coffees['clean_text'],slugs)[1],), axis=1)
        distances = pairwise_distances(joined, metric='cosine')
        recommender = pd.DataFrame(distances, index = slugs, columns = slugs)

    sims = names.join(recommender[input_slug], how='outer', on='slug')
    sorted_sims = sims.drop(sims[sims['slug'] == input_slug].index).sort_values(by = input_slug)
    
    if pick_best:
        print("*Recommending the highest rated coffee out of the", n_nearest, "most similar coffees*")
        recs = sorted_sims[0:n_nearest].sort_values(by='rating', ascending=False).iloc[0]
    else:
        print("*Recommending the most similar coffee*")
        recs = sorted_sims.iloc[0]
    
    print("If you like " + input_name + " by " + input_roaster +
         ", you might also like " + recs['name'] + " by " + recs['roaster'] + ".")
    print("\nCompare for yourself:\n",
         "https://www.coffeereview.com/review/" + input_slug,
         "\n https://www.coffeereview.com/review/" + recs['slug'])
    print("\nCosine Similarity: ", round(recs.loc[input_slug],3))

In [350]:
regions = ['region_africa_arabia', 'region_caribbean', 'region_central_america', 
           'region_hawaii', 'region_asia_pacific', 'region_south_america']
types = ['type_espresso', 'type_organic', 'type_fair_trade', 
         'type_decaffeinated', 'type_pod_capsule', 'type_blend', 'type_estate']
roasts = ['roast_dark', 'roast_light', 'roast_medium', 'roast_medium_dark',
       'roast_medium_light', 'roast_very_dark', 'roast_nan']
features = ['aroma','acid_or_milk','body','flavor','type_with_milk'] + roasts + types + regions

In [354]:
test_slug = np.random.choice(df['slug'])

In [358]:
n = 5
best_rating = True

get_recommendations(test_slug, coffee_df, name_df, features, filter_on = 'roast_medium_light',
                    pick_best = best_rating, n_nearest = n, how = 'text')

*Choosing recommendation based on text descriptions*
*Recommending the highest rated coffee out of the 5 most similar coffees*
If you like Costa Coast Blend by Roast House, you might also like Flight Seasonal Espresso by Red Rooster Coffee Roaster.

Compare for yourself:
 https://www.coffeereview.com/review/costa-coast-blend 
 https://www.coffeereview.com/review/flight-seasonal-espresso-2

Cosine Similarity:  0.534
