In [1]:
import pandas as pd
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [23]:
df = pd.read_csv('datasets/comp_data3.csv')
df.head()

Unnamed: 0,all_info_combined,all_info_combined_clean,brand,category,description,name,price,store
0,"Organic Valley, Organic Half & Half, Ultra Pas...","organic valley, organic half & half, ultra pas...",Organic Valley,Dairy,When cows spend their days on lush green pastu...,"Organic Valley, Organic Half & Half, Ultra Pas...",4.69,Whole Foods Market
1,"Organic Valley, Organic Half & Half, Ultra Pas...","organic valley, organic half & half, ultra pas...",Organic Valley,Dairy,When cows spend their days on lush green pastu...,"Organic Valley, Organic Half & Half, Ultra Pas...",2.69,Amazon
2,"Organic Valley, Organic Heavy Whipping Cream, ...","organic valley, organic heavy whipping cream, ...",Organic Valley,Dairy,When cows spend their days on lush green pastu...,"Organic Valley, Organic Heavy Whipping Cream, ...",4.49,Amazon
3,Folgers Gourmet Selections Lively Colombian Co...,folgers gourmet selections lively colombian co...,Folgers Gourmet Selections Lively Colombian Co...,Dairy,Folgers Gourmet Selections K-Cups - Med - Colu...,Folgers Gourmet Selections Lively Colombian Co...,9.06,Amazon
4,"Lactaid 2% Reduced Fat Milk, 64 fl oz You won’...","lactaid 2% reduced fat milk, 64 fl oz won’t be...",Lactaid,Dairy,You won’t believe it’s 100% lactose free. Get ...,"Lactaid 2% Reduced Fat Milk, 64 fl oz",3.99,Amazon


In [3]:
df.columns

Index(['all_info_combined', 'brand', 'category', 'description', 'name',
       'price', 'store'],
      dtype='object')

In [24]:
df.isna().sum()

all_info_combined             0
all_info_combined_clean       0
brand                         0
category                      0
description                2840
name                          0
price                       295
store                         0
dtype: int64

In [5]:
df['price'].dropna(how = 'any',inplace=True)
df['price'].isna().sum()    

0

In [88]:
tfidf = TfidfVectorizer(analyzer = 'word', tokenizer= None,
                      preprocessor = None, stop_words='english',
                      max_features=2000, ngram_range=(1,3))

In [89]:
df_features_tfidf = tfidf.fit_transform(df['all_info_combined_clean'])

In [90]:
features = pd.DataFrame(df_features_tfidf.todense(), columns=tfidf.get_feature_names())

In [91]:
features.shape

(13910, 2000)

In [92]:
df['name'].shape

(13910,)

In [93]:
df2 = pd.merge(df['name'],features, right_index=True, left_index=True)

In [94]:
df2.set_index(df2['name'],inplace=True)

In [95]:
df2.drop(columns=['name'],inplace=True)

In [96]:
df2.head()

Unnamed: 0_level_0,000,000 diet,0cal,0cal cholesterol,0cal cholesterol 0mg,0g,0g calcium,0g iron,0g iron protein,0g percentage,...,yogurt day,yogurt day healthy,yogurt provides,yogurt provides key,york,yummy,zero,zero calories,zero sugar,zevia
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"Organic Valley, Organic Half & Half, Ultra Pasteurized, Quart, 32 oz",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Organic Valley, Organic Half & Half, Ultra Pasteurized, Pint, 16 oz",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Organic Valley, Organic Heavy Whipping Cream, Ultra Pasteurized, Pint, 16 oz",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Folgers Gourmet Selections Lively Colombian Coffee, K-Cup for Keurig Brewers, 12 Count, Packaging May Vary",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Lactaid 2% Reduced Fat Milk, 64 fl oz",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
sparse_pivot = sparse.csr_matrix(df2.fillna(0))


In [98]:
recommender = pairwise_distances(sparse_pivot,metric='cosine')

In [102]:
recommender_df[["Stacy's Simply Naked Bagel Chips, 7 oz Bag"]]

name,"Stacy's Simply Naked Bagel Chips, 7 oz Bag"
name,Unnamed: 1_level_1
"Organic Valley, Organic Half & Half, Ultra Pasteurized, Quart, 32 oz",0.997889
"Organic Valley, Organic Half & Half, Ultra Pasteurized, Pint, 16 oz",0.997883
"Organic Valley, Organic Heavy Whipping Cream, Ultra Pasteurized, Pint, 16 oz",0.996139
"Folgers Gourmet Selections Lively Colombian Coffee, K-Cup for Keurig Brewers, 12 Count, Packaging May Vary",1.000000
"Lactaid 2% Reduced Fat Milk, 64 fl oz",0.992296
"Folgers French Roast Ground Coffee, Medium-Dark Roast, 24.2 Ounce, Packaging May Vary",1.000000
"365 Everyday Value Coffee Creamer Vanilla, 16 fl oz",0.992847
"365 Everyday Value, Real Dairy Whipped Cream, 7 oz",0.989129
DairyPure 2% Reduced Fat Milk - Half Gallon,0.988582
"365 Everyday Value, Organic Whole Milk, 128 oz",0.995920


In [99]:
recommender_df = pd.DataFrame(recommender, columns=df2.index, index=df2.index)

In [106]:
df.loc[df['name'].str.contains('Bolthouse'), 'name']

10762    Bolthouse Farms Classic Ranch Yogurt Dressing ...
10763    Bolthouse Farms Avocado Green Goddess Yogurt D...
10764        Bolthouse Organic Avocado Ranch Dressing-12oz
10767           Bolthouse Cilantro Avocado Dressing - 14oz
10768    Bolthouse Farms Classic Balsamic Extra Virgin ...
10769    Bolthouse Farms Organic Lemon Basil Vinaigrett...
10776    Bolthouse Farms Salsa Ranch Yogurt Dressing - ...
10785    Bolthouse Organic Raspberry Balsamic Vinaigret...
10803    Bolthouse Farms Green Giant Carrots Baby Supre...
10823             Bolthouse Farms Carrot Sticks - 12oz Bag
10832       Bolthouse Farms Organic Cello Carrot - 1lb Bag
10971                Bolthouse Farms Green Goodness - 52oz
10984                  Bolthouse Farms Carrot Juice - 52oz
10990    Bolthouse Farms Perfectly Protein Vanilla Chai...
11001    Bolthouse Farms B Strong Protein Coffee Shake ...
11002    Bolthouse Farms B Strong Protein Vanilla Shake...
11005    Bolthouse Farms B Balanced Berry Smoothie - 15.

In [107]:
search = "Bolthouse Farms Green Goodness - 52oz"

for title in df.loc[df['name'].str.contains(search), 'name']:
    print(title)
#     print(recommender_df[title].sort_values()[1:10])
    recommendation = recommender_df[title].sort_values()[1:10]
    print('')
    print('*******************************************************************************************')
    print('')
recom = pd.DataFrame(recommendation)    
recoms = pd.merge(df, recom, on = 'name')
recommended_cat = df.loc[df['name'].str.contains(search),'category'].head(1).values[0]
final = recoms[recoms['category']== recommended_cat]
final[['name','brand', 'store','price','category']]

Bolthouse Farms Green Goodness - 52oz

*******************************************************************************************



Unnamed: 0,name,brand,store,price,category
3,Naked Green Machine Boosted Juice Smoothie - 64oz,Naked,Target,7.39,Beverages
7,Simply Orange Pulp Free with Calcium and Vitam...,Simply Beverages,Target,4.49,Beverages
8,Simply Orange Medium Pulp Juice - 52 fl oz,Simply Beverages,Target,4.49,Beverages


In [108]:
recommendation

name
Birds Eye Steamfresh Selects Frozen Mixed Vegetables - 12oz            0.256164
Baby Cut Carrots - 1lb Bag                                             0.262126
Ready Pac Foods Spring Mix - 5oz                                       0.279460
Tomato Condensed Soup - 10.75oz - Market Pantry™                       0.285203
Ready Pac Spinach Spring Mix - 5.25oz                                  0.285570
Simply Orange Pulp Free with Calcium and Vitamin D Juice - 52 fl oz    0.295741
Ore-Ida Shredded Hash Frozen Brown Potatoes - 30oz                     0.301673
Simply Orange Medium Pulp Juice - 52 fl oz                             0.304312
Naked Green Machine Boosted Juice Smoothie - 64oz                      0.306879
Name: Bolthouse Farms Green Goodness - 52oz, dtype: float64

In [None]:
recoms
recom = pd.merge(df, recommendation, on = 'name')


In [None]:
pd.DataFrame(recommendation)

In [None]:
df[df['name'] == ]

In [None]:
recommended_category

In [None]:
search ='Red Bull'
df.loc[df['name'].str.contains(search), 'name']

In [None]:
recommender_df['Red Bull Energy Drink - 4pk/12 fl oz Cans'].sort_values[0:10]