# Product Recommendation Engine

## Step 1: Train the engine


## Import Libraries

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

## Examine Data

In [16]:
# Load the product catalogue; the output below shows an `id` and a
# `description` column.
ds = pd.read_csv(filepath_or_buffer="../input/sample-data.csv")
# Peek at the last five rows as a quick sanity check of the load.
ds.tail(n=5)

Unnamed: 0,id,description
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...
499,500,All-wear shorts - Time to simplify? Our All-We...


## TF-IDF Matrix

In [20]:
# Create a TF-IDF matrix of unigrams, bigrams, and trigrams for each product.
# Each row of the resulting sparse matrix corresponds to one row of `ds`.
tf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    # An integer min_df is an absolute document count and must be >= 1;
    # recent scikit-learn releases reject min_df=0 during parameter
    # validation. 1 keeps every term, since each extracted term occurs in
    # at least one document.
    min_df=1,
    # Tell the TF-IDF module to ignore common English words like 'the', etc.
    stop_words='english',
)

tfidf_matrix = tf.fit_transform(
    ds['description']
)

In [None]:
# Compute the pairwise similarity between all products with scikit-learn's
# linear_kernel. TF-IDF rows are L2-normalised by default, so the plain dot
# product is equivalent to cosine similarity here. Passing a single argument
# compares the matrix against itself.
cosine_similarities = linear_kernel(tfidf_matrix)

In [None]:
# Build a lookup of the most similar products for every product.
#
# `results` maps an item id to a list of (score, item_id) tuples sorted from
# most to least similar, e.g. [(0.97, 12), (0.49, 48), ...].
# We stop at 100 neighbours because, well... how many similar products do you
# really need to show?
top_n = 100
results = {}

# Work with positions rather than index labels: row `pos` of
# `cosine_similarities` corresponds to the `pos`-th row of `ds`, regardless
# of what the DataFrame index happens to be.
item_ids = ds['id'].to_numpy()
for pos, item_id in enumerate(item_ids):
    scores = cosine_similarities[pos]
    # argsort is ascending, so reverse it to rank the best matches first.
    ranked = scores.argsort()[::-1]
    # Drop the product itself by position instead of assuming it is ranked
    # first: a product with an identical description ties at the top score
    # and could otherwise be removed in its place.
    neighbours = [i for i in ranked if i != pos][:top_n]
    results[item_id] = [(scores[i], item_ids[i]) for i in neighbours]

## Step 2: Predict!

In [18]:
# Quick-and-dirty helper: turn an item ID into a friendly product name.
def item(id):
    """Return the display name of the product whose 'id' equals *id*.

    The name is the part of the description before the first ' - '
    separator. Raises IndexError when no row of `ds` has that id.
    """
    matches = ds[ds['id'] == id]
    first_description = matches['description'].tolist()[0]
    name, _, _ = first_description.partition(' - ')
    return name

# Pure presentation: the similarities were precomputed into `results`.
def recommend(item_id, num):
    """Print the *num* best-scoring neighbours stored for *item_id*.

    Reads the (score, item_id) tuples from `results` and prints one line
    per recommendation; nothing is returned.
    """
    print(f"Recommending {num!s} products similar to {item(item_id)}...")
    print("-------")
    for score, similar_id in results[item_id][:num]:
        print(f"Recommended: {item(similar_id)} (score:{score!s})")

# Just plug in any item id present in the data (1-500 in the sample file) and
# the number of recommendations you want. At most as many recommendations as
# were stored per item in `results` can be shown.
# You can get a list of valid item IDs by evaluating the variable 'ds'.

recommend(item_id=255, num=3)

Recommending 3 products similar to Solid sunamee btm...
-------
Recommended: Print sunamee btm (score:0.9691357121396879)
Recommended: Print bayonne btm (score:0.4943344107191859)
Recommended: Solid adour btm (score:0.48881326636069045)
