In [29]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [30]:
# import data
df = pd.read_csv("https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv")

In [31]:
df

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...,...
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...


* explore DataFrame

We will be using Tf-Idf to find similar items based on description
* instantiate TF-IDF

In [32]:
tfidf = TfidfVectorizer()

* fit and transform 'description' column with TFIDF

In [33]:
tfidf_matrix = tfidf.fit_transform(df['description'])

In [34]:
tfidf_matrix.shape

(500, 4816)

* calculate the cosine similarity of each item with every other item in the dataset, 

In [35]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [36]:
cosine_sim.shape

(500, 500)

* sort all items using their similarity for each item i, and store the values in dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [72]:
results = {}
for i in range(len(df)):
    score_series = pd.Series(cosine_sim[i]).sort_values(ascending=False)
    idx_list = list(score_series.index)
    for movie_id in df['id']:
        results[movie_id] = idx_list

* create function `recommender` that will recommend similar products
    * function must have two input params: **item_id** and **count** of similar products 

In [79]:
def recommender(item_id, count):
    return results[item_id][1:count]

* show top 5 the most similar items for item with idem_id = 11

In [80]:
recommender(item_id=11, count=5)

[498, 462, 461, 31]