In [1]:
# import pandas
import pandas as pd

# import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# import data
df = pd.read_csv("https://raw.githubusercontent.com/nikitaa30/Content-based-Recommender-System/master/sample-data.csv")

In [3]:
df

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...
3,4,"Alpine guide pants - Skin in, climb ice, switc..."
4,5,"Alpine wind jkt - On high ridges, steep ice an..."
...,...,...
495,496,Cap 2 bottoms - Cut loose from the maddening c...
496,497,Cap 2 crew - This crew takes the edge off fick...
497,498,All-time shell - No need to use that morning T...
498,499,All-wear cargo shorts - All-Wear Cargo Shorts ...


* explore DataFrame

it looks like just the id column and "clothing title name - description" in description.
So we could split off the title of the apparel before the hyphen - it looks like it should be formatted that way

However, let's keep it for now because one column is used to scan the item description and the title of the garment is very useful for similarity

In [4]:
df.head(3)

Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...


We will be using Tf-Idf to find similar items based on description
* instantiate TF-IDF

In [5]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

* fit and transform 'description' column with TFIDF

In [7]:
#Replace NaN with an empty string
df['description'] = df['description'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(df['description'])

In [8]:
tfidf_matrix.shape

(500, 4600)

* calculate the cosine similarity of each item with every other item in the dataset, 

In [9]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [11]:
cosine_sim.shape # 500 items each with a similarity score to every other item, checks out

(500, 500)

In [13]:
cosine_sim[1] # looks good, array of 0 -> 1 similarity scores

array([0.31005145, 1.        , 0.57514356, 0.18551749, 0.16517315,
       0.13065451, 0.14784825, 0.10154623, 0.10983601, 0.08954645,
       0.18021133, 0.16158732, 0.10784675, 0.09073733, 0.18599643,
       0.19864233, 0.08458325, 0.23177543, 0.30147807, 0.15001722,
       0.14583559, 0.16563885, 0.17639399, 0.137909  , 0.18356507,
       0.12907323, 0.10623488, 0.10658333, 0.15266359, 0.15834916,
       0.09514234, 0.16008902, 0.0964505 , 0.18607527, 0.10941243,
       0.10407664, 0.13703602, 0.10710244, 0.10767107, 0.14878977,
       0.11312067, 0.06683643, 0.15743388, 0.16590553, 0.14903443,
       0.16836947, 0.11368272, 0.04495455, 0.06978153, 0.08880231,
       0.08168903, 0.15055182, 0.1381053 , 0.09895573, 0.15137049,
       0.11744756, 0.12512088, 0.17672326, 0.1824234 , 0.12104068,
       0.1793333 , 0.15213401, 0.17384013, 0.17302115, 0.17062298,
       0.10639813, 0.07123265, 0.10254366, 0.0478789 , 0.16268414,
       0.11443416, 0.11057233, 0.13267991, 0.09166135, 0.18177

* sort all items using their similarity for each item i, and store the values in dictionary `results`

```
results = {
    "1": [5,7,9...],
    "2": [45,2,3...]
}
```

In [21]:
import numpy as np

# Create an empty dictionary to store the results
results = {}

# Iterate through each item (row) in the cosine similarity matrix
for i in range(cosine_sim.shape[0]): # Loop through each row, range is the number of rows
    # Get the similarities for the current item (row)
    similarity_scores = cosine_sim[i]
    
    # Sort the indices of the similarities in descending order
    sorted_indices = np.argsort(-similarity_scores)  # Negative sign for descending order
    
    # Store the sorted indices (other item IDs) in the dictionary
    results[str(i + 1)] = sorted_indices.tolist()

    # exclude the first item, which is the item itself
    results[str(i + 1)] = results[str(i + 1)][1:]

# Now `results` contains the sorted items for each item


In [25]:
# test the results dictionary by printing the first 5 items
for key, value in results.items():
    print(key, value)
    if int(key) > 3:
        break


1 [18, 493, 494, 364, 441, 495, 171, 17, 1, 24, 60, 439, 170, 412, 496, 175, 339, 440, 20, 23, 486, 173, 442, 172, 358, 19, 59, 487, 340, 359, 328, 22, 21, 174, 449, 405, 91, 179, 146, 45, 61, 265, 99, 324, 327, 63, 499, 329, 311, 418, 10, 334, 276, 391, 109, 302, 29, 11, 390, 357, 445, 2, 28, 206, 204, 213, 208, 498, 118, 145, 459, 117, 87, 349, 283, 323, 158, 427, 258, 259, 264, 330, 200, 482, 199, 54, 275, 104, 57, 3, 481, 88, 235, 480, 36, 86, 465, 271, 394, 44, 325, 62, 401, 305, 406, 201, 89, 42, 33, 169, 384, 64, 287, 365, 432, 156, 408, 320, 69, 56, 96, 39, 431, 415, 437, 286, 159, 186, 111, 187, 203, 222, 183, 272, 167, 228, 105, 268, 333, 304, 300, 269, 453, 399, 103, 212, 226, 137, 163, 177, 411, 192, 116, 8, 346, 308, 317, 426, 185, 460, 97, 479, 127, 416, 387, 15, 230, 301, 471, 462, 207, 450, 461, 31, 194, 72, 293, 223, 74, 335, 403, 75, 160, 373, 452, 484, 347, 307, 124, 233, 444, 298, 497, 4, 263, 239, 229, 380, 261, 119, 457, 402, 247, 483, 303, 202, 282, 14, 414, 254,

In [17]:
# First construct a reverse mapping of movie titles and DataFrame indices
# Construct a reverse map of indices and movie titles
indices = pd.Series(df.index, index=df['description']).drop_duplicates()

In [18]:
indices[:10]

description
Active classic boxers - There's a reason why our boxers are a cult favorite - they keep their cool, especially in sticky situations. The quick-drying, lightweight underwear takes up minimal space in a travel pack. An exposed, brushed waistband offers next-to-skin softness, five-panel construction with a traditional boxer back for a classic fit, and a functional fly. Made of 3.7-oz 100% recycled polyester with moisture-wicking performance. Inseam (size M) is 4 1/2". Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>"Silky Capilene 1 fabric is ultralight, breathable and quick-to-dry"</li> <li>"Exposed, brushed elastic waistband for comfort"</li> <li>5-panel construction with traditional boxer back</li> <li>"Inseam (size M) is 4 1/2"""</li></ul><br><br><b>Fabric: </b>3.7-oz 100% all-recycled polyester with Gladiodor natural odor control for the garment. Recyclable through the Common Threads Recycling Program<br><br><b>Weight: </b>99 g (3.5

* create function `recommender` that will recommend similar products. function must have two input params: 
    - **item_id** - we can obtain this from the description, but we'll require it to specify what you want similarities to
    - **count** of similar products 

sort all items using their similarity score in the cosine_sim matrix. This will return a list of indices of the items in descending order of similarity score. This is done by using the enumerate function to keep track of the indices of the items

In [28]:
# Function that takes in movie title as input and outputs most similar movies
def recommender(id, count, cosine_sim=cosine_sim):
    # Get the pairwsie similarity scores of all items with the item id you gave
    sim_scores = list(enumerate(cosine_sim[id]))

    # Sort the items based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the descriptions of the number of smilarities the user asked for
    sim_scores = sim_scores[:count]

    # Get the item indices
    item_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return df['description'].iloc[item_indices]

In [30]:
# test out
recommender(10, 5)

10     Baby sunshade top - Soft, stretchy polyester f...
418    Sunshade hoody - Put an end to the sunscreen s...
464    Baby baggies apron dress - This lively dress k...
411    Sunshade shirt - This ultralight, moisture wic...
403    Hooded monk sweatshirt - The sacred garment of...
Name: description, dtype: object

* show top 5 the most similar items for item with item_id = 11

In [31]:
recommender(11, 5) # must be shorts

11     Baggies shorts - Even Baggies, our most popula...
401    River shorts - River life is mighty fine, but ...
407    Baggies shorts - Summertime unwinds in a boist...
427    Girl's baggies shorts - An everyday staple for...
311    Baggies shorts - A loyal partner in grime, Bag...
Name: description, dtype: object

In [33]:
df['description'].iloc[11]

'Baggies shorts - Even Baggies, our most popular shorts for anything, or nothing, occasionally need an update. This season we\'ve increased the inseam length. Their casual fit, quick-drying water-repellent nylon and elasticized waistband with an internal drawstring remain the same as ever. Other features include a polyester mesh lining, a rear snap pocket and front pockets (with self-draining mesh corners) positioned to reduce drag in the water. Inseam (size M) is 7". Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>Quick-drying nylon with a DWR (durable water repellent) finish</li> <li>Elasticized waistband with internal drawstring; black mesh liner</li> <li>Vertical on-seam side pockets for reduced drag in the water; pocket bags have quick-drain-and-dry mesh corners; snap-closed back pocket</li> <li>"Inseam is 7"""</li></ul><br><br><b>Fabric: </b>4.2-oz 100% nylon with a DWR finish. Lining: 5.2-oz 100% polyester mesh. Recyclable through the Comm