# Project : Myntra Products Recommendation

In [1]:
# Import Necessary Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("dataset.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1060213 entries, 0 to 1060212
Data columns (total 11 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   id           1060213 non-null  int64  
 1   name         1060213 non-null  object 
 2   img          1060213 non-null  object 
 3   asin         1060213 non-null  object 
 4   price        1060213 non-null  float64
 5   mrp          1060213 non-null  float64
 6   rating       1060213 non-null  float64
 7   ratingTotal  1060213 non-null  int64  
 8   discount     1060213 non-null  int64  
 9   seller       1060213 non-null  object 
 10  purl         1060213 non-null  object 
dtypes: float64(3), int64(3), object(5)
memory usage: 89.0+ MB


In [4]:
df.head()

Unnamed: 0,id,name,img,asin,price,mrp,rating,ratingTotal,discount,seller,purl
0,1,Men Solid Oversized Cotton,"https://assets.myntassets.com/f_webp,dpr_1.0,q...",-,532.0,1299.0,4.1,5300,59,Difference of Opinion,https://www.myntra.com/tshirts/difference-of-o...
1,2,Men Cotton Pure Cotton T-shirt,"https://assets.myntassets.com/f_webp,dpr_1.0,q...",-,274.0,499.0,4.2,25400,45,Roadster,https://www.myntra.com/tshirts/roadster/roadst...
2,3,Women Pure Cotton T-shirt,"https://assets.myntassets.com/f_webp,dpr_1.0,q...",-,551.0,1199.0,4.5,3400,54,DILLINGER,https://www.myntra.com/tshirts/dillinger/dilli...
3,4,Typography Print T-shirt,"https://assets.myntassets.com/f_webp,dpr_1.0,q...",-,296.0,1099.0,4.0,18400,73,Huetrap,https://www.myntra.com/tshirts/huetrap/huetrap...
4,5,Printed Round Neck Pure Cotton T-shirt,"https://assets.myntassets.com/f_webp,dpr_1.0,q...",-,494.0,899.0,4.2,3800,45,Roadster,https://www.myntra.com/tshirts/roadster/roadst...


In [6]:
df.duplicated().sum()

0

## Popular products

In [69]:
data_unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 217738 entries, 0 to 1060204
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           217738 non-null  int64  
 1   name         217738 non-null  object 
 2   img          217738 non-null  object 
 3   asin         217738 non-null  object 
 4   price        217738 non-null  float64
 5   mrp          217738 non-null  float64
 6   rating       217738 non-null  float64
 7   ratingTotal  217738 non-null  int64  
 8   discount     217738 non-null  int64  
 9   seller       217738 non-null  object 
 10  purl         217738 non-null  object 
dtypes: float64(3), int64(3), object(5)
memory usage: 19.9+ MB


In [72]:
data_unique.sort_values(by="ratingTotal",ascending=False).head(10)[["name","price","rating","discount"]].reset_index().drop("index",axis=1)

Unnamed: 0,name,price,rating,discount
0,toner-lotion-cream-lip balm,695.0,4.4,35
1,toner-sunscreen-lotion-cream,719.0,4.4,34
2,sustainable skin & hair care,926.0,4.4,38
3,men pack of 2 printed tshirts,1598.0,4.2,0
4,serum-toner-scrub-cream,636.0,4.4,35
5,bio complete skin care kit,501.0,4.4,38
6,toner-lotion-eye gel-cream,1030.0,4.4,0
7,lipstick with mascara & liner,1100.0,4.4,30
8,set of toner & scrub & cream,428.0,4.4,35
9,skincare daily routine combo,483.0,4.4,38


## Recommendation using Product-Product Similarity


In [7]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [8]:
data_unique = df.drop_duplicates(subset='name', keep='first')

In [9]:
data_unique.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 217738 entries, 0 to 1060204
Data columns (total 11 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   id           217738 non-null  int64  
 1   name         217738 non-null  object 
 2   img          217738 non-null  object 
 3   asin         217738 non-null  object 
 4   price        217738 non-null  float64
 5   mrp          217738 non-null  float64
 6   rating       217738 non-null  float64
 7   ratingTotal  217738 non-null  int64  
 8   discount     217738 non-null  int64  
 9   seller       217738 non-null  object 
 10  purl         217738 non-null  object 
dtypes: float64(3), int64(3), object(5)
memory usage: 19.9+ MB


In [143]:
data_unique.reset_index(inplace=True)

In [147]:
data_unique.drop(["level_0","index"],axis=1,inplace=True)

In [148]:
len(data_unique.name.unique())

211340

In [149]:
data_unique["name"] = data_unique["name"].apply(str.lower)

In [150]:
from sklearn.metrics.pairwise import cosine_similarity

data = data_unique.head(10000)

In [126]:
def compute_similarity_matrix(scaled_df):
    
    similarity_matrix = pd.DataFrame(
        cosine_similarity(scaled_df[['price', 'rating', 'discount']]),
        index=scaled_df['name'],
        columns=scaled_df['name']
    )
    return similarity_matrix

In [127]:
product_similarity_df = compute_similarity_matrix(data)

In [128]:
def get_similar_products(product_name, similarity_matrix, data, top_n=5):
    product_name = str.lower(product_name)
    if product_name not in similarity_matrix.columns:
        return f"Product '{product_name}' not found in the similarity matrix."

    similar_scores = similarity_matrix[product_name]
    similar_products = similar_scores.sort_values(ascending=False).head(top_n + 1)
    similar_products = similar_products.iloc[1:]
    recommendations = data[data['name'].isin(similar_products.index)][[ "name", "price", "rating", "discount"]]
    
    return recommendations.reset_index().drop("index",axis=1)

In [129]:
recommended_products = get_similar_products("women gold-plated necklace set", product_similarity_df,data, top_n=5)
print(recommended_products)

                                                name   price  rating  discount
0  men self-designed slim-fit single breasted ban...  2199.0     3.0        60
1       24k gold-plated stones-studded jewellery set  2550.0     3.8        70
2  24k gold-plated pearl beaded handcrafted filig...  2380.0     4.2        65
3                 men embroidered kurta with patiala  2199.0     4.5        60
4  rhodium-plated american diamonds-studded handc...  2609.0     4.3        71


In [131]:
data.to_csv("data.csv",index=False)

# Recommendation System Using TFIDF Method

In [132]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [133]:
def compute_similarity(data):
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(data['name'])
    cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
    return cosine_sim

In [134]:
product_similarity = compute_similarity(data)

In [155]:
def recommendations(product_name):
    if product_name not in data['name'].values:
        return jsonify({'error': 'Product not found'}), 404

    # Get index of product
    idx = data.index[data['name'] == product_name].tolist()[0]

    # Get similarity scores
    sim_scores = list(enumerate(product_similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:6]

    # Get product indices and return details
    product_indices = [i[0] for i in sim_scores]
    recommendations = data.iloc[product_indices][[ 'name', 'price', 'rating', 'discount']]

    return recommendations.reset_index().drop("index",axis=1) 

In [158]:
recommendation = recommendations("women gold-plated necklace set")
print(recommendation)

                            name   price  rating  discount
0     women gold-plated necklace   934.0     3.2        83
1       gold-plated necklace set  2240.0     4.5        68
2  set of 2 gold-plated necklace   292.0     4.1        78
3           gold-plated necklace   339.0     0.0        80
4   24k gold plated necklace set  3000.0     4.0        75
