In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from difflib import get_close_matches

In [4]:
train_data = pd.read_csv('Raw_data.csv')

In [6]:
train_data.columns

Index(['Uniq Id', 'Crawl Timestamp', 'Dataset Origin', 'Product Id',
       'Product Barcode', 'Product Company Type Source',
       'Product Brand Source', 'Product Brand Normalised Source',
       'Product Name Source', 'Match Rank', 'Match Score', 'Match Type',
       'Retailer', 'Product Category', 'Product Brand', 'Product Name',
       'Product Price', 'Sku', 'Upc', 'Product Url', 'Market',
       'Product Description', 'Product Currency',
       'Product Available Inventory', 'Product Image Url',
       'Product Model Number', 'Product Tags', 'Product Contents',
       'Product Rating', 'Product Reviews Count', 'Bsr', 'Joining Key'],
      dtype='object')

In [7]:
train_data = train_data[['Uniq Id','Product Id','Product Rating','Product Reviews Count','Product Category','Product Brand','Product Name','Product Image Url','Product Description','Product Tags']]

In [8]:
train_data.shape

(5000, 10)

In [9]:
train_data.head()

Unnamed: 0,Uniq Id,Product Id,Product Rating,Product Reviews Count,Product Category,Product Brand,Product Name,Product Image Url,Product Description,Product Tags
0,1705736792d82aa2f2d3caf1c07c53f4,2e17bf4acecdece67fc00f07ad62c910,,,Premium Beauty > Premium Makeup > Premium Nail...,OPI,"OPI Infinite Shine, Nail Lacquer Nail Polish, ...",https://i5.walmartimages.com/asr/0e1f4c51-c1a4...,,"OPI Infinite Shine, Nail Lacquer Nail Polish, ..."
1,95a9fe6f4810fcfc7ff244fd06784f11,076e5854a62dd283c253d6bae415af1f,,,Beauty > Hair Care > Hair Color > Auburn Hair ...,Nice'n Easy,"Nice n Easy Permanent Color, 111 Natural Mediu...",https://i5.walmartimages.com/asr/9c8e42e4-13a5...,Pack of 3 Pack of 3 for the UPC: 381519000201 ...,"Nice 'n Easy Permanent Color, 111 Natural Medi..."
2,8d4d0330178d3ed181b15a4102b287f2,8a4fe5d9c7a6ed26cc44d785a454b124,4.5,29221.0,Beauty > Hair Care > Hair Color > Permanent Ha...,Clairol,Clairol Nice N Easy Permanent Color 7/106A Nat...,https://i5.walmartimages.com/asr/e3a601c2-6a2b...,This Clairol Nice N Easy Permanent Color gives...,Clairol Nice 'N Easy Permanent Color 7/106A Na...
3,fddc4df45b35efd886794b261f730c51,03b5fb878a33eadff8b033419eab9669,,,Beauty > Makeup > Lip,Kokie Cosmetics,"Kokie Professional Matte Lipstick, Hot Berry, ...",https://i5.walmartimages.com/asr/25b4b467-bc61...,Calling all matte lip lovers! Indulge in our r...,"Kokie Professional Matte Lipstick, Hot Berry, ..."
4,0990cf89a59ca6a0460349a3e4f51d42,ce3d761e57d6ccad80619297b5b1bcbc,,131.0,Seasonal > Stock Up Essentials > Personal Care...,Gillette,"Gillette TRAC II Plus Razor Blade Refills, Fit...",https://i5.walmartimages.com/asr/1a2ebb06-cd01...,"In 1971, Gillette introduced the Trac II razor...","Gillette TRAC II Plus Razor Blade Refills, Fit..."


In [10]:
train_data.isnull().sum()

Uniq Id                     0
Product Id                  0
Product Rating           2806
Product Reviews Count    1654
Product Category           10
Product Brand              13
Product Name                0
Product Image Url           0
Product Description      1127
Product Tags                0
dtype: int64

In [11]:
train_data['Product Rating'].fillna(0, inplace=True)
train_data['Product Reviews Count'].fillna(0, inplace=True)
train_data['Product Category'].fillna('', inplace=True)
train_data['Product Brand'].fillna('', inplace=True)
train_data['Product Description'].fillna('', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Product Rating'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['Product Reviews Count'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we

In [12]:
train_data.duplicated().sum()

np.int64(0)

In [13]:
Column_names = {
    'Uniq Id': 'Id',
    'Product Id': 'ProdId',
    'Product Rating': 'Rating',
    'Product Reviews Count': 'ReviewCount',
    'Product Category': 'Category',
    'Product Brand': 'Brand',
    'Product Name': 'Name',
    'Product Image Url': 'Image',
    'Product Description': 'Description',
    'Product Tags': 'Tags'
}

In [14]:
train_data.rename(columns=Column_names, inplace=True)

In [15]:
train_data['Id'] = train_data['Id'].str.extract(r'(\d+)').astype(float)
train_data['ProdId'] = train_data['ProdId'].str.extract(r'(\d+)').astype(float)

In [16]:
train_data.columns

Index(['Id', 'ProdId', 'Rating', 'ReviewCount', 'Category', 'Brand', 'Name',
       'Image', 'Description', 'Tags'],
      dtype='object')

In [15]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')
def extract_tags(text):
    doc = nlp(text.lower())
    tags = [token.text for token in doc if token.text.isalnum() and token.text not in STOP_WORDS]
    return ' '.join(tags)
Column_text = ['Category','Brand','Name','Description','Tags']
for col in Column_text:
    train_data[col] = train_data[col].apply(extract_tags)

In [16]:
train_data['Tags'] = train_data[Column_text].apply(lambda x: ', '.join(x), axis=1)

In [None]:
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)

In [18]:
def Content_based_recommendations(train_data, item_name, top_n=5):
    tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix_content = tfidf_vectorizer.fit_transform(train_data['Tags'])
    cosine_similarities_content = cosine_similarity(tfidf_matrix_content, tfidf_matrix_content)
    item_name = get_close_matches(item_name, train_data['Name'], n=1, cutoff=0.6)[0]
    item_index = train_data[train_data['Name'] == item_name].index[0]
    similar_items = list(enumerate(cosine_similarities_content[item_index]))
    similar_items = sorted(similar_items, key=lambda x: x[1], reverse=True)
    top_similar_items = similar_items[0:top_n]
    recommended_item_indices = [x[0] for x in top_similar_items]
    recommended_items_details = train_data.iloc[recommended_item_indices][['Name', 'ReviewCount', 'Brand', 'Image', 'Rating']]
    return recommended_items_details

In [None]:
def Collaborative_neighbours_based_recommendations(data,target_user_id, top_n=10):
    user_item_matrix = data.pivot_table(index='Id', columns='productid', values='Rating', aggfunc='mean').fillna(0)
    if target_user_id not in user_item_matrix.index:
        raise ValueError(f"User with Id {target_user_id} not found.")
    user_item_sparse = csr_matrix(user_item_matrix.values)
    model.fit(user_item_sparse)
    target_user_idx = user_item_matrix.index.get_loc(target_user_id)
    distances, indices = model.kneighbors([user_item_matrix.iloc[target_user_idx]], n_neighbors=top_n+1)
    similar_user_indices = indices.flatten()[1:]
    recommended_product_ids = set()
    target_user_rated = set(user_item_matrix.columns[user_item_matrix.iloc[target_user_idx] > 0])
    for idx in similar_user_indices:
        similar_user_ratings = user_item_matrix.iloc[idx]
        unrated_by_target = (similar_user_ratings > 0) & (~user_item_matrix.columns.isin(target_user_rated))
        recommended_product_ids.update(user_item_matrix.columns[unrated_by_target])
        if len(recommended_product_ids) >= top_n:
            break
    recommended_items_details = data[data['productid'].isin(list(recommended_product_ids))][['name', 'ReviewCount', 'brand', 'image_url', 'Rating']]
    return recommended_items_details.head(top_n)

In [19]:
def Collaborative_based_recommendations(train_data, target_user_id, top_n=10):
    user_item_matrix = train_data.pivot_table(index='Id', columns='ProdId', values='Rating', aggfunc='mean').fillna(0)
    user_similarity = cosine_similarity(user_item_matrix)
    target_user_index = user_item_matrix.index.get_loc(target_user_id)
    user_similarities = user_similarity[target_user_index]
    similar_users_indices = user_similarities.argsort()[::-1][1:]
    recommended_items = []
    for user_index in similar_users_indices:
        rated_by_similar_user = user_item_matrix.iloc[user_index]
        not_rated_by_target_user = (rated_by_similar_user == 0) & (user_item_matrix.iloc[target_user_index] == 0)
        recommended_items.extend(user_item_matrix.columns[not_rated_by_target_user][:top_n])
    recommended_items_details = train_data[train_data['ProdId'].isin(recommended_items)][['Name', 'ReviewCount', 'Brand', 'Image', 'Rating']]
    return recommended_items_details.head(10)

In [None]:
def hybrid_recommendations(products_data,users_data, target_user_id, item_name, top_n=10):
    content_based_rec = Content_based_recommendations(products_data, item_name, top_n)
    user_data_db = pd.read_csv("User_data.csv")
    new_user = user_data_db[user_data_db['Id'] == target_user_id]
    collaborative_filtering_rec = Collaborative_based_recommendations(users_data, target_user_id, top_n)
    collaborative_model_filtering_rec = Collaborative_neighbours_based_recommendations(users_data,target_user_id,top_n)
    hybrid_recommendation = pd.concat([content_based_rec,collaborative_model_filtering_rec, collaborative_filtering_rec]).drop_duplicates()
    return hybrid_recommendation.head(10)