# Prerequisites

## Prepare environment

These commands install all required dependencies and download the dataset.

In [None]:
!rm -f ./products.csv

# cope information about required programming libraries for Python
!gsutil cp gs://dct-tt-6-receng-wrkshp/code/requirements.txt requirements.txt 

# copy data from cloud to the environment of the notebook
!gsutil cp gs://dct-tt-6-receng-wrkshp/dataset/big_basket/products.zip products.zip

# unpack the dataset
!unzip -j ./products.zip

# install Python librraies for this exercise
!pip install -r requirements.txt

In [None]:
# import libraries to the programing environment
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Loading data
Load the data into a pandas DataFrame.

In [None]:
# Read the unpacked CSV and report how large the table is.
df = pd.read_csv('products.csv')
rows1 = len(df.index)
cols1 = len(df.columns)
print(f"""The loaded dataset has:
\t- {rows1} rows
\t- {cols1} columns""")

What the data looks like.

Columns:
* **index** - Simply the Index
* **product** - Title of the product (as they're listed)
* **category** - Category into which product has been classified
* **sub_category** - Subcategory into which product has been kept
* **brand** - Brand of the product
* **sale_price** - Price at which product is being sold on the site
* **market_price** - Market price of the product
* **type** - Type into which product falls
* **rating** - Rating the product has got from its consumers
* **description** - Description of the product (in detail)

In [None]:
# Turn the "index" column into the frame's index under the name "product_id".
df = df.rename(columns={"index": "product_id"}).set_index("product_id")

In [None]:
# Preview the first 10 rows of the frame.
df.head(10)

# Find similarities in data


## Handling missing values

In our exercise we can simply drop all rows that contain empty values.

In [None]:
# Count the missing cells per column and list only the columns that have any.
print("There are empty values in dataset:")
empty_stats = df.isnull().sum()
has_missing = empty_stats > 0
empty_columns = list(empty_stats[has_missing].items())
for col in empty_columns:
  print(f"\t- column '{col[0]}' has {col[1]} empty values")

Dropping empty values.

In [None]:
# Keep only the rows in which every column has a value, then report the new size.
df = df.dropna(axis=0, how="any")
rows2 = len(df.index)
cols2 = len(df.columns)
print(f"""The cleared dataset has:
\t- {rows2} rows, {rows1-rows2} rows less
\t- {cols2} columns""")

## Handling outliers

In order to do this we need to understand data distributions. We will test product characteristics:
* category
* sub_category
* brand
* type
* rating

In [None]:
def show_hist(col_name):
  """Draw a bar chart of how many rows of ``df`` hold each distinct value of a column.

  Reads the notebook-level data frame ``df``. Bars are ordered from the most
  to the least frequent value. Tick labels on the x axis are suppressed
  (``xticks=[]``); the title and axis labels name the column so that several
  charts shown one after another can be told apart.

  Args:
    col_name: name of the column in ``df`` to summarise.
  """
  # number of rows per distinct value
  counts = df[col_name].value_counts()
  # the same counts normalized to fractions of the total
  counts_norm = df[col_name].value_counts(normalize=True)
  distr_df = pd.DataFrame({"counts": counts, "counts_norm": counts_norm}).sort_values("counts", ascending=False)
  # show a bar chart of the absolute counts
  ax = distr_df.plot(kind='bar', xticks=[], y="counts", title=f"Distribution of '{col_name}'")
  ax.set_xlabel(f"{col_name} values (most frequent first)")
  ax.set_ylabel("number of rows")

In [None]:
# Value frequencies of the "category" column.
show_hist("category")

In [None]:
# Value frequencies of the "sub_category" column.
show_hist("sub_category")

In [None]:
# Value frequencies of the "brand" column.
show_hist("brand")

In [None]:
# Value frequencies of the "type" column.
show_hist("type")

In [None]:
# Value frequencies of the "rating" column.
show_hist("rating")

### Normalization

MinMax normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale both price columns and store the result in new *_norm columns.
price_columns = ['sale_price', 'market_price']
scaler = MinMaxScaler()
scaled_prices = scaler.fit_transform(df[price_columns])
df[['sale_price_norm', 'market_price_norm']] = scaled_prices

## Converting categorical variables to numerical

All product characteristics presented as literal values will be converted to numerical.

In [None]:
# Keep an independent copy of the frame as it is at this point; the cells
# below add encoded columns to df.
df_original = df.copy()

### Discretization

We will replace literal values with numbers. Two columns can be converted to numeric with this method: "category" and "brand".

In [None]:
from sklearn.preprocessing import LabelEncoder

# One encoder per column: fitting an encoder again replaces the classes it
# learned before, so sharing a single instance would lose the category mapping
# and the category codes could no longer be translated back to names.
category_le = LabelEncoder()
df["category_numeric"] = category_le.fit_transform(df['category'])

# `le` ends up fitted on "brand", the same state it had after the original cell
le = LabelEncoder()
df["brand_numeric"] = le.fit_transform(df['brand'])

Result of the operation:

In [None]:
# Count each (category, code) pair and show the five lowest codes.
category_pairs = df[["category", "category_numeric"]].value_counts()
category_pairs.to_frame().sort_values("category_numeric", axis=0).head(5)

In [None]:
# Count each (brand, code) pair and show the five lowest codes.
brand_pairs = df[["brand", "brand_numeric"]].value_counts()
brand_pairs.to_frame().sort_values("brand_numeric", axis=0).head(5)

### Encode categorical values

There are other columns which need additional transformation before conversion to numeric. These columns are "sub_category" and "type".

In [None]:
from sklearn.preprocessing import OneHotEncoder

def encode_column(df, col_name):
  """Return ``df`` with one-hot indicator columns for ``col_name`` appended.

  The column is encoded with ``OneHotEncoder``; the new columns take the names
  reported by ``get_feature_names_out`` and hold the dense encoder output.
  Rows are matched by position and the caller's index is put back unchanged,
  so the helper works for a frame with any index, not only one named
  "product_id".

  Args:
    df: data frame that contains ``col_name``.
    col_name: name of the column to encode.

  Returns:
    A new data frame with the columns of ``df`` followed by the indicator
    columns, carrying the same index as ``df``.
  """
  ohe = OneHotEncoder()
  encoded = ohe.fit_transform(df[[col_name]]).toarray()
  df_encoded = pd.DataFrame(encoded, columns=ohe.get_feature_names_out([col_name]))
  # Both frames get a 0..n-1 positional index for the merge; the original index
  # is restored afterwards instead of being looked up by a hard-coded name.
  original_index = df.index
  merged = pd.merge(df.reset_index(drop=True), df_encoded, left_index=True, right_index=True, how='inner')
  merged.index = original_index
  return merged

In [None]:
# Append the one-hot columns for the label-encoded category and brand codes.
df = encode_column(df, "category_numeric")
df = encode_column(df, "brand_numeric")

Non-trivial cases: sub-categories and types with complex names.

In [None]:
# Distinct sub_category values that contain "care" (case-insensitive).
df.loc[df['sub_category'].str.contains('care', case=False, na=False), 'sub_category'].unique()

In [None]:
# Distinct type values that contain "care" (case-insensitive).
df.loc[df['type'].str.contains('care', case=False, na=False), 'type'].unique()

We will extract additional features from existing ones to have more characteristics to calculate product similarity.

In [None]:
# r'\bmen' anchors the match at the start of a word; a plain 'men' would also
# match inside "women" and flag those sub-categories as men's.
df["subcat_men"] = df['sub_category'].str.contains(r'\bmen', case=False, na=False).astype(int)

In [None]:
# First rows where the subcat_men flag is set.
df.loc[df["subcat_men"] == 1, ["sub_category", "subcat_men"]].head()

In [None]:
# First rows where the subcat_men flag is not set.
df.loc[df["subcat_men"] == 0, ["sub_category", "subcat_men"]].head()

Extracting features from the sub-category.

In [None]:
# One 0/1 flag column per keyword found in sub_category (case-insensitive).
subcat_groups = ["men", "women", "care", "hair", "skin", "oral"]
for group in subcat_groups:
  # \b anchors the keyword at the start of a word, so "men" does not match
  # inside "women" and "oral" does not match inside e.g. "floral"
  df[f"subcat_{group}"] = df['sub_category'].str.contains(rf"\b{group}", case=False, na=False).astype(int)

Extracting features from the type.

In [None]:
# One 0/1 flag column per keyword found in type (case-insensitive).
type_groups = ["body", "hair", "face", "men", "women", "care", "health", "supplements", "shampoo"]
for group in type_groups:
  # \b anchors the keyword at the start of a word, so "men" does not match
  # inside "women" or "supplements"
  df[f"type_{group}"] = df['type'].str.contains(rf"\b{group}", case=False, na=False).astype(int)

In [None]:
# Flag rows whose type contains any of the food-related words (substring,
# case-insensitive match on an alternation of all the words).
food_words = ["food", "chocolate", "fruit", "snack", "vegetable", "sugar", "noodle", "baguette", "tea", "popcorn", "drink", "seed", "spice", "meal", "milk", "lunch", "cookie", "sauce"]
food_pattern = '|'.join(food_words)
is_food = df['type'].str.contains(food_pattern, case=False, na=False)
df["type_food"] = is_food.astype(int)

All additional columns can be seen in the information about the data frame structure.

In [None]:
# List every column of the frame together with its dtype.
df.dtypes

## Building recommender model

### Using distance metrics

Before the calculation we need to adjust the data frame structure to get unique product identifiers and be able to look up an exact product later.

In [None]:
# try to find similar product manully
cols = ["product", "sub_category", "type", "sale_price", "sale_price_norm"] + [col for col in df.columns if "type_" in col or "subcat_" in col or "category_" in col or "brand_" in col]
df[df["subcat_care"]==1][cols]

In [None]:
# Feature matrix: normalized prices plus every column whose name contains one
# of the feature markers, without the two raw label-encoded code columns.
feature_markers = ("type_", "subcat_", "category_", "brand_")
cols = ["sale_price_norm", "market_price_norm"]
cols += [col for col in df.columns if any(marker in col for marker in feature_markers)]
product_features = df[cols].drop(["brand_numeric", "category_numeric"], axis=1)

#### Cosine distance

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the feature rows of all products.
cosine_sim = cosine_similarity(product_features)

# Label both axes of the similarity matrix with the product ids
product_ids = df.index
cosine_sim_df = pd.DataFrame(data=cosine_sim, index=product_ids, columns=product_ids)

In [None]:
def recommend_products(number, product_id=None):
  """Return products similar to ``product_id``, or the top-rated products.

  Reads the notebook-level frames ``df`` (products) and ``cosine_sim_df``
  (pairwise similarities).

  Args:
    number: how many recommendations are wanted.
    product_id: id of the reference product. When omitted, the ``number``
      products with the highest rating (ties broken by the higher
      ``sale_price_norm``) are returned instead.

  Returns:
    A data frame with the descriptive product columns. For a given
    ``product_id`` it holds the ``number + 1`` entries with the highest
    similarity to that product, most similar first; the product itself is
    normally among them because it is compared with itself as well.

  Raises:
    KeyError: if ``product_id`` is not a row label of ``cosine_sim_df``.
  """
  cols = ["product", "category", "sub_category", "brand", "sale_price", "market_price", "type", "rating", "description"]
  # `is not None` instead of truthiness: 0 is a valid product id
  if product_id is not None:
    # Take the product's row and label it with the row index. The column
    # labels are turned into strings before the Parquet export and would then
    # no longer match the ids in df.
    similarities = pd.Series(cosine_sim_df.loc[product_id].to_numpy(), index=cosine_sim_df.index)
    ranked_ids = similarities.nlargest(number + 1).index
    # select by the ranked ids so the rows keep the similarity order
    return df.loc[ranked_ids, cols]
  # if product is not specified, we will use the top rated products
  return df.sort_values(by=["rating", "sale_price_norm"], ascending=False).head(number)[cols]

In [None]:
# Look up the products most similar to product_id 7.
recommend_products(5, 7)

In [None]:
# Look up the products most similar to product_id 39.
recommend_products(5, 39)

In [None]:
# First rows where the type_men flag is set.
df.loc[df["type_men"] == 1].head()

Store the similarity matrix as a Parquet file so that it can be uploaded to a GCS bucket and used by the API.

In [None]:
# Convert the column labels to strings, then write the matrix as gzip-compressed
# Parquet (presumably because the Parquet writer needs string column names).
string_labels = cosine_sim_df.columns.astype(str)
cosine_sim_df.columns = string_labels
cosine_sim_df.to_parquet("product_distances.parquet.gzip", compression="gzip")

In [None]:
df = pd.read_parquet('product_distances.parquet.gzip')

In [None]:
df.head()