## Création du dataset products

In [1]:
import pymongo
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import string
from unidecode import unidecode
import warnings 
warnings.filterwarnings("ignore")

## Chargement du dataset categories

In [2]:
# Load the categories dataset; it is used later to resolve a product's Category_ID.
df_cat = pd.read_csv("dataset/categories.csv")

# Collect the Category_ID of every category whose Category_parent is set.
category_ID_with_parent = df_cat.loc[df_cat["Category_parent"].notna(), "Category_ID"].tolist()

## Définition d'une fonction qui supprimera les enregistrements dont la valeur, dans une colonne donnée, est None

In [3]:
def remove_rows_with_none(df, column_name):
    """Return the rows of `df` whose value in `column_name` is not missing.

    The input frame is left untouched; surviving rows keep their original
    index labels.
    """
    has_value = df[column_name].notna()
    return df.loc[has_value]

In [4]:
def remove_duplicates_by_column(df, column_name1, column_name2):
    """Return `df` without rows that repeat an earlier (column_name1, column_name2) pair.

    The first occurrence of each pair is kept, the input frame is not
    modified, and surviving rows keep their original index labels.
    """
    key_columns = [column_name1, column_name2]
    is_repeat = df.duplicated(subset=key_columns, keep="first")
    return df.loc[~is_repeat]

## Connexion à ma base de données MongoDB

In [5]:
# Connect to the MongoDB server on localhost (port 27017) and select the
# "dbProductElunea" database; later cells read their collections through `database`.
client = pymongo.MongoClient("mongodb://localhost:27017/") 
database = client["dbProductElunea"] 

## Génération des données de notre dataframe Product

In [6]:
# Category labels as they may arrive from the product data, mapped to the label
# that is looked up in the categories dataset. The mapped label is unaccented
# and title-cased before the comparison, so its own casing does not matter.
_CATEGORY_ALIASES = {
    "Parfums Femmes": "PARFUMS FEMME",
    "Coffret parfum": "Coffret Parfum",
    "Types De Soin": "Type de soin",
    "Soin Corporel": "SOIN DU CORPS",
    "Routine Capillaire": "ROUTINES CAPILLAIRES",
    "Beurre & Huile": "BEURRE & HUILES",
    "Coiffure": "COIFFURES",
}

# (category, owner category) pairs that are resolved by exact name AND an
# explicit parent category instead of by name alone.
_PARENT_SCOPED_CATEGORIES = {
    ("Besoins", "Bain & Douche"): "Corps & Bains",
    ("Solaire", "Corps & Bains"): "Corps & Bains",
}


def _first_category_id(matches, fallback):
    """Return the Category_ID of the first row of `matches`, or `fallback` if it is empty."""
    if matches.empty:
        return fallback
    return matches["Category_ID"].iloc[0]


def search_cat_id(df_cat, cat, owner_cat1):
    """Look up the Category_ID of a product category.

    Parameters
    ----------
    df_cat : pandas.DataFrame
        Categories table; the columns "Category_ID", "Category_name" and
        "Category_parent" are read.
    cat : str
        Category label of the product.
    owner_cat1 : str
        Label of the owning (parent) category; surrounding whitespace is
        ignored.

    Returns
    -------
    The "Category_ID" of the first matching row, or `cat` unchanged when no
    row matches (callers rely on getting a value back rather than None).

    Notes
    -----
    NOTE(review): the previous implementation overwrote `cat` with
    "ACCESOIRES CAPILLAIRES" when it received "Beurre & Huiles", which only
    altered the not-found return value. That looks like a copy-paste slip and
    was dropped; confirm whether an alias for an accessories category was
    intended instead.
    """
    owner_cat = owner_cat1.strip()

    # Some category names exist under several parents: pin the expected parent.
    parent = _PARENT_SCOPED_CATEGORIES.get((cat, owner_cat))
    if parent is not None:
        scoped = df_cat[
            (df_cat["Category_name"] == cat) & (df_cat["Category_parent"] == parent)
        ]
        return _first_category_id(scoped, cat)

    # Generic path: compare unaccented names, with the requested label title-cased.
    lookup_name = unidecode(_CATEGORY_ALIASES.get(cat, cat)).title()
    by_name = df_cat[df_cat["Category_name"].apply(unidecode) == lookup_name]
    return _first_category_id(by_name, cat)

In [7]:
# Access the collection that stores, per category document, its list of products.
col_products = database["cat_with_products"]

# Read every document of the collection into memory.
documents = col_products.find()
data_products = list(documents)

# The membership test below runs once per product, so use a set instead of a list.
valid_category_ids = set(category_ID_with_parent)

all_products = []  # one dict per retained product
count = 1  # sequential number used to build the next Product_ID
for cat_product_current in data_products:  # each document of the collection
    for dic_product_current in cat_product_current["all_products"]:
        # Products without a price are ignored.
        if dic_product_current["product_price"] is None:
            continue

        cat = dic_product_current["main_category"].strip().title()
        owner_cat = dic_product_current["owner_category"].strip().title()
        dic_product = {
            "Product_ID": f"P{count:06d}",
            "Product": dic_product_current["product_id"],
            "Category_ID": search_cat_id(df_cat, cat, owner_cat),
            "Product_name": dic_product_current["name_product"].strip().title(),
            "Description": cat,
            # Prices contain "." and/or spaces as thousands separators: strip both.
            "Price": int(dic_product_current["product_price"].replace(".", " ").replace(" ", "")),
            "Currency": "FCFA",
        }

        # search_cat_id returns the category label itself when nothing matches;
        # such values are not valid IDs and are filtered out here.
        category_id = dic_product["Category_ID"]
        if len(category_id) > 0 and category_id in valid_category_ids:
            all_products.append(dic_product)
            count += 1  # only retained products consume a Product_ID

# Build the products DataFrame in one go. DataFrame.append in a loop is
# quadratic and no longer exists in pandas >= 2.0.
lst_columns = ["Product_ID", "Product", "Category_ID", "Product_name", "Description", "Price", "Currency"]
df_products = pd.DataFrame(all_products, columns=lst_columns)

# Drop rows whose Category_ID is missing.
df_products = remove_rows_with_none(df_products, "Category_ID")
# NOTE(review): Product_ID is generated unique above, so de-duplicating on
# (Product_ID, Category_ID) cannot remove any row. The source key "Product"
# may have been intended; confirm before changing, as it alters the dataset.
df_products = remove_duplicates_by_column(df_products, "Product_ID", "Category_ID")

# Save to a CSV file.
df_products.to_csv("dataset/products.csv", encoding='utf-8-sig', index=False)

In [8]:
df_products.head()

Unnamed: 0,Product_ID,Product,Category_ID,Product_name,Description,Price,Currency
0,P000001,9628,Cat_0002,Armani Luminous Silk Fond De Teint Eclat Parfait,Teint,30000,FCFA
1,P000002,16125,Cat_0002,Armani Luminous Silk Poudre Fusion Eclat,Teint,33000,FCFA
2,P000003,5043,Cat_0002,Artist By Nocibe Fond De Teint Goutte A Goutte,Teint,12000,FCFA
3,P000004,10165,Cat_0002,Bareminerals Fond De Teint Liquide Tenue Perfo...,Teint,25000,FCFA
4,P000005,16583,Cat_0002,Beauty Concepts Xxl Bronzer Illuminant,Teint,10000,FCFA


In [9]:
len(df_products)

1286

In [None]:
df_products