In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import spacy
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from textblob import TextBlob
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re



In [2]:
measurements = {"g": ["gr.", "gr", "g", "g.", "gramme"], "kg": ["kg.", "kg", "kilogramme"], "mg": ["mg.", "mg", "miligramme"],
                "ml": ["ml.", "ml", "mililitre"], "cl": ["cl.", "cl", "centilitre"], "l": ["l.", "l", "litre"],
                "cuillère à soupe": ["c. à soupe", "cuillère à soupe", "cs", "cas", "càs", "c. à table", "cuillère à table"],
                "cuillère à café": ["c. à café", "cuillère à café", "cc", "cac", "càc", "c. à thé", "cuillère à thé"],
                "tasse": ["tasse"], "bol": ["bol"], "verre": ["verre"], "filet": ["filet"], "zeste": ["zeste"], "pièce": ["pièce"],
                "noisette": ["noisette"], "noix": ["noix"], "pincée": ["pincée"], "pointe":["pointe"], "poignée": ["poignée"],
                "feuille": ["feuille"], "branche": ["branche"], "gousse": ["gousse"], "tranche": ["tranche"], "cube": ["cube"],
                "boîte": ["boîte"], "barquette": ["barquette"], "pot": ["pot"], "bâtonnet": ["bâtonnet"], "boule": ["boule"],
                "rouleau": ["rouleau"], "p": ["p"]}

units = [item for sublist in measurements.values() for item in sublist]

quantities = { "½": 0.5, "1/2": 0.5, "1½": 1.5, "1 1/2": 0.5, "⅓": 0.33, "1/3": 0.33, "¼": 0.25, "1/5": 0.2, "¼": 0.25, "un demi": 0.5, "une demi": 0.5,"un et demi": 1.5,
              "tiers": 0.33, "quart": 0.25, "zero": 0, "deux": 2, "trois": 3, "quatre": 4,"cinq": 5, "six": 6, "sept": 7,
              "huit": 8, "neuf": 9, "dix": 10, "onze": 11, "douze": 12, "treize": 13, "quatorze": 14, "quinze": 15, "seize": 16,
              "dix-sept": 17, "dix-huit": 18, "dix-neuf": 19, "vingt": 20, "trente": 30, "quarante": 40, "cinquante": 50,
              "soixante": 60, "soixante-dix": 70, "quatre-vingt": 80, "quatre-vingt-dix": 90, "dizaine": 10, "une dizaine": 10,
              "douzaine": 12, "une douzaine": 12, "demi-douzaine": 6, "une demi-douzaine": 6, "vingtaine": 20, "une vingtaine": 20,
              "trentaine": 30, "quarantaine": 40, "cinquantaine": 50, "centaine": 100, "une centaine": 100, "cent": 100,
              "un": 1, "une": 1}

char_list = [i for i in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZéèàâêïù▪️"]


In [3]:
web_product_data = pd.read_csv ('/Users/vincentsalamand/Downloads/products_leclerc.csv')
print(len(web_product_data))

data = pd.DataFrame()
data['quantity_match'] = ""
data['unit_match'] = ""
data['price_match'] = ""
data['price_per_unit_match'] = ""
data['is_promo'] = ""
data['offer'] = ""
data = pd.concat([data, web_product_data], axis=1)
data['shelter'] = ""
data['store'] = "Leclerc"



6152


In [4]:
data['price_match'] = data.price.apply(lambda x: float(x.split()[0]))
data['price_per_unit_match'] = data.price_per_unit.apply(lambda x: float(x.split()[0]))
data.quantity_match = round(data.price_match / data.price_per_unit_match, 3)
#data['promo_price_per_unit_match'] = data.promo_price.apply(lambda x: float(re.sub("[^\d.]+", "", x)))

# Get names of indexes for which column price_per_unit_match is belo 0.03
missing_price_unit = data[data.price_per_unit_match < 0.03].index
# Delete these row indexes from dataFrame
data.drop(missing_price_unit , inplace=True)

In [5]:
# match description to find unit
nlp = French()

def convert_unit(value):
    results = []
    result = [unit for unit, item in measurements.items() if value in item]
    if len(result) > 0:
        return result[0]

measurement_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
measurement_patterns = list(nlp.tokenizer.pipe(list(units)))
measurement_matcher.add("MEASUREMENT_PATTERN", None, *measurement_patterns)
tokenizer = Tokenizer(nlp.vocab)

measurement_parser = []
price_per_unit = list(nlp.pipe(data.price_per_unit))

for line in price_per_unit:
    line = tokenizer(re.sub('[0-9]', ' ', line.text))
    matches = measurement_matcher(line)
    elements = []
    if len(matches) > 0:
        for match_id, start, end in matches:
            span = line[start:end]
            elements.append(span.text.lower())
        measurement_parser.append(convert_unit(elements[0]))
    else:
        text_blob_object = TextBlob(line.text)
        singular_line = ' '.join(text_blob_object.words.singularize())
        matches = measurement_matcher(tokenizer(singular_line))
        if len(matches) > 0:
            for match_id, start, end in matches:
                span = tokenizer(singular_line)[start:end]
                elements.append(span.text.lower())
            measurement_parser.append(convert_unit(elements[0]))
        else:
            plural_line = ' '.join(text_blob_object.words.pluralize())
            matches = measurement_matcher(tokenizer(plural_line))
            if len(matches) > 0:
                for match_id, start, end in matches:
                    span = tokenizer(plural_line)[start:end]
                    elements.append(span.text.lower())
                measurement_parser.append(convert_unit(elements[0]))
            else:
                 measurement_parser.append("")

data['unit_match'] = measurement_parser

In [6]:
# Add ean & is_frozen column based on db_products + clean_description to avoid duplicate process later on
db_products = pd.read_csv('/Users/vincentsalamand/Downloads/product_export.csv')
db_store_items = pd.read_csv('/Users/vincentsalamand/Downloads/store_items_export.csv')

db_products_ean = db_products[['id', 'ean', 'is_frozen']]
db_store_items_ean = db_store_items.merge(db_products_ean, left_on=['product_id'], right_on=['id'])
db_store_items_ean = db_store_items_ean[['clean_name', 'store_product_id', 'ean', 'is_frozen']]

data = pd.merge(data, db_store_items_ean.rename(columns={'store_product_id':'product_id'}), on='product_id',  how='left')


In [7]:
# find if product is in promo
# rename piece unit
# get shelter list

for index, product in data.iterrows():
    if data.loc[index].offer_description == data.loc[index].price:
        data.loc[index,'is_promo'] = False
        data.loc[index,'offer_description'] = np.nan
    else:
        data.loc[index,'is_promo'] = True
    if data.loc[index].unit_match == "p":
        data.loc[index,'unit_match'] = "pièce"
    if data.loc[index].is_frozen == "f":
        data.loc[index,'is_frozen'] = False   
    if data.loc[index].is_frozen == "t":
        data.loc[index,'is_frozen'] = True         
    shelter = []
    shelter.append(data.loc[index].shelter_main)
    shelter.append(data.loc[index].shelter_parent)
    shelter.append(data.loc[index].shelter_child)
    data.at[index,'shelter'] = shelter



In [8]:
from selenium import webdriver
import requests
from bs4 import BeautifulSoup, CData
import html
from multiprocessing.dummy import Pool 

driver = webdriver.Chrome("/Applications/chromedriver")

def scrape_leclerc_product_pages(data):
    for index, product in data.iterrows():
        if pd.isna(data.iloc[index].ean):
            url = product.url
            print(url)
            driver.get(url)
            soup = BeautifulSoup(driver.page_source)
            product_info = []
            page = html.unescape(soup.text)
            page = page[(page.find('objProduit')-2):(page.find('objContenu')-2)]
            for line in page.split(","):
                if "fSurgele" in line:
                    data.loc[data.url == url].is_frozen = line[11:]
                if "sCodeEAN" in line:
                    data.loc[data.url == url].ean = line[12:-1]

scrape_leclerc_product_pages(data)
#p = Pool(10)
#p.map(scrape_leclerc_product_pages, data.url)
#p.terminate()
#p.join()

https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-85458-Steak-viande-Limousine.aspx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-85459-Escalope-de-veau-Francais-.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-31372-Agneau-souris-.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-51382-Poitrine-tranchee-x7.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-85493-Pave-de-porc-Label-Rouge-.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-84800-Saute-de-porc.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-26098-Steak-hache-Ferial.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-105193-Steack-Hache-LExquis-Ferial.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-105201-Steack-Hachee-Charal.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-87812-Steack-hache-pur-boeuf---12mg.aspx
https

https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-93182-Pains-facon-Bretzel-Cereales-.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-37480-Brioche-PitchPasquier.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-43918-Galette-des-rois-Pasquier.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-55220-Couronne-des-rois-Pasquier-.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-40051-Crepes-bretonnes-Saint-Michel.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-92534-Madeleine-Cereal-Bio.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-12939-Lait-GrandLait-Bio-Candia.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-2613-Lait-Delisse-UHT-bouteille.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-69101-Lait

https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-109394-Yaourts-bio-Les-2-vaches.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-7973-Faisselle-Rians.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-107166-Faisselle-Rians.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-62357-Faisselle-Rians.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-68790-Yaourt-laitier-Bio-Village.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-84470-Yaourts-Bio-Danone.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-108238-Yaourt-brasse-bio-Les-2-Vaches.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-82309-Yaourt-brasse-Bio-Les-2-vaches.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-108238-Yaourt-brasse-bio-Les-2-Vache

https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-75523-Hache-vegetal-Ail-Persil-.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-104289-Quinoa-Ebly.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-114931-Riz-Uncle-Bens.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-91475-Riz-Comptoir-du-grain.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-114152-Pates-Bons-tuyaux-Jardin-Bio.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-8564-Pates-Linguine-Barilla.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-187-Pates-Fusilli-Barilla.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-8415-Pates-Coquillettes-Turini.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-48123-Pate-Radiatori-Garofalo.aspx
https://fd

https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-396-Cafe-Lavazza-Mattino.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-8853-Cappuccino-soluble-Plantation.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-110090-Doypack-Banania.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-111464-The-vert-Bio-Pages.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-42455-The-vert-Lipton.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-111460-Infusion-ligne-Bio-Pages.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-111459-Infusion-nuit-Bio-Pages.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-100363-Tisane-Digestion-naturel-Yogi.aspx
https://fd4-courses.leclercdrive.fr/magasin-127811-Bois-dArcy/fiche-produits-78400-Infusion-Yogi.aspx
https://fd4

In [12]:
data.head()

Unnamed: 0,quantity_match,unit_match,price_match,price_per_unit_match,is_promo,offer,shelter_main,shelter_parent,shelter_child,description1,description2,brand,price,price_per_unit,offer_description,origin,image_url,url,product_id,shelter,store,clean_name,ean,is_frozen
0,0.16,kg,4.85,30.31,False,,Viandes Poissons,Boucherie,Viande bovine,Viande bovine 1x faux filet ***,à griller Bio - 160g,,4.85 €,30.31 € / kg,,,https://fd4-photos.leclercdrive.fr/image.ashx?...,https://fd4-courses.leclercdrive.fr/magasin-12...,54134,"[Viandes Poissons, Boucherie, Viande bovine]",Leclerc,viande vin au filet île,3266110094989,False
1,0.28,kg,6.2,22.14,False,,Viandes Poissons,Boucherie,Viande bovine,Viande bovine 2x pavé ***,à griller Bio - 280g,,6.20 €,22.14 € / kg,,,https://fd4-photos.leclercdrive.fr/image.ashx?...,https://fd4-courses.leclercdrive.fr/magasin-12...,54136,"[Viandes Poissons, Boucherie, Viande bovine]",Leclerc,viande vin pavé île,3266110097003,False
2,0.16,kg,4.46,27.88,False,,Viandes Poissons,Boucherie,Viande bovine,Viande bovine Fr Bio Village,Faux filet - 160g,Bio Village,4.46 €,27.88 € / kg,,,https://fd4-photos.leclercdrive.fr/image.ashx?...,https://fd4-courses.leclercdrive.fr/magasin-12...,53023,"[Viandes Poissons, Boucherie, Viande bovine]",Leclerc,viande vin frisé la au filet,3661112098100,False
3,0.24,kg,4.99,20.79,False,,Viandes Poissons,Boucherie,Viande bovine,Viande bovine Bio vilage,Vbf - steak*** - 2x120g,,4.99 €,20.79 € / kg,,,https://fd4-photos.leclercdrive.fr/image.ashx?...,https://fd4-courses.leclercdrive.fr/magasin-12...,53022,"[Viandes Poissons, Boucherie, Viande bovine]",Leclerc,viande vin la steak,3661112098117,False
4,0.26,kg,5.4,20.77,False,,Viandes Poissons,Boucherie,Viande bovine,Viande bovine Bio 2x steak**,A griller - 260g,,5.40 €,20.77 € / kg,,France,https://fd4-photos.leclercdrive.fr/image.ashx?...,https://fd4-courses.leclercdrive.fr/magasin-12...,16504,"[Viandes Poissons, Boucherie, Viande bovine]",Leclerc,viande vin steak sarasin île,3266110093012,False


In [13]:
data["description"] = data.description1 + ' ' + data.description2

data.ean.fillna(0, inplace=True)
data.ean = data.ean.astype(int)
data = data.drop_duplicates(subset='ean', keep='first')


In [14]:
export_csv = data.to_csv (r'/Users/vincentsalamand/Downloads/leclerc_catalog.csv', index = None, header=True)


In [15]:
removed_items = db_store_items[~db_store_items.store_product_id.isin(data.product_id)]
len(removed_items)
#new_items = data[~data.product_id.isin(db_store_items.store_product_id)]
#len(new_items)
remaining_items = data[data.product_id.isin(db_store_items.store_product_id)]
len(remaining_items)
#df1[df1.product_id == 104789]

5064

In [11]:
db_foods = pd.read_csv('/Users/vincentsalamand/Downloads/food_export.csv')

food_vocab = [food.split() for food in db_foods.name]
food_vocab_uniq = set([item for sublist in food_vocab for item in sublist])
food_vocab_uniq

for index, product in data.iterrows():
    # get clean name only if not already there
    if pd.isna(data.iloc[index].clean_name):
        description = ''.join(product.description)    
        clean_description = []
        for word in description.lower().split():
            ratio = process.extractOne(word, food_vocab_uniq)
            if ratio[1] > 88:
                clean_description.append(ratio[0])
        data.loc[index,'clean_name']  = ' '.join(clean_description)










