In [1]:
import pandas as pd
import numpy as np
import spacy
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from textblob import TextBlob
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

In [1222]:
# Read the three input tables; each row count is printed right after its read.
product_data = pd.read_csv(
    '/Users/vincentsalamand/Downloads/products_leclerc.csv'
)
print(len(product_data))

foods = pd.read_csv(
    '/Users/vincentsalamand/Downloads/food_export.csv'
)
print(len(foods))

food_categories = pd.read_csv(
    '/Users/vincentsalamand/Downloads/category_export.csv'
)
print(len(food_categories))

# Distinct parent shelves followed by distinct child shelves, in one flat
# array. A value present in both columns appears twice.
parent_shelves = product_data['shelter_parent'].unique()
child_shelves = product_data['shelter_child'].unique()
product_categories = np.append(parent_shelves, child_shelves)
print(len(product_categories))

8173
1096
47
318


In [1160]:
# Canonical unit -> every spelling / abbreviation accepted for that unit.
# The correctly spelled "milligramme" and "millilitre" are listed next to the
# historical single-"l" spellings so both forms are recognised.
measurements = {
    "g": ["gr.", "gr", "g", "g.", "gramme"],
    "kg": ["kg.", "kg", "kilogramme"],
    "mg": ["mg.", "mg", "miligramme", "milligramme"],
    "ml": ["ml.", "ml", "mililitre", "millilitre"],
    "cl": ["cl.", "cl", "centilitre"],
    "l": ["l.", "l", "litre"],
    "cuillère à soupe": ["c. à soupe", "cuillère à soupe", "cs", "cas", "càs", "c. à table", "cuillère à table"],
    "cuillère à café": ["c. à café", "cuillère à café", "cc", "cac", "càc", "c. à thé", "cuillère à thé"],
    "tasse": ["tasse"],
    "bol": ["bol"],
    "verre": ["verre"],
    "filet": ["filet"],
    "zeste": ["zeste"],
    "pièce": ["pièce"],
    "noisette": ["noisette"],
    "noix": ["noix"],
    "pincée": ["pincée"],
    "pointe": ["pointe"],
    "poignée": ["poignée"],
    "feuille": ["feuille"],
    "branche": ["branche"],
    "gousse": ["gousse"],
    "tranche": ["tranche"],
    "cube": ["cube"],
    "boîte": ["boîte"],
    "barquette": ["barquette"],
    "pot": ["pot"],
    "bâtonnet": ["bâtonnet"],
    "boule": ["boule"],
    "rouleau": ["rouleau"],
    "p": ["p"],
}

# Flat list of every accepted spelling, in dict order.
units = [alias for aliases in measurements.values() for alias in aliases]


In [1161]:
# Textual quantity -> numeric value. Thirds are approximated as 0.33.
quantities = {
    # Fractions, as vulgar-fraction glyphs and as ASCII.
    "½": 0.5, "1/2": 0.5,
    "1½": 1.5, "1 1/2": 1.5,  # was 0.5, which contradicted "1½"
    "⅓": 0.33, "1/3": 0.33,
    "¼": 0.25, "1/4": 0.25,   # "¼" used to be listed twice; "1/4" was missing
    "⅕": 0.2, "1/5": 0.2,
    "un demi": 0.5, "une demi": 0.5, "un et demi": 1.5,
    "tiers": 0.33, "quart": 0.25,
    # Number words.
    "zero": 0, "deux": 2, "trois": 3, "quatre": 4, "cinq": 5, "six": 6, "sept": 7,
    "huit": 8, "neuf": 9, "dix": 10, "onze": 11, "douze": 12, "treize": 13,
    "quatorze": 14, "quinze": 15, "seize": 16,
    "dix-sept": 17, "dix-huit": 18, "dix-neuf": 19,
    "vingt": 20, "trente": 30, "quarante": 40, "cinquante": 50,
    "soixante": 60, "soixante-dix": 70, "quatre-vingt": 80, "quatre-vingt-dix": 90,
    # Collective nouns, with and without a leading "une".
    "dizaine": 10, "une dizaine": 10,
    "douzaine": 12, "une douzaine": 12,
    "demi-douzaine": 6, "une demi-douzaine": 6,
    "vingtaine": 20, "une vingtaine": 20,
    "trentaine": 30, "quarantaine": 40, "cinquantaine": 50,
    "centaine": 100, "une centaine": 100, "cent": 100,
    "un": 1, "une": 1,
}

In [1162]:
# One list entry per character: ASCII letters in both cases, a handful of
# accented vowels, and the trailing bullet symbol split into its characters.
char_list = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZéèàâêïù▪️")


In [1163]:
# Columns that later cells compute. They are declared on an empty frame (no
# rows yet), which places them to the left of the raw product columns once
# the two frames are concatenated.
derived_columns = [
    'quantity_match',
    'unit_match',
    # 'food_match',
    'price_match',
    'price_per_unit_match',
    'is_promo',
    'promo_price_per_unit_match',
    'clean_description',
]
data = pd.DataFrame()
for column_name in derived_columns:
    data[column_name] = ""

# Append the raw product columns on the right.
data = pd.concat([data, product_data], axis=1)

# Columns added after the concat hold an empty string on every row.
for column_name in ('shelter', 'ean', 'is_frozen', 'is_available'):
    data[column_name] = ""

# Show up to 100 characters per cell when frames are displayed.
pd.set_option('display.max_colwidth', 100)

In [1218]:
# Numeric price: the first whitespace-separated token of the `price` text.
data['price_match'] = data['price'].apply(lambda text: float(text.split()[0]))

# Numeric price per unit: same rule applied to `price_per_unit`.
data['price_per_unit_match'] = data['price_per_unit'].apply(lambda text: float(text.split()[0]))

# Quantity is the price divided by the price per unit, rounded to 3 decimals.
data['quantity_match'] = round(data['price_match'] / data['price_per_unit_match'], 3)

# Promo price: strip everything except digits and dots, then parse.
# The pattern is a raw string so that `\d` is a regex class rather than an
# invalid string escape.
data['promo_price_per_unit_match'] = data['promo_price'].apply(
    lambda text: float(re.sub(r"[^\d.]+", "", text))
)

In [1175]:
# A product is flagged as promo whenever its promo price text differs from its
# regular price text (computed column-wise instead of row by row).
data['is_promo'] = data['promo_price'] != data['price']

# Spell out the "p" unit abbreviation.
# NOTE(review): `unit_match` is filled by the measurement-matching cell that
# appears later in this file, so in top-to-bottom order nothing equals "p" yet
# and this line changes nothing -- confirm the intended cell order.
data.loc[data['unit_match'] == "p", 'unit_match'] = "pièce"

# One [main, parent, child] list per row.
data['shelter'] = pd.Series(
    [
        [main, parent, child]
        for main, parent, child in zip(
            data['shelter_main'], data['shelter_parent'], data['shelter_child']
        )
    ],
    index=data.index,
)

In [1166]:
# French language object built directly from the `French` class (no model is
# loaded here). Later cells use its vocab, its tokenizer and `nlp.pipe`.
nlp = French()
# Disabled earlier experiments, kept for reference:
#products = list(nlp.pipe(data.description))
#price_per_unit = list(nlp.pipe(data.price_per_unit))

In [1167]:
def convert_unit(value):
    """Return the canonical unit whose alias list contains ``value``.

    ``value`` is looked up in the module-level ``measurements`` mapping
    (canonical unit -> list of accepted spellings). The first unit, in dict
    order, that lists ``value`` is returned.

    Returns ``None`` when no unit lists ``value``.
    """
    for unit, aliases in measurements.items():
        if value in aliases:
            return unit
    return None

In [1168]:
# Phrase matcher holding every accepted unit spelling, compared on the
# lower-cased token text.
measurement_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
measurement_patterns = list(nlp.tokenizer.pipe(list(units)))
# NOTE(review): add(key, on_match, *patterns) is the older spaCy call form;
# newer releases expect add(key, patterns) -- confirm the installed version.
measurement_matcher.add("MEASUREMENT_PATTERN", None, *measurement_patterns)
# Tokenizer built from the vocab only; used on the price-per-unit strings.
tokenizer = Tokenizer(nlp.vocab)

# Marks "the matcher found nothing", as opposed to convert_unit() returning None.
_NO_MATCH = object()


def _unit_from_doc(doc):
    """Return the canonical unit for the first matcher hit in ``doc``.

    Returns ``_NO_MATCH`` when the matcher finds nothing; otherwise returns
    whatever ``convert_unit`` gives for the lower-cased text of the first hit.
    """
    matches = measurement_matcher(doc)
    if not matches:
        return _NO_MATCH
    _, start, end = matches[0]
    return convert_unit(doc[start:end].text.lower())


measurement_parser = []
price_per_unit = list(nlp.pipe(data.price_per_unit))

for line in price_per_unit:
    # Digits are blanked out so that only the unit wording is left to match.
    digitless_doc = tokenizer(re.sub('[0-9]', ' ', line.text))

    # Attempt 1: the text as written.
    unit = _unit_from_doc(digitless_doc)
    if unit is _NO_MATCH:
        text_blob_object = TextBlob(digitless_doc.text)
        # Attempt 2: every word singularized.
        singular_line = ' '.join(text_blob_object.words.singularize())
        unit = _unit_from_doc(tokenizer(singular_line))
        if unit is _NO_MATCH:
            # Attempt 3: every word pluralized.
            plural_line = ' '.join(text_blob_object.words.pluralize())
            unit = _unit_from_doc(tokenizer(plural_line))

    # No attempt matched: record an empty string for this row.
    measurement_parser.append("" if unit is _NO_MATCH else unit)

data['unit_match'] = measurement_parser

In [1181]:
# Every individual word that appears in any food name.
food_vocab = [food.split() for food in foods.name]
food_vocab_uniq = set([item for sublist in food_vocab for item in sublist])

# A description word is kept only when its best fuzzy score is strictly above this.
FOOD_MATCH_THRESHOLD = 87

# word -> result of process.extractOne(word, food_vocab_uniq). The same word
# recurs across many descriptions, so each distinct word is scored only once.
_best_match_by_word = {}

clean_descriptions = []
for description in data['description']:
    kept_words = []
    for word in description.lower().split():
        if word not in _best_match_by_word:
            _best_match_by_word[word] = process.extractOne(word, food_vocab_uniq)
        best = _best_match_by_word[word]
        # extractOne returns None when there is nothing to compare against.
        if best is not None and best[1] > FOOD_MATCH_THRESHOLD:
            # Keep the vocabulary word, not the original description word.
            kept_words.append(best[0])
    clean_descriptions.append(' '.join(kept_words))

# Single column assignment instead of one .loc write per row.
data['clean_description'] = clean_descriptions












































































































































































In [1093]:
from selenium import webdriver
from bs4 import BeautifulSoup, CData
import html

driver = webdriver.Chrome("/Applications/chromedriver")

# Visit the page of every product that has no EAN yet and copy the EAN and the
# frozen flag out of the page text. The browser is always closed afterwards,
# even if a page load or parse raises.
try:
    for index, item in data.iterrows():
        if item.ean != '':
            continue  # this row already has an EAN

        driver.get(item.url)
        soup = BeautifulSoup(driver.page_source)
        page = html.unescape(soup.text)

        # Only the text between the two markers is scanned.
        start = page.find('objProduit')
        if start == -1:
            # Marker absent: previously the slice silently started at -3.
            continue
        # If 'objContenu' is absent, find() gives -1 and the slice ends three
        # characters before the end of the page, as it did before.
        page = page[max(start - 2, 0):(page.find('objContenu') - 2)]

        for line in page.split(","):
            # The fixed offsets drop the leading characters of each fragment
            # (presumably the key and its separator -- verify against a page).
            if "fSurgele" in line:
                data.loc[index, 'is_frozen'] = line[11:]
            if "sCodeEAN" in line:
                data.loc[index, 'ean'] = line[12:-1]
finally:
    driver.quit()


In [1221]:
# Write the frame to CSV with a header row and without the row index.
# to_csv returns None when given a path, so `export_csv` is always None.
export_csv = data.to_csv(r'/Users/vincentsalamand/Downloads/leclerc_catalog.csv', index=False, header=True)
