In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import spacy
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from textblob import TextBlob
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re



In [2]:
# Canonical unit -> every spelling/abbreviation that should be recognised as that unit.
# The correct French spellings "milligramme" / "millilitre" are listed next to the
# single-"l" variants that were already accepted.
measurements = {"g": ["gr.", "gr", "g", "g.", "gramme"], "kg": ["kg.", "kg", "kilogramme"], "mg": ["mg.", "mg", "miligramme", "milligramme"],
                "ml": ["ml.", "ml", "mililitre", "millilitre"], "cl": ["cl.", "cl", "centilitre"], "l": ["l.", "l", "litre"],
                "cuillère à soupe": ["c. à soupe", "cuillère à soupe", "cs", "cas", "càs", "c. à table", "cuillère à table"],
                "cuillère à café": ["c. à café", "cuillère à café", "cc", "cac", "càc", "c. à thé", "cuillère à thé"],
                "tasse": ["tasse"], "bol": ["bol"], "verre": ["verre"], "filet": ["filet"], "zeste": ["zeste"], "pièce": ["pièce"],
                "noisette": ["noisette"], "noix": ["noix"], "pincée": ["pincée"], "pointe":["pointe"], "poignée": ["poignée"],
                "feuille": ["feuille"], "branche": ["branche"], "gousse": ["gousse"], "tranche": ["tranche"], "cube": ["cube"],
                "boîte": ["boîte"], "barquette": ["barquette"], "pot": ["pot"], "bâtonnet": ["bâtonnet"], "boule": ["boule"],
                "rouleau": ["rouleau"], "p": ["p"]}

# Flat list of every accepted spelling, in dictionary order.
units = [item for sublist in measurements.values() for item in sublist]

# Textual quantity -> numeric value.
# "1 1/2" is one and a half (it used to map to 0.5), and the ASCII form "1/4" is
# now present (the "¼" key used to be listed twice instead).
quantities = { "½": 0.5, "1/2": 0.5, "1½": 1.5, "1 1/2": 1.5, "⅓": 0.33, "1/3": 0.33, "¼": 0.25, "1/4": 0.25, "1/5": 0.2, "un demi": 0.5, "une demi": 0.5,"un et demi": 1.5,
              "tiers": 0.33, "quart": 0.25, "zero": 0, "zéro": 0, "deux": 2, "trois": 3, "quatre": 4,"cinq": 5, "six": 6, "sept": 7,
              "huit": 8, "neuf": 9, "dix": 10, "onze": 11, "douze": 12, "treize": 13, "quatorze": 14, "quinze": 15, "seize": 16,
              "dix-sept": 17, "dix-huit": 18, "dix-neuf": 19, "vingt": 20, "trente": 30, "quarante": 40, "cinquante": 50,
              "soixante": 60, "soixante-dix": 70, "quatre-vingt": 80, "quatre-vingt-dix": 90, "dizaine": 10, "une dizaine": 10,
              "douzaine": 12, "une douzaine": 12, "demi-douzaine": 6, "une demi-douzaine": 6, "vingtaine": 20, "une vingtaine": 20,
              "trentaine": 30, "quarantaine": 40, "cinquantaine": 50, "centaine": 100, "une centaine": 100, "cent": 100,
              "un": 1, "une": 1}

# One list entry per character of the string below.
char_list = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZéèàâêïù▪️")


In [11]:
# Load the scraped product catalogue. This read must run: every later cell uses
# `web_product_data`, and with the line commented out the notebook fails on a fresh kernel.
web_product_data = pd.read_csv('/Users/vincentsalamand/Downloads/products_carrefour.csv')
print(len(web_product_data))


15124


In [4]:
# Start from an empty frame holding the derived columns, so that they come first,
# then place the scraped catalogue columns next to them.
data = pd.DataFrame()
for derived_column in ('quantity_match', 'unit_match', 'price_match',
                       'price_per_unit_match', 'is_promo'):
    data[derived_column] = ""
data = pd.concat([data, web_product_data], axis=1)

# Columns added after the catalogue columns.
data['shelter'] = ""
data['store'] = "Carrefour"



In [5]:
# clean prices
# `price`: drop the last character, read ',' as the decimal point, convert to float.
data['price_match'] = data['price'].apply(
    lambda raw_price: float(raw_price[:-1].replace(',', '.'))
)
# `price_per_unit`: remove ',' and convert the first whitespace-separated token to float.
# NOTE(review): the comma is deleted here but turned into '.' for `price` above —
# confirm that commas in `price_per_unit` are not decimal separators.
data['price_per_unit_match'] = data['price_per_unit'].apply(
    lambda raw_unit_price: float(raw_unit_price.replace(',', '').split()[0])
)
# Quantity is the ratio price / price-per-unit, rounded to 3 decimals.
data['quantity_match'] = (data['price_match'] / data['price_per_unit_match']).round(3)

# set urls
# Both URL columns get the site root prepended.
data['image_url'] = data['image_url'].apply(lambda path: 'https://www.carrefour.fr' + path)
data['url'] = data['url'].apply(lambda path: 'https://www.carrefour.fr' + path)



In [6]:
# Find the measurement unit of each product from its `price_per_unit` text.
# `nlp` is spaCy's French language object; its vocab, tokenizer and pipe are
# what the matching code in this cell works with.
nlp = French()

def convert_unit(value, table=None):
    """Map a unit spelling to its canonical unit name.

    Parameters
    ----------
    value : str
        Spelling to look up; it must equal one of the listed spellings exactly.
    table : dict, optional
        Mapping of canonical unit -> list of accepted spellings. Defaults to the
        notebook-level `measurements` table.

    Returns
    -------
    str or None
        The first canonical unit (in table order) whose spelling list contains
        `value`, or None when no unit lists it.
    """
    if table is None:
        table = measurements
    for unit, spellings in table.items():
        if value in spellings:
            return unit
    return None

# Phrase matcher holding every accepted unit spelling, compared on the LOWER attribute.
measurement_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
measurement_patterns = list(nlp.tokenizer.pipe(list(units)))
measurement_matcher.add("MEASUREMENT_PATTERN", None, *measurement_patterns)
# Tokenizer constructed from the vocab only; used for the texts being searched.
tokenizer = Tokenizer(nlp.vocab)


def _first_unit_spelling(text):
    """Return the lower-cased text of the first unit match in `text`, or None."""
    doc = tokenizer(text)
    matches = measurement_matcher(doc)
    if not matches:
        return None
    _, start, end = matches[0]
    return doc[start:end].text.lower()


def _detect_unit(text):
    """Return the canonical unit found in `text`, or "" when none is found.

    Digits are blanked out first. The text is then searched as is, then with
    every word singularized, then with every word pluralized; the first
    attempt that yields a match wins.
    """
    digitless = re.sub('[0-9]', ' ', text)
    spelling = _first_unit_spelling(digitless)
    if spelling is None:
        words = TextBlob(digitless).words
        spelling = _first_unit_spelling(' '.join(words.singularize()))
        if spelling is None:
            spelling = _first_unit_spelling(' '.join(words.pluralize()))
    if spelling is None:
        return ""
    return convert_unit(spelling)


# One canonical unit (or "") per product, in row order.
price_per_unit = list(nlp.pipe(data.price_per_unit))
measurement_parser = [_detect_unit(line.text) for line in price_per_unit]

data['unit_match'] = measurement_parser

In [7]:
# find if product is in promo
# rename piece unit
# get shelter list
#
# Done column-wise instead of row by row with iterrows.

# True when the offer label is exactly "Promotion", False otherwise.
data['is_promo'] = data['offer'] == "Promotion"

# The abbreviation "p" is stored under the unit name "pièce".
data.loc[data['unit_match'] == "p", 'unit_match'] = "pièce"

# True for products whose main shelf is "Surgelés", False for every other row
# (previously those rows were left without a value).
data['is_frozen'] = data['shelter_main'] == "Surgelés"

# Per product, the list [shelter_main, shelter_parent, shelter_child].
data['shelter'] = pd.Series(
    data[['shelter_main', 'shelter_parent', 'shelter_child']].values.tolist(),
    index=data.index,
    dtype=object,
)

In [12]:
# Keep the first row for each `ean` value, then display how many rows remain.
data = data.drop_duplicates(subset=['ean'], keep='first')
data.shape[0]

12822

In [15]:
# Single description column: `description1` and `description2` joined by one space.
data["description"] = data["description1"] + ' ' + data["description2"]

In [16]:
# Write the catalogue to CSV with a header row.
# NOTE(review): `index=None` is presumably meant to leave the index out of the file — confirm.
export_csv = data.to_csv(
    r'/Users/vincentsalamand/Downloads/carrefour_catalog.csv',
    index=None,
    header=True,
)


In [76]:
# clean name
# Build `clean_name` for each product by keeping only the words of its name that
# closely match a word from the known food vocabulary.

db_foods = pd.read_csv('/Users/vincentsalamand/Downloads/food_export.csv')

# Every distinct word that appears in a food name (rows without a name are skipped).
food_vocab = [food.split() for food in db_foods["name"].dropna()]
food_vocab_uniq = set(item for sublist in food_vocab for item in sublist)

# Make sure the target column exists before reading from it.
if 'clean_name' not in data.columns:
    data['clean_name'] = ""

# Product names repeat the same words many times; remember the answer per word so
# the fuzzy search over the whole vocabulary runs once per distinct word.
_food_word_cache = {}


def _closest_food_word(word):
    """Return the vocabulary word best matching `word` if its score is above 88, else None."""
    if word not in _food_word_cache:
        best = process.extractOne(word, food_vocab_uniq)
        accepted = best is not None and best[1] > 88
        _food_word_cache[word] = best[0] if accepted else None
    return _food_word_cache[word]


for index, product in data.iterrows():
    # get clean name only if not already there
    # `product` is the row for this label; positional lookup with the label would
    # point at the wrong row once rows have been dropped. A missing value (NaN)
    # counts as "not there" as well as an empty string.
    existing = product["clean_name"]
    if pd.isna(existing) or not existing:
        name = product["name"]
        words = name.lower().split() if isinstance(name, str) else []
        matched = (_closest_food_word(word) for word in words)
        clean_description = [food_word for food_word in matched if food_word is not None]
        data.at[index, 'clean_name'] = ' '.join(clean_description)




















KeyboardInterrupt: 