In [154]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import spacy
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from textblob import TextBlob
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

In [155]:
# Canonical unit -> every alias that should be normalised to it.
# The misspelt long forms ("miligramme", "mililitre") are kept so existing
# matches do not change; the correct French spellings are added next to them.
measurements = {
    "g": ["gr.", "gr", "g", "g.", "grams", "gramme"],
    "kg": ["kg.", "kg", "kilogramme"],
    "mg": ["mg.", "mg", "miligramme", "milligramme"],
    "ml": ["ml.", "ml", "milliliters", "mililitre", "millilitre"],
    "cl": ["cl.", "cl", "centilitre"],
    "l": ["l.", "l", "litre"],
    "cuillère à soupe": ["c. à soupe", "cuillère à soupe", "cs", "cas", "càs", "c. à table", "cuillère à table"],
    "cuillère à café": ["c. à café", "cuillère à café", "cc", "cac", "càc", "c. à thé", "cuillère à thé"],
    "tasse": ["tasse"],
    "bol": ["bol"],
    "verre": ["verre"],
    "filet": ["filet"],
    "zeste": ["zeste"],
    "pièce": ["pièce", "unité", "ampollas", "set"],
    "noisette": ["noisette"],
    "noix": ["noix"],
    "pincée": ["pincée"],
    "pointe": ["pointe"],
    "poignée": ["poignée"],
    "feuille": ["feuille"],
    "branche": ["branche"],
    "gousse": ["gousse"],
    "tranche": ["tranche"],
    "cube": ["cube"],
    "boîte": ["boîte"],
    "barquette": ["barquette"],
    "pot": ["pot"],
    "bâtonnet": ["bâtonnet"],
    "boule": ["boule"],
    "rouleau": ["rouleau"],
    "p": ["p"],  # NOTE(review): meaning of "p" is not visible here — confirm
}

# Flat list of every alias, in dictionary order.
units = [alias for aliases in measurements.values() for alias in aliases]

# Textual quantity -> numeric value (thirds are stored as the rounded 0.33).
# Fixes: "1 1/2" was mapped to 0.5, and "¼" appeared twice where the second
# occurrence should have been the ASCII spelling "1/4".
quantities = {
    "½": 0.5, "1/2": 0.5, "1½": 1.5, "1 1/2": 1.5,
    "⅓": 0.33, "1/3": 0.33,
    "¼": 0.25, "1/5": 0.2, "1/4": 0.25,
    "un demi": 0.5, "une demi": 0.5, "un et demi": 1.5,
    "tiers": 0.33, "quart": 0.25,
    "zero": 0, "deux": 2, "trois": 3, "quatre": 4, "cinq": 5, "six": 6, "sept": 7,
    "huit": 8, "neuf": 9, "dix": 10, "onze": 11, "douze": 12, "treize": 13,
    "quatorze": 14, "quinze": 15, "seize": 16,
    "dix-sept": 17, "dix-huit": 18, "dix-neuf": 19,
    "vingt": 20, "trente": 30, "quarante": 40, "cinquante": 50,
    "soixante": 60, "soixante-dix": 70, "quatre-vingt": 80, "quatre-vingt-dix": 90,
    "dizaine": 10, "une dizaine": 10,
    "douzaine": 12, "une douzaine": 12,
    "demi-douzaine": 6, "une demi-douzaine": 6,
    "vingtaine": 20, "une vingtaine": 20,
    "trentaine": 30, "quarantaine": 40, "cinquantaine": 50,
    "centaine": 100, "une centaine": 100, "cent": 100,
    "un": 1, "une": 1,
}

# Individual characters (ASCII letters, some accented letters, a bullet glyph).
char_list = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZéèàâêïù▪️")


In [156]:
# Read the scraped product listing from disk and print how many rows it has.
web_product_data = pd.read_csv(
    '/Users/vincentsalamand/Downloads/products_amazonprimenow.csv'
)
print(web_product_data.shape[0])

3530


In [157]:
# Working frame: four empty "*_match" columns placed in front of the raw
# product columns, plus an empty `shelter` column appended at the end.
match_columns = ['quantity_match', 'unit_match', 'price_match', 'price_per_unit_match']

data = pd.DataFrame()
for column in match_columns:
    data[column] = ""

data = pd.concat([data, web_product_data], axis=1)
data['shelter'] = ""



In [158]:
# Turn the scraped price strings into numbers.
def _comma_decimal(text):
    """Parse a number written with a decimal comma."""
    return float(text.replace(',', '.'))


def _unit_price(text):
    """Drop the first and last character, then parse the first whitespace-separated token."""
    return _comma_decimal(text[1:-1].split()[0])


def _normalise_unit_label(text):
    """Replace '/' with a space and lower-case the text."""
    return text.replace('/', ' ').lower()


# `price` has its last character dropped before parsing.
data['price_match'] = data['price'].apply(lambda raw: _comma_decimal(raw[:-1]))
data['price_per_unit_match'] = data['price_per_unit'].apply(_unit_price)

# Ratio of item price to per-unit price, rounded to 3 decimals.
data['quantity_match'] = (data['price_match'] / data['price_per_unit_match']).round(3)

data['price_per_unit'] = data['price_per_unit'].apply(_normalise_unit_label)


In [159]:
# Find the unit mentioned in each product's price-per-unit text.
# `nlp` supplies the vocab and tokenizer used by the matcher code below.
nlp = French()

def convert_unit(value, table=None):
    """Return the canonical unit whose alias list contains ``value``.

    Args:
        value: Alias to look up (e.g. ``"gr."``); compared as-is, so callers
            are expected to lower-case it first.
        table: Optional mapping of canonical unit -> list of aliases. Defaults
            to the module-level ``measurements`` table.

    Returns:
        The first canonical unit (in mapping order) that lists ``value`` as an
        alias, or ``None`` when no unit does.
    """
    if table is None:
        table = measurements
    for canonical, aliases in table.items():
        if value in aliases:
            return canonical
    return None

# Case-insensitive phrase matcher holding one pattern per unit alias.
measurement_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
measurement_patterns = list(nlp.tokenizer.pipe(list(units)))
# NOTE(review): the positional `None` is the spaCy v2 `on_match` slot; spaCy v3
# expects `add(key, [patterns])` — confirm against the installed version.
measurement_matcher.add("MEASUREMENT_PATTERN", None, *measurement_patterns)
# Bare tokenizer built from the vocab only (no prefix/suffix/infix rules given).
tokenizer = Tokenizer(nlp.vocab)


def _first_matched_alias(doc):
    """Return the lower-cased text of the first unit match in ``doc``, or None."""
    matches = measurement_matcher(doc)
    if not matches:
        return None
    _, start, end = matches[0]
    return doc[start:end].text.lower()


measurement_parser = []
price_per_unit = list(nlp.pipe(data.price_per_unit))

for line in price_per_unit:
    # Digits are blanked out so that only the unit words remain to be matched.
    digitless = tokenizer(re.sub('[0-9]', ' ', line.text))
    alias = _first_matched_alias(digitless)

    if alias is None:
        # Fall back to TextBlob's singular form, then its plural form; the
        # plural is only computed when the singular form did not match.
        words = TextBlob(digitless.text).words
        for inflect in (words.singularize, words.pluralize):
            alias = _first_matched_alias(tokenizer(' '.join(inflect())))
            if alias is not None:
                break

    # No match in any form -> empty string; otherwise the canonical unit
    # (convert_unit may still yield None if the matched text is not a listed alias).
    measurement_parser.append("" if alias is None else convert_unit(alias))

data['unit_match'] = measurement_parser

In [160]:

# Build the `shelter` column: one [shelter_parent, shelter_main] list per product.
# The two columns are paired positionally instead of looping with `iterrows`
# and two `.loc` row lookups per product, so this is a single pass and does
# not depend on index labels being unique. Wrapping the result in a Series
# keeps each cell a Python list (object dtype).
data['shelter'] = pd.Series(
    [[parent, main] for parent, main in zip(data['shelter_parent'], data['shelter_main'])],
    index=data.index,
)


In [163]:
# Write the enriched catalogue to disk, with a header row and no index column.
export_csv = data.to_csv(
    r'/Users/vincentsalamand/Downloads/amazonprimenow_catalog.csv',
    index=None,
    header=True,
)


In [164]:
# Show the products whose detected unit is grams.
data.loc[data['unit_match'] == "g"]

Unnamed: 0,quantity_match,unit_match,price_match,price_per_unit_match,shelter_main,shelter_parent,brand,description1,description,price,price_per_unit,promo_price,store,image_url,url,ean,shelter
15,2.475,g,0.99,0.40,Légumes frais,Fruits et légumes,,naturalia panais vrac bio - 250 gr,NATURALIA Panais vrac Bio - 250 Gr,"0,99 €","(0,40 € 100 g)","5,39 €",Naturalia,https://m.media-amazon.com/images/I/51-7U+GECq...,https://primenow.amazon.fr/dp/B07XVD4HQ1?qid=1...,B07XVD4HQ1,"[Fruits et légumes, Légumes frais]"
22,1.992,g,2.49,1.25,Légumes frais,Fruits et légumes,,naturalia champignons blonds bio - barquette 2...,NATURALIA Champignons blonds Bio - Barquette 2...,"2,49 €","(1,25 € 100 g)","5,39 €",Naturalia,https://m.media-amazon.com/images/I/613L9TtQRU...,https://primenow.amazon.fr/dp/B07WJCGBMQ?qid=1...,B07WJCGBMQ,"[Fruits et légumes, Légumes frais]"
25,2.475,g,0.99,0.40,Légumes frais,Fruits et légumes,,naturalia navet - 250g,Naturalia Navet - 250g,"0,99 €","(0,40 € 100 g)","5,39 €",Naturalia,https://m.media-amazon.com/images/I/61JVdWoRBZ...,https://primenow.amazon.fr/dp/B07WJCGRSN?qid=1...,B07WJCGRSN,"[Fruits et légumes, Légumes frais]"
28,1.498,g,3.19,2.13,Légumes frais,Fruits et légumes,,ail 3 têtes cal 40/60 bio - le lot de 3 têtes,Ail 3 têtes Cal 40/60 Bio - Le lot de 3 têtes,"3,19 €","(2,13 € 100 g)","5,39 €",Naturalia,https://m.media-amazon.com/images/I/41FQXffsq5...,https://primenow.amazon.fr/dp/B07VMKYS89?qid=1...,B07VMKYS89,"[Fruits et légumes, Légumes frais]"
29,2.488,g,1.99,0.80,Légumes frais,Fruits et légumes,,naturalia echalotes 250g 20/40 - le filet,NATURALIA Echalotes 250G 20/40 - Le filet,"1,99 €","(0,80 € 100 g)","5,39 €",Naturalia,https://m.media-amazon.com/images/I/317DjsTqsZ...,https://primenow.amazon.fr/dp/B07WQL2T5R?qid=1...,B07WQL2T5R,"[Fruits et légumes, Légumes frais]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3140,1.991,g,2.19,1.10,Traiteur apéritif et tartinables,Apéritif,,blini tzatziki - le pot de 200g,Blini Tzatziki - Le pot de 200g,"2,19 €","(1,10 € 100 g)","8,45 €",Monoprix,https://m.media-amazon.com/images/I/41Yn7L8vDY...,https://primenow.amazon.fr/dp/B0756CPK2Q?qid=1...,B0756CPK2Q,"[Apéritif, Traiteur apéritif et tartinables]"
3141,2.493,g,3.49,1.40,Traiteur apéritif et tartinables,Apéritif,,yarden houmous extra avec pignons à l'huile d'...,Yarden Houmous extra avec pignons à l'huile d'...,"3,49 €","(1,40 € 100 g)","8,45 €",Monoprix,https://m.media-amazon.com/images/I/71Bo01dhg0...,https://primenow.amazon.fr/dp/B07DX91CX3?qid=1...,B07DX91CX3,"[Apéritif, Traiteur apéritif et tartinables]"
3142,1.201,g,4.49,3.74,Traiteur apéritif et tartinables,Apéritif,,monoprix gourmet mini involtini ricotta et spe...,Monoprix Gourmet Mini Involtini ricotta et Spe...,"4,49 €","(3,74 € 100 g)","8,45 €",Monoprix,https://m.media-amazon.com/images/I/41zZoMYhE8...,https://primenow.amazon.fr/dp/B07DX939XL?qid=1...,B07DX939XL,"[Apéritif, Traiteur apéritif et tartinables]"
3145,0.899,g,2.49,2.77,Traiteur apéritif et tartinables,Apéritif,,"monoprix gourmet artichonade, préparation à ba...","Monoprix Gourmet Artichonade, préparation à ba...","2,49 €","(2,77 € 100 g)","8,45 €",Monoprix,https://m.media-amazon.com/images/I/41jP2opLX1...,https://primenow.amazon.fr/dp/B07DJ7QZHB?qid=1...,B07DJ7QZHB,"[Apéritif, Traiteur apéritif et tartinables]"
