In [140]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import spacy
from spacy.lang.fr import French
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from textblob import TextBlob
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

In [141]:
measurements = {"g": ["gr.", "gr", "g", "g.", "grams", "gramme"], "kg": ["kg.", "kg", "kilogramme"], "mg": ["mg.", "mg", "miligramme"],
                "ml": ["ml.", "ml", "milliliters", "mililitre"], "cl": ["cl.", "cl", "centilitre"], "l": ["l.", "l", "litre"],
                "cuillère à soupe": ["c. à soupe", "cuillère à soupe", "cs", "cas", "càs", "c. à table", "cuillère à table"],
                "cuillère à café": ["c. à café", "cuillère à café", "cc", "cac", "càc", "c. à thé", "cuillère à thé"],
                "tasse": ["tasse"], "bol": ["bol"], "verre": ["verre"], "filet": ["filet"], "zeste": ["zeste"], "pièce": ["pièce", "unité", "ampollas", "set"],
                "noisette": ["noisette"], "noix": ["noix"], "pincée": ["pincée"], "pointe":["pointe"], "poignée": ["poignée"],
                "feuille": ["feuille"], "branche": ["branche"], "gousse": ["gousse"], "tranche": ["tranche"], "cube": ["cube"],
                "boîte": ["boîte"], "barquette": ["barquette"], "pot": ["pot"], "bâtonnet": ["bâtonnet"], "boule": ["boule"],
                "rouleau": ["rouleau"], "p": ["p"]}

units = [item for sublist in measurements.values() for item in sublist]

quantities = { "½": 0.5, "1/2": 0.5, "1½": 1.5, "1 1/2": 0.5, "⅓": 0.33, "1/3": 0.33, "¼": 0.25, "1/5": 0.2, "¼": 0.25, "un demi": 0.5, "une demi": 0.5,"un et demi": 1.5,
              "tiers": 0.33, "quart": 0.25, "zero": 0, "deux": 2, "trois": 3, "quatre": 4,"cinq": 5, "six": 6, "sept": 7,
              "huit": 8, "neuf": 9, "dix": 10, "onze": 11, "douze": 12, "treize": 13, "quatorze": 14, "quinze": 15, "seize": 16,
              "dix-sept": 17, "dix-huit": 18, "dix-neuf": 19, "vingt": 20, "trente": 30, "quarante": 40, "cinquante": 50,
              "soixante": 60, "soixante-dix": 70, "quatre-vingt": 80, "quatre-vingt-dix": 90, "dizaine": 10, "une dizaine": 10,
              "douzaine": 12, "une douzaine": 12, "demi-douzaine": 6, "une demi-douzaine": 6, "vingtaine": 20, "une vingtaine": 20,
              "trentaine": 30, "quarantaine": 40, "cinquantaine": 50, "centaine": 100, "une centaine": 100, "cent": 100,
              "un": 1, "une": 1}

char_list = [i for i in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZéèàâêïù▪️"]


In [142]:
web_product_data = pd.read_csv ('/Users/vincentsalamand/Downloads/products_amazonprimenow.csv')
print(len(web_product_data))

4662


In [143]:
data = pd.DataFrame()
data['quantity_match'] = ""
data['unit_match'] = ""
data['price_match'] = ""
data['price_per_unit_match'] = ""
data = pd.concat([data, web_product_data], axis=1)
data['shelter'] = ""



In [144]:
# clean prices
data['price_match'] = data.price.apply(lambda x: float(x[:-1].replace(',','.')))
data['price_per_unit_match'] = data.price_per_unit.apply(lambda x: float(x[1:-1].replace(',', '.').split()[0]))
data.quantity_match = round(data.price_match / data.price_per_unit_match, 3)

data['price_per_unit'] = data.price_per_unit.apply(lambda x: x.replace('/', ' ').lower())


In [145]:
# match description to find unit
nlp = French()

def convert_unit(value):
    results = []
    result = [unit for unit, item in measurements.items() if value in item]
    if len(result) > 0:
        return result[0]

measurement_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
measurement_patterns = list(nlp.tokenizer.pipe(list(units)))
measurement_matcher.add("MEASUREMENT_PATTERN", None, *measurement_patterns)
tokenizer = Tokenizer(nlp.vocab)

measurement_parser = []
price_per_unit = list(nlp.pipe(data.price_per_unit))

for line in price_per_unit:
    line = tokenizer(re.sub('[0-9]', ' ', line.text))
    matches = measurement_matcher(line)
    elements = []
    if len(matches) > 0:
        for match_id, start, end in matches:
            span = line[start:end]
            elements.append(span.text.lower())
        measurement_parser.append(convert_unit(elements[0]))
    else:
        text_blob_object = TextBlob(line.text)
        singular_line = ' '.join(text_blob_object.words.singularize())
        matches = measurement_matcher(tokenizer(singular_line))
        if len(matches) > 0:
            for match_id, start, end in matches:
                span = tokenizer(singular_line)[start:end]
                elements.append(span.text.lower())
            measurement_parser.append(convert_unit(elements[0]))
        else:
            plural_line = ' '.join(text_blob_object.words.pluralize())
            matches = measurement_matcher(tokenizer(plural_line))
            if len(matches) > 0:
                for match_id, start, end in matches:
                    span = tokenizer(plural_line)[start:end]
                    elements.append(span.text.lower())
                measurement_parser.append(convert_unit(elements[0]))
            else:
                 measurement_parser.append("")

data['unit_match'] = measurement_parser

In [146]:

# get shelter list

for index, product in data.iterrows():       
    shelter = []
    shelter.append(data.loc[index].shelter_parent)    
    shelter.append(data.loc[index].shelter_main)
    data.at[index,'shelter'] = shelter


In [148]:
export_csv = data.to_csv (r'/Users/vincentsalamand/Downloads/amazonprimenow_catalog.csv', index = None, header=True)


In [147]:
data

Unnamed: 0,quantity_match,unit_match,price_match,price_per_unit_match,shelter_main,shelter_parent,brand,description1,description,price,price_per_unit,promo_price,store,image_url,url,ean,shelter
0,1.970,g,3.29,1.67,Légumes frais,Fruits et légumes,NATURALIA,courgette bio - lot de 4,NATURALIA Courgette Bio - Lot de 4,"3,29 €","(1,67 € 100 g)","0,85 €",Naturalia,https://m.media-amazon.com/images/I/61Q0wN4Twx...,https://primenow.amazon.fr/dp/B07XVCXSG3?qid=1...,B07XVCXSG3,"[Fruits et légumes, Légumes frais]"
1,1.192,g,1.99,1.67,Légumes frais,Fruits et légumes,NATURALIA,poireau bio - lot de 3,NATURALIA Poireau Bio - Lot de 3,"1,99 €","(1,67 € 100 g)","0,85 €",Naturalia,https://m.media-amazon.com/images/I/51teqWRb45...,https://primenow.amazon.fr/dp/B07XVJQDP5?qid=1...,B07XVJQDP5,"[Fruits et légumes, Légumes frais]"
2,1.731,g,2.89,1.67,Légumes frais,Fruits et légumes,NATURALIA,concombre 4/5 - pièce,NATURALIA Concombre 4/5 - Pièce,"2,89 €","(1,67 € 100 g)","0,85 €",Naturalia,https://m.media-amazon.com/images/I/51G6dgcdco...,https://primenow.amazon.fr/dp/B07WKH6PLP?qid=1...,B07WKH6PLP,"[Fruits et légumes, Légumes frais]"
3,0.500,kg,2.35,4.70,Légumes frais,Fruits et légumes,,oignon jaune 500g bio - le filet de 500g,Oignon jaune 500G Bio - Le filet de 500g,"2,35 €","(4,70 € kg)","0,85 €",Naturalia,https://m.media-amazon.com/images/I/31F7l5fdOv...,https://primenow.amazon.fr/dp/B07VPNG57F?qid=1...,B07VPNG57F,"[Fruits et légumes, Légumes frais]"
4,0.743,kg,3.49,4.70,Légumes frais,Fruits et légumes,NATURALIA,aubergine bio - lot de 2,NATURALIA Aubergine Bio - Lot de 2,"3,49 €","(4,70 € kg)","0,85 €",Naturalia,https://m.media-amazon.com/images/I/51zT6WYja0...,https://primenow.amazon.fr/dp/B07XSP1YNZ?qid=1...,B07XSP1YNZ,"[Fruits et légumes, Légumes frais]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657,142.714,,9.99,0.07,Soin du corps,Hygiène et soin,,le petit marseillais anti-cellulite - 150ml,LE PETIT MARSEILLAIS Anti-Cellulite - 150ml,"9,99 €","(0,07 € milliliters)","5,39 €",Monoprix,https://m.media-amazon.com/images/I/81V9Kv0wZx...,https://primenow.amazon.fr/dp/B00X9D8X9A?qid=1...,B00X9D8X9A,"[Hygiène et soin, Soin du corps]"
4658,0.601,ml,1.49,2.48,Soin du corps,Hygiène et soin,,monoprix dissolvant express sans acétone extra...,Monoprix Dissolvant express sans acétone extra...,"1,49 €","(2,48 € 100 ml)","5,39 €",Monoprix,https://m.media-amazon.com/images/I/31IUbAG06X...,https://primenow.amazon.fr/dp/B017XN3JAE?qid=1...,B017XN3JAE,"[Hygiène et soin, Soin du corps]"
4659,2.303,g,7.99,3.47,Soin du corps,Hygiène et soin,,nivea sun lait après-soleil bronze prolongateu...,Nivea Sun Lait après-soleil Bronze Prolongateu...,"7,99 €","(3,47 € 100 g)","5,39 €",Monoprix,https://m.media-amazon.com/images/I/71mWUzYyOo...,https://primenow.amazon.fr/dp/B01CXA4ZXM?qid=1...,B01CXA4ZXM,"[Hygiène et soin, Soin du corps]"
4660,1.996,ml,4.59,2.30,Soin du corps,Hygiène et soin,Monoprix Bio,lait corps nourrissant peaux sèches à très sèc...,Monoprix Bio Lait corps nourrissant peaux sèch...,"4,59 €","(2,30 € 100 ml)","5,39 €",Monoprix,https://m.media-amazon.com/images/I/316AOE4PGC...,https://primenow.amazon.fr/dp/B07DX8BD87?qid=1...,B07DX8BD87,"[Hygiène et soin, Soin du corps]"
