In [5]:
import requests
import pandas as pd
from io import BytesIO
from zipfile import ZipFile

def download_and_extract_csv(zip_url, timeout=60):
    """Download a ZIP archive and load the CSV file it contains.

    Parameters
    ----------
    zip_url : str
        URL of the ZIP archive.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. It bounds the wait for the connection
        and for each chunk of the response, not the total download time.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the first ``.csv`` member of the archive, or
        of the first file of any kind when no member has a ``.csv`` name.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the archive contains no files at all.
    """
    # Without a timeout a stalled connection blocks the notebook forever.
    response = requests.get(zip_url, timeout=timeout)
    response.raise_for_status()  # Fail loudly if the download failed

    with ZipFile(BytesIO(response.content)) as archive:
        # The first entry of an archive may be a directory ("data/"), which
        # cannot be parsed as a table, so directory entries are ignored.
        file_members = [name for name in archive.namelist() if not name.endswith('/')]
        if not file_members:
            raise ValueError(f"No files found in ZIP archive downloaded from {zip_url}")

        csv_members = [name for name in file_members if name.lower().endswith('.csv')]
        member = csv_members[0] if csv_members else file_members[0]

        with archive.open(member) as csv_file:
            # NOTE(review): the recorded notebook output shows a warning raised
            # on this read_csv call (presumably a mixed-dtype warning - confirm).
            # low_memory=False makes pandas infer each column's dtype from the
            # whole file instead of chunk by chunk.
            df = pd.read_csv(csv_file, low_memory=False)

    return df

def download_csv(csv_url, timeout=60):
    """Download a CSV file and parse it, trying ',' first and ';' second.

    Parameters
    ----------
    csv_url : str
        URL of the CSV file.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. It bounds the wait for the connection
        and for each chunk of the response, not the total download time.

    Returns
    -------
    pandas.DataFrame
        The parsed table. When the ';' fallback is used, malformed lines are
        skipped (``on_bad_lines='skip'``).

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Without a timeout a stalled connection blocks the notebook forever.
    response = requests.get(csv_url, timeout=timeout)
    response.raise_for_status()
    raw_bytes = response.content

    # Adjust 'sep' or add other read_csv parameters here if necessary.
    try:
        df = pd.read_csv(BytesIO(raw_bytes), sep=',')
    except pd.errors.ParserError:
        return pd.read_csv(BytesIO(raw_bytes), sep=';', on_bad_lines='skip')

    # A ';'-delimited file whose lines contain no commas parses without error
    # as a single column, so the except branch above never sees it. Detect
    # that case from the header and re-parse with the ';' separator.
    if df.shape[1] == 1 and ';' in str(df.columns[0]):
        df = pd.read_csv(BytesIO(raw_bytes), sep=';', on_bad_lines='skip')

    return df

In [3]:
# Download the zipped, preprocessed recipes table (pp_recipes.zip) into a DataFrame.
recipes_df = download_and_extract_csv("https://gitlab.com/felix134/connected-recipe-data-set/-/raw/master/data/hummus_data/preprocessed/pp_recipes.zip")

  df = pd.read_csv(thefile)


In [6]:
# Download the CSEL ingredient table (cfp_wfp_ingredients.csv) into a DataFrame.
ingredients_df = download_csv("https://raw.githubusercontent.com/swapUniba/FoodPrintDB-Database-Completion/main/SuEatableLife%20Database/CSEL_dataset/cfp_wfp_ingredients.csv")

In [7]:
def get_recipes_index(recipes, recipes_df):
    """Return the position of each requested title in ``recipes_df['title']``.

    Parameters
    ----------
    recipes : iterable
        Recipe titles to look up.
    recipes_df : pandas.DataFrame
        Frame with a ``'title'`` column.

    Returns
    -------
    list of int
        For every title in ``recipes``, the 0-based position (not the index
        label) of its first occurrence in the ``'title'`` column.

    Raises
    ------
    ValueError
        If a requested title does not occur in the column.
    """
    # Build the title -> first position map once, instead of scanning the
    # whole title list again for every requested recipe.
    first_position = {}
    for position, title in enumerate(recipes_df['title'].tolist()):
        first_position.setdefault(title, position)

    index_list = []
    for title in recipes:
        if title not in first_position:
            # Same exception type that list.index() raises for a missing value.
            raise ValueError(f"{title!r} is not in list")
        index_list.append(first_position[title])

    return index_list

In [8]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.tag import pos_tag

# Make sure the NLTK data packages needed for tokenising, stop-word
# filtering and POS tagging are available (each call reports its own status).
for nltk_package in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_package)

# English stop words, kept in a set for membership tests.
stop_words = set(stopwords.words('english'))

def rimuovi_parentesi(testo):
    """Return ``testo`` with every ``(...)`` span removed, parentheses included.

    Only spans that have both an opening and a closing parenthesis are
    removed; nothing is inserted in their place.
    """
    parenthesised_span = r'\([^)]*\)'
    return re.sub(parenthesised_span, '', testo)

def remove_stopwords_and_adjectives(text):
    """Strip parenthesised text, stop words, adjectives and '*' from ``text``.

    Parameters
    ----------
    text : str
        The raw name to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    tokens = wordpunct_tokenize(rimuovi_parentesi(text))

    kept_words = []
    for word, tag in pos_tag(tokens):
        # Drop stop words (case-insensitively) and adjectives of any degree.
        if word.lower() in stop_words or tag in ('JJ', 'JJR', 'JJS'):
            continue
        stripped = word.replace('*', '')
        # A token that was only asterisks is empty after stripping. Keeping
        # it would add stray spaces to the result (e.g. "APPLE*" -> "APPLE "),
        # which breaks later exact or substring comparisons.
        if stripped:
            kept_words.append(stripped)

    # ' '.join already yields the bare word for one token and '' for none.
    return ' '.join(kept_words)

def clean_ingredients_dataframe(df):
    """Return the cleaned ``'Food commodity ITEM'`` column of ``df``.

    Every value of the column is passed through
    ``remove_stopwords_and_adjectives``. ``df`` itself is not modified; the
    result is a new Series.
    """
    item_names = df['Food commodity ITEM']
    cleaned_names = item_names.apply(remove_stopwords_and_adjectives)
    return cleaned_names

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [9]:
import ast

def get_all_unique_ingredients(recipes_df):
    """Collect every ingredient occurrence from ``recipes_df['ingredients']``.

    Despite the name, duplicates are kept: the result has one entry per
    occurrence, in row order, so that occurrences can be counted afterwards.

    Parameters
    ----------
    recipes_df : pandas.DataFrame
        Frame whose ``'ingredients'`` column holds an iterable of ingredient
        names per row.

    Returns
    -------
    list
        All ingredient entries of all rows, concatenated.
    """
    all_ingredients = []

    for ingredients in recipes_df['ingredients']:
        # A row that still holds a raw string must not be iterated: that would
        # add its individual characters as "ingredients".
        if isinstance(ingredients, (str, bytes)):
            continue
        try:
            row_items = iter(ingredients)
        except TypeError:
            # Missing values (NaN/None) are not iterable; skip the row.
            continue
        all_ingredients.extend(row_items)

    return all_ingredients

def get_dict_name_occ(ings_list):
    """Count how many times each element occurs in ``ings_list``.

    Returns a dict mapping each distinct element to its number of
    occurrences; keys appear in order of first occurrence.
    """
    occurrences = {}
    for name in ings_list:
        occurrences[name] = occurrences.get(name, 0) + 1
    return occurrences


In [10]:
def create_dict_ing_cfp_wfp(recipes_df, ingredients_df):
    """Split recipe ingredients by whether they match a food commodity item.

    An ingredient counts as present when at least one non-empty
    ``'Food commodity ITEM'`` value is a substring of the upper-cased
    ingredient name.

    Parameters
    ----------
    recipes_df : pandas.DataFrame
        Recipes frame, passed to ``get_all_unique_ingredients``.
    ingredients_df : pandas.DataFrame
        Frame with a ``'Food commodity ITEM'`` column.
        NOTE(review): the comparison upper-cases only the recipe ingredient,
        so it presumably expects upper-case item names - confirm.

    Returns
    -------
    tuple of (dict, dict)
        ``(missing_ing, present_ing)``. Each dict has a ``'name'`` and a
        ``'count'`` list, with one entry per distinct ingredient.
    """
    missing_ing = {'name': [], 'count': []}
    present_ing = {'name': [], 'count': []}

    # Occurrence count of every ingredient that appears in the recipes.
    dict_occ = get_dict_name_occ(get_all_unique_ingredients(recipes_df))

    # Read the item names once instead of walking the frame row by row for
    # every ingredient. Empty names are dropped because '' is a substring of
    # every string and would mark every ingredient as present. Non-string
    # values are dropped because the `in` test below would raise on them.
    item_names = list(dict.fromkeys(
        item for item in ingredients_df['Food commodity ITEM'].tolist()
        if isinstance(item, str) and item
    ))

    for ing, occ in dict_occ.items():
        ing_upper = ing.upper()
        # any() stops at the first match, so an ingredient that matches
        # several items is still recorded only once.
        is_present = any(item in ing_upper for item in item_names)
        target = present_ing if is_present else missing_ing
        target['name'].append(ing)
        target['count'].append(occ)

    return missing_ing, present_ing


In [11]:
import ast
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Make sure the NLTK data packages needed for tokenising, POS tagging and
# stop-word filtering are available (each call reports its own status).
for nltk_package in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(nltk_package)

# Define the set of English stop words
stop_words = set(stopwords.words('english'))

def clean_ingredient(ingredient):
    # Rimuovere numeri, caratteri speciali e il contenuto tra parentesi tonde (compreso)
    ingredient = re.sub(r'\([^)]*\)|[0-9]+|\W+', ' ', ingredient)

    # Tokenizza la frase
    words = word_tokenize(ingredient)

    # POS tagging
    tagged_words = pos_tag(words)

    # Filtra le parole eliminando stop words, verbi, aggettivi e mantenendo solo i sostantivi
    filtered_words = [word for word, tag in tagged_words if (tag.startswith('NN') or tag.startswith('JJ')) and word.lower() not in stop_words]

    # Ricostruisci l'ingrediente pulito
    cleaned_ingredient = ' '.join(filtered_words)

    return cleaned_ingredient

def _clean_ingredients_field(ingredients_field):
    """Return the cleaned ingredient names of one cell, or None to leave it as is."""
    # Only a non-empty string can hold a literal to parse. NaN/None and
    # already-cleaned lists are left alone. Checking the type first also keeps
    # a re-run from crashing on list cells, for which pd.isna() returns an
    # array that has no single truth value.
    if not isinstance(ingredients_field, str) or ingredients_field == '':
        return None

    # Literal evaluation of the string field into a Python dictionary.
    try:
        ingredients_data = ast.literal_eval(ingredients_field)
    except (ValueError, TypeError, SyntaxError):
        # Malformed text raises SyntaxError, not only ValueError; skip the cell.
        return None

    # Anything other than a dict has no .values() to walk.
    if not isinstance(ingredients_data, dict):
        return None

    cleaned_ingredients = []
    for value in ingredients_data.values():
        for item in value:
            # Take only the ingredient name (first element), not the quantity.
            cleaned_ingredients.append(clean_ingredient(item[0]))
    return cleaned_ingredients


def clean_recipe_ingredients_dataframe(recipes_df):
    """Replace each raw ``'ingredients'`` cell by a list of cleaned names.

    Each cell is expected to hold the string form of a dict whose values are
    sequences of items, the first element of each item being the ingredient
    name. Cells that are missing, empty, not strings, or not parseable as
    such a dict are left unchanged.

    Parameters
    ----------
    recipes_df : pandas.DataFrame
        Frame with an ``'ingredients'`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``recipes_df`` object, after modification.
    """
    # Snapshot the column first so that writing into the frame below cannot
    # interfere with the iteration. Iterating one column also avoids building
    # a Series for every row, as iterrows() does.
    for index, ingredients_field in list(recipes_df['ingredients'].items()):
        cleaned_ingredients = _clean_ingredients_field(ingredients_field)
        if cleaned_ingredients is None:
            continue  # Nothing to clean in this row

        # Update the ingredients column of the existing DataFrame.
        recipes_df.at[index, 'ingredients'] = cleaned_ingredients

    return recipes_df


def clean_recipes_dataframe(recipes_df):
    """Return the recipes that have ingredients, a unique title and tags.

    Filters are applied in this order: rows whose ``'ingredients'`` is missing
    or ``''`` are dropped; for repeated ``'title'`` values only the first row
    is kept; rows whose ``'tags'`` is missing or ``''`` are dropped.
    ``recipes_df`` itself is not modified.
    """
    # Drop recipes without ingredients
    with_ingredients = recipes_df.dropna(subset=['ingredients'])
    has_ingredient_text = with_ingredients['ingredients'].ne('')
    with_ingredients = with_ingredients[has_ingredient_text]

    # Drop duplicates by title, keeping the first occurrence
    unique_titles = with_ingredients.drop_duplicates(subset='title', keep='first')

    # Keep only recipes with valid tags
    has_tags = unique_titles['tags'].notna() & unique_titles['tags'].ne('')
    return unique_titles[has_tags]

# Filter the raw recipes first, then replace each remaining recipe's raw
# ingredient field with its list of cleaned ingredient names.
recipes_df_cleaned = clean_recipe_ingredients_dataframe(
    clean_recipes_dataframe(recipes_df)
)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Utente\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Drop every row that has at least one missing value, then replace the item
# names with their cleaned form (see clean_ingredients_dataframe).
# NOTE(review): this cell rebinds ingredients_df, so running it a second time
# cleans the already-cleaned names again.
ingredients_df = ingredients_df.dropna()
ingredients_df['Food commodity ITEM'] = clean_ingredients_dataframe(ingredients_df)

In [13]:
# Split the recipe ingredients into those without and those with a matching food commodity item.
missing_ings_dict,present_ings_dict = create_dict_ing_cfp_wfp(recipes_df_cleaned, ingredients_df)

In [14]:
# Tabulate the unmatched ingredients (columns: name, count) and preview the first rows.
df = pd.DataFrame.from_dict(missing_ings_dict)
df.head()

Unnamed: 0,name,count
0,shrimp count good,1
1,cilantro,2636
2,jalapeno seeds,23
3,lamb,159
4,mint flakes,10


In [15]:
# Number of unmatched ingredient names.
len(df)

104524

In [16]:
# Number of recipes left after cleaning.
len(recipes_df_cleaned)

214800

In [17]:
# Save the unmatched ingredients and their counts, without the index column.
df.to_csv('ing_occ.csv', index=False)

In [18]:
# Order the unmatched ingredients from most to least frequent.
df_ordinato = df.sort_values(by='count', ascending=False)

In [19]:
# Show the 20 most frequent unmatched ingredients.
df_ordinato.head(20)

Unnamed: 0,name,count
9,salt,77570
16,sugar,54662
18,flour,43103
26,butter,42812
44,water,29556
29,milk,24654
57,eggs,22608
70,brown sugar,16065
60,powder,15901
56,soda,12749


In [20]:
# Save the cleaned ingredient table, without the index column.
ingredients_df.to_csv('CSEL_df_cleaned.csv', index=False)