# ML Datasets for text classification

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Ingredients: the name of every validated item, labelled 'ingredient'.
items_backup = pd.read_csv('/Users/vincentsalamand/Documents/mama_db_backup/mama_items.csv')

# Only rows whose 'is_validated' column equals the string "t" are kept.
validated_items = items_backup[items_backup['is_validated'] == "t"]

# Keep just the name column, add the label, and move the old row index into
# an 'index' column so it can be discarded below.
df_items = validated_items[['name']].assign(type='ingredient').reset_index()

# Final frame: columns 'text' and 'type' on a fresh 0..n-1 index.
ingredients = df_items.drop(columns=['index']).rename(columns={'name': 'text'})
ingredients.tail()

Unnamed: 0,text,type
38869,poivre,ingredient
38870,poivre,ingredient
38871,Poivre,ingredient
38872,poivre,ingredient
38873,poivre,ingredient


In [3]:
# Servings: the 'servings' value of every recipe, labelled 'serving'.
recipes_backup = pd.read_csv('/Users/vincentsalamand/Documents/mama_db_backup/mama_recipes.csv')

servings = (
    recipes_backup[['servings']]
    .assign(type='serving')
    .dropna()  # recipes without a servings value are discarded
    .rename(columns={'servings': 'text'})
)
servings.tail()

Unnamed: 0,text,type
7122,1 personne,serving
7123,4,serving
7124,4,serving
7125,6 personnes,serving
7126,5 personnes,serving


In [4]:
# Instructions: one row per '\r\n'-separated line of each recipe's
# instruction text, labelled 'instruction'.
recipes_instructions_backup = recipes_backup.loc[
    recipes_backup['instructions'].notnull(), ['instructions']
]

# One list of lines per recipe ...
data = [steps.split('\r\n') for steps in recipes_instructions_backup['instructions']]

# ... flattened into a single column of lines.
instructions_dict = {"text": [line for steps in data for line in steps]}

instructions = pd.DataFrame(instructions_dict).assign(type='instruction')
instructions.tail()

Unnamed: 0,text,type
47209,Faire chauffer 3 cuillères à soupe d'huile d'o...,instruction
47210,"Ajouter les dés de veau et le jambon, mouiller...",instruction
47211,Couvrir et laisser mijoter pendant 1h30. Penda...,instruction
47212,"Après 1h30 de cuisson, goûter et rectifier l'a...",instruction
47213,Plus l'axoa mijote et meilleur il est donc ne ...,instruction


In [5]:
# Titles: the title of every recipe, labelled 'title'.
titles = (
    recipes_backup[['title']]
    .rename(columns={'title': 'text'})
    .assign(type='title')
)

titles.tail()

Unnamed: 0,text,type
7122,Risotto de crozets au potiron et beaufort,title
7123,"Poulet basquaise, recette facile",title
7124,Frittata aux courgettes,title
7125,"Tarte fine aux pêches, citron vert et romarin",title
7126,Axoa de veau du pays basque,title


In [6]:
# Various text: rows of various_recipe_text.csv, labelled 'other'.
stories = pd.read_csv(
    '/Users/vincentsalamand/Documents/datasets/various_recipe_text.csv'
).assign(type='other')
stories.tail()


Unnamed: 0,text,type
5548,J’ai refait ma recette de brioche moelleuse dé...,other
5549,Je n’ai pas pu m’empêcher de la photographier ...,other
5550,"Pour cette recette, j’ai utilisé environ 1/3 d...",other
5551,"La brioche est moelleuse, parfaitement aromati...",other
5552,Elle monte parfaitement pour être très légère ...,other


In [44]:
# Create labeled dataset: stack the five labelled frames into one
# ('text', 'type') table and normalise the text.
frames = [ingredients, servings, instructions, titles, stories]
# ignore_index=True gives the combined frame a fresh 0..n-1 index directly,
# instead of reset_index() followed by dropping the 'index' column.
data = pd.concat(frames, ignore_index=True)

# Preprocessing.
# Drop missing values BEFORE cleaning: a missing text is a float NaN, which
# has no .replace(), so cleaning first raises AttributeError. `titles` and
# `stories` are concatenated without a prior dropna, so NaN can reach here.
data = data.dropna()

# Remove line breaks and non-breaking spaces, trim surrounding whitespace,
# and lower-case. assign() returns a new frame, so nothing is written into
# a possible copy.
data = data.assign(
    text=data['text'].map(
        lambda x: x.replace('\n', '').replace('\r', '').replace('\xa0', '').strip().lower()
    )
)

# Discard rows whose text is empty after cleaning (already stripped above,
# so a plain truthiness test is enough).
data = data[data['text'].astype(bool)]

# Sanity checks: number of complete rows, then number of rows that still
# contain a missing value (expected to be 0).
print(len(data.dropna()))
print(len(data[data.isnull().any(axis=1)]))

# Spot check of one row by position; raises IndexError if the dataset has
# fewer than 46975 rows.
data.iloc[46974]['text']

104949
0


'couper les tranches de chorizo en 4.'

In [45]:
# Write the labelled dataset to disk as CSV. The row index is omitted so the
# file contains only the 'text' and 'type' columns.
data.to_csv(
    '/Users/vincentsalamand/Documents/datasets/label_recipe_text.csv',
    index=False,
)


In [46]:
# Read the exported file back in so its contents can be checked.
yo = pd.read_csv(
    r'/Users/vincentsalamand/Documents/datasets/label_recipe_text.csv'
)


In [47]:
# Check the re-loaded file: total rows, rows containing any missing value,
# and rows without missing values.
rows_with_null = yo.isnull().any(axis=1)
print(len(yo))
print(len(yo[rows_with_null]))
print(len(yo[~rows_with_null]))

# Display the rows that contain a missing value (empty when the file is clean).
yo[rows_with_null]



104949
0
104949


Unnamed: 0,text,type
