In [1]:
import pandas as pd
import re
from collections import Counter

In [2]:
pd.options.display.max_colwidth = 100
pd.set_option("display.max_rows", 2000)

In [3]:
rcp = pd.read_csv("All_recipes.csv", index_col = 0, encoding = 'utf-8')

In [4]:
rcp.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 124581 entries, 0 to 124580
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   file_name            124581 non-null  object 
 1   titel_gerecht        124581 non-null  object 
 2   url                  124581 non-null  object 
 3   keuken               92288 non-null   object 
 4   gang                 102918 non-null  object 
 5   tijdsduur            113507 non-null  object 
 6   datum                124581 non-null  object 
 7   intro                124309 non-null  object 
 8   recepttekst          124033 non-null  object 
 9   plaatser_naam        124579 non-null  object 
 10  plaatser_URL         124579 non-null  object 
 11  plaatser_volgers     124581 non-null  int64  
 12  plaatser_recepten    124581 non-null  int64  
 13  plaatser_kookboeken  124581 non-null  int64  
 14  ingrediënten         124581 non-null  object 
 15  tags             

In [5]:
rcp['titel_gerecht'].head()

0    Zoete-aardappelkoekjes van Ottolenghi
1          Scampi's met schorsenerencrÃ¨me
2     gekookte muizen met gebakken slakken
3                     Koffie-advocaatcoupe
4                          Tomatentapenade
Name: titel_gerecht, dtype: object

In [6]:
def fix_encoding(s):
    s = s.encode('cp1252', errors="replace").decode('utf-8', errors='replace')
    return s

In [7]:
rcp_c = pd.DataFrame()

In [8]:
rcp_c["titel_gerecht"] = rcp["titel_gerecht"].apply(lambda s: fix_encoding(s)).str.lower()

In [9]:
replace_dict = {'Ingrediënten\s\d{1,2}\spersonen':'','Ingrediënten': ''}


def clean_ingredients(s): 
    
    #replace character multiples with something
    ingr_dict = {
        '\n': ' ',
        'Ã¨': 'e',
        'Ã®':'i',
        'Ã¯':'i',
        'Â': '',
        'Ã©':'e',
        'Â':'x',
        'Ã¡':'x',
        'Ã«':'e',
        'â' : '',
        '¢' : '',
        'Ã¯': 'i',
        'crÃ¨me': 'creme',
        'fraÃ®che':'fraiche',
        '•':''
        }
    for k in ingr_dict:
        s = s.replace(k, ingr_dict[k])
    
    #remove selected characters
    chars = '!@#$~%^&*\'()`_+<>?:.·\\|,";-=/[]{}�'
    for c in s:
        if c in chars:
            s = s.replace(c, " ")    
    
    #remove digits
    digits = re.compile('\d*')
    s = digits.sub('', s)    
    
    #reduces whitespace to one
    multiple_spaces = re.compile('\s+')
    s = multiple_spaces.sub(' ', s)
    s.strip()
    i = s.lower()
    
    return i

stopwords_to_remove = pd.read_csv("ReferenceLists/remove.csv", index_col = 0, encoding = 'utf-8', header=None)
remove_pattern = r'\b(?:{})\b'.format('|'.join(stopwords_to_remove.index))

a_to_remove = pd.read_csv("ReferenceLists/adjectives.csv", index_col = 0, encoding = 'utf-8', header=0)
remove_pattern_a = r'\b(?:{})\b'.format('|'.join(a_to_remove.index))

i_to_remove = pd.read_csv("ReferenceLists/ingredients.csv", index_col = 0, encoding = 'utf-8', header=0)
remove_pattern_i = r'\b(?:{})\b'.format('|'.join(i_to_remove.index))



rcp_c['ingrediënten'] = rcp['ingrediënten'].replace(regex = replace_dict).apply(lambda s: fix_encoding(s)).apply(lambda s: clean_ingredients(s)).str.replace(remove_pattern, '').str.replace(remove_pattern_a, '').str.replace(remove_pattern_i, '')

In [10]:
rcp_c[0:500]

Unnamed: 0,titel_gerecht,ingrediënten
0,zoete-aardappelkoekjes van ottolenghi,geschilde zoete fijne kristalsuiker bosui smaak royaal bakken saus...
1,scampi's met schorsenerencrème,scampi schorseneren rucolahandje
2,gekookte muizen met gebakken slakken,gekookte tomaatjes sla komkommer kruidenboter radijsjes mayonaise tube
3,koffie-advocaatcoupe,hete koffie bolletjes vanille chocoladeijs stijfgeslagen advocaat chocolait chips nestle
4,tomatentapenade,pesto
5,kaas-uientaart,deeg koude eidooiers vulling ontbijtspek leerdammer creme fraiche
6,ei-avocadosalade met rauwe ham,sesamzaad avocado olijven piment rauwe ham veldsla rucola dressing wijnazijn ...
7,knolselderij met pesto,knolselderij gewassen geschild zonnebloemolie selderij parmezaanse grofgeraspt walnot...
8,gevuld fruit en groenten,cherrytomaatjes huttenkase mayonaise bosuitje tabasco
9,gnocchi met groente,oudbakken volkorenbrood hete wortel venkelknol cayennepeper paneermeel pizzatomaten ...


In [11]:
#FIND MOST COMMON SINGLE WORDS

In [12]:
monogramvocabulary = Counter()

In [13]:
monograms = rcp_c["ingrediënten"].str.split().apply(monogramvocabulary.update)

In [14]:
monogram_freq = pd.DataFrame(monogramvocabulary.most_common(), columns = ['woord', 'freq'])

In [15]:
monogram_freq[0:500]

Unnamed: 0,woord,freq
0,smaak,7254
1,kruiden,5544
2,sap,5535
3,saus,4241
4,creme,4130
5,yoghurt,4056
6,ham,3927
7,basterdsuiker,3862
8,mayonaise,3848
9,gekookte,3815


In [16]:
#FIND MOST COMMON BIGRAMS

In [17]:
def return_bigrams(words):
    bigrams = zip(words, words[1:])
    return bigrams

In [18]:
bigramvocabulary = Counter()
bigrams = rcp_c["ingrediënten"].str.split().apply(return_bigrams).apply(bigramvocabulary.update)

In [19]:
bigram_freq = pd.DataFrame(bigramvocabulary.most_common(), columns = ['bigram', 'freq'])

In [20]:
def concatenate_bigram(t):
    return str(t[0] + "_" + t[1])

In [21]:
bigram_freq["bigram_conc"] = bigram_freq.bigram.apply(concatenate_bigram)

In [22]:
bigram_freq[0:500]

Unnamed: 0,bigram,freq,bigram_conc
0,"(creme, fraiche)",3427,creme_fraiche
1,"(lente, uitjes)",1874,lente_uitjes
2,"(zelfrijzend, bakmeel)",1610,zelfrijzend_bakmeel
3,"(pure, chocolade)",1400,pure_chocolade
4,"(sambal, oelek)",1270,sambal_oelek
5,"(italiaanse, kruiden)",1188,italiaanse_kruiden
6,"(crème, fraîche)",1035,crème_fraîche
7,"(jus, dorange)",961,jus_dorange
8,"(olijven, pit)",865,olijven_pit
9,"(gekookte, ham)",813,gekookte_ham


In [23]:
#FIND MOST COMMON TRIGRAMS

In [24]:
def return_trigrams(words):
    trigrams = zip(words, words[1:], words[2:])
    return trigrams

In [25]:
trigramvocabulary = Counter()
trigrams = rcp_c["ingrediënten"].str.split().apply(return_trigrams).apply(trigramvocabulary.update)

In [26]:
trigram_freq = pd.DataFrame(trigramvocabulary.most_common(), columns = ['trigram', 'freq'])

In [27]:
def concatenate_trigram(t):
    return str(t[0] + "_" + t[1] + "_" + t[2])

In [28]:
trigram_freq["trigram_conc"] = trigram_freq.trigram.apply(concatenate_trigram)

In [29]:
trigram_freq[0:5]

Unnamed: 0,trigram,freq,trigram_conc
0,"(traditioneel, c, selectie)",226,traditioneel_c_selectie
1,"(medium, dry, sherry)",143,medium_dry_sherry
2,"(zelfrijzend, bakmeel, bakpoeder)",139,zelfrijzend_bakmeel_bakpoeder
3,"(lente, uitjes, ringetjes)",135,lente_uitjes_ringetjes
4,"(deeg, hartige, taart)",129,deeg_hartige_taart


In [30]:
# SIEVING N GRAMS

In [31]:
len(monogram_freq), len(bigram_freq), len(trigram_freq)

(49902, 532479, 780147)

In [32]:
monogram_freq.to_csv("monograms.csv", index = True)

In [33]:
bigram_freq.to_csv("bigrams.csv", index = True)

In [34]:
trigram_freq.to_csv("trigrams.csv", index = True)