In [4]:
# the library used
import numpy as np
import pandas as pd
import pysubgroup as ps
import re
from sklearn.feature_extraction import DictVectorizer
import time
import ast
import string 
import imblearn
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

## Helper Function

In [5]:
def tags_preprocess(tags):
    """
    input: tag string
    output: list of individual tags in the given tag string
    function: preprocess a single tag string 
    """
    tags = tags.replace("'","")
    tags = tags.replace(" ","")
    tags = tags.replace("[","")
    tags = tags.replace("]","")
    tags = tags.split(",")
    tags = [x.lower() for x in tags]
    return tags

In [6]:
def ingredients_preprocess(dataframe):
    """
    input: dataframe
    output: list of distinct ingredient list
    function: preprocess the ingredient columns and return a list of distinct ingredients
    """
    distinct_ingredients = []
    
    for i in range(len(dataframe)):
        ingredients = dataframe.iloc[i]['ingredient']
        
        r = re.compile('[A-Z]{1}[a-zA-Z]+')
        ingredients = str(ingredients)
        #ingredients = ''.join(i for i in ingredients if not i.isdigit())
        ingredients = ingredients.replace("'","")
        ingredients = ingredients.replace(" ","")
        ingredients = ingredients.replace("[","")
        ingredients = ingredients.replace("]","")
        # remove text inside parentheses
        ingredients = re.sub(r'\([^())]*\)',"", ingredients)
        ingredients = ingredients.split(",")
        ingredients = list(filter(r.match, ingredients))
        ingredients = [x.lower() for x in ingredients]
        distinct_ingredients += ingredients
        dataframe.iloc[i]['ingredient'] = ingredients
        
    return list(set(distinct_ingredients))

In [7]:
def get_recipe_countries(countries, data):
    """
    input: list of countries, dataframe
    output: selected dataframe whose recipes is from these countries
    function: select the rows in dataframe whose "tag" value contain one country tag
    """
    # add a new column class 
    drop_index = []
    for i in range(len(data)):
        tags = data.loc[i]["tags"]
        tags = tags_preprocess(tags)
        
        country_same =[l for l in countries if l in tags]
            
        if len(country_same) == 1:
            data.at[i, 'label'] = country_same[0]
        if len(country_same) == 0:
            drop_index.append(i)
        if len(country_same) > 1:
            drop_index.append(i)
            #data.at[i, 'label'] = 'overlap'
            
    # drop the columns which has no season tags
    data = data.drop(data.index[drop_index])
    return data

In [8]:
def convert_to_dict(arr):
    """
    Helper function to convect an array of ingredients to a dictionary
    """
    d={}
    for a in arr:
        d[a]=1
    return d

In [9]:
# extract comment user dataset from original dataset
def extract_com_user(data):
    """
    input: dataframe
    output: a new dataframe with all the comment user information
    function: spilt the dictionary of the column 'comment user' in original dataset
    """
    df_com = pd.DataFrame()
    for index, item in data['comment_user'].iteritems():
        if (item != '[]'):
            if (item != 'no comment'):
                array = ast.literal_eval(item)
                df_array = pd.DataFrame(array)
                df_array['recipe_id'] = index
                df_com = pd.concat([df_com,df_array])
    return df_com

In [10]:
def sub_cat_in_com(data):
    """
    input: dataframe
    output: a new dataframe with multi colunms
    function: add one subcategory of recipes to comment user dataset 
    """    
    punct = set(string.punctuation) 
    list_sub_cat = []
    
    df_com2 = pd.DataFrame()
    for index, item in data['calorie_value'].iteritems(): 
        if (item != None):
            list_sub = list(item)
            list_sub = ''.join(x for x in list_sub if x not in punct)
            list_sub_cat.append(list_sub)
    df_sub_cat = pd.DataFrame(list_sub_cat)
    df_sub_cat['calorie_value'] = df_sub_cat
    
    df_sub_cat['recipe_id'] = data['calorie_value'].index     
    df_com2 = pd.concat([df_com2,df_sub_cat])
    return df_com2

In [11]:
def age_group(age):   
    """
    input: age value
    output: group description
    function: divide age value uinto 5 groups 
    """   
    bucket = None
    age = int(age)    
    if age < 30:
        bucket = '<30 Jahre'    
    if age in range(30, 41):
        bucket = '30-40 Jahre'        
    if age in range(40, 51):
        bucket = '40-50 Jahre'        
    if age in range(50, 61):
        bucket = '50-60 Jahre'
    if age >= 61:
        bucket = '60+ Jahre'
    return bucket

In [12]:
def calorie_level(calorie):   
    """
    input: calorie value
    output: group description
    function: divide calorie value into 3 groups 
    """   
    bucket = None
    calorie = int(calorie)    
    if calorie < 300:
        bucket = 'low_calorie'    
    if calorie in range(300, 500):
        bucket = 'medium_calorie'        
    if calorie >= 500:
        bucket = 'high_calorie'      
    return bucket

In [13]:
def remove_None(data, name):
    """
    Helper function to remove None value in one column
    """ 
    y = data[data[name] == 'None']
    index_n = y.index.tolist()
    data = data.drop(index = index_n)
    return data

In [14]:
def add_target(data, calorie_level):
    df_sub_group[country] = df_dum_car[calorie_level]
    return df_sub_group

## Subgroup Discovery

In this section we will use subgroup discovery to explore the association rules between attributes

- why we choose subgroup discovery?

because we find out that subgroup discovery is quite powerful compared to other data mining techniques. As long as we set differnt target with different search space, we can use use subgroup discovery technique to dig almost all interesting pattern that we want to explore from the dataset. 


In [15]:
# read the data
data = pd.read_csv("all_data.csv")

In [16]:
data.head()

Unnamed: 0,categorize,recipe_name,tags,avg_score,difficulty,ingredient,rating_count,calorie,preparation_time,comment_user,recipe_url
0,Menüart,"""A bis Z""-Salat","['Salat', 'Gemüse', 'Sommer', 'Vegetarisch', '...",2.67,simpel,"['1m.-große', 'Zucchini', '1', 'Apfel,säuerlic...",1.0,,15 Min,"[{'rating': 'rating-3', 'comment_time': '24.09...",https://www.chefkoch.de/rezepte/23694213760282...
1,Menüart,"""Aufgehende Sonne""","['Asien', 'Hauptspeise', 'Nudeln', 'Beilage', ...",3.6,normal,"['500g', 'Nudeln(Mie-),ReisnudelnoderReis', '4...",3.0,500kcal,30 Min,"[{'rating': 'rating-5', 'comment_time': '10.08...",https://www.chefkoch.de/rezepte/18838313064020...
2,Regional,"""Black and White"" New York Cheesecake","['Backen', 'USAoderKanada', 'Kuchen']",0.0,normal,"['250g', 'Bitterschokolade,70%', '300g', 'Schl...",0.0,,60 Min,,https://www.chefkoch.de/rezepte/36158315436795...
3,Menüart,"""Bleib gesund""-Smoothie SuperNova","['Getränk', 'Vegetarisch', 'Frühstück', 'kalor...",4.03,simpel,"['1m.-große', 'Karotte(n)', '1', 'Apfel', '1St...",84.0,205kcal,5 Min,"[{'rating': 'rating-1', 'comment_time': '13.01...",https://www.chefkoch.de/rezepte/20464013311985...
4,Menüart,"""Bottermelk Anballerste""","['Hauptspeise', 'Europa', 'Suppe', 'gebunden',...",3.6,normal,"['500g', 'Kartoffel(n)', '1', 'Zwiebel(n)', '1...",3.0,,15 Min,"[{'rating': 'rating-3', 'comment_time': '10.11...",https://www.chefkoch.de/rezepte/33875015040182...


### Association rules between comment user information and recipe attributes

- associations with "calorie" and comment user information

- data imbalance: gender
- improve the quality of data (preprossing)
- sampling 
- improne quality of result: different search algorithmn and selectors choices
- if the quality is also very low, plot the t-SNE

In [17]:
pat = r"([-+]?\d*\.\d+|\d+)"
data["calorie_value"] = data["calorie"].str.extract(pat, flags=0, expand=True)
data.head()

Unnamed: 0,categorize,recipe_name,tags,avg_score,difficulty,ingredient,rating_count,calorie,preparation_time,comment_user,recipe_url,calorie_value
0,Menüart,"""A bis Z""-Salat","['Salat', 'Gemüse', 'Sommer', 'Vegetarisch', '...",2.67,simpel,"['1m.-große', 'Zucchini', '1', 'Apfel,säuerlic...",1.0,,15 Min,"[{'rating': 'rating-3', 'comment_time': '24.09...",https://www.chefkoch.de/rezepte/23694213760282...,
1,Menüart,"""Aufgehende Sonne""","['Asien', 'Hauptspeise', 'Nudeln', 'Beilage', ...",3.6,normal,"['500g', 'Nudeln(Mie-),ReisnudelnoderReis', '4...",3.0,500kcal,30 Min,"[{'rating': 'rating-5', 'comment_time': '10.08...",https://www.chefkoch.de/rezepte/18838313064020...,500.0
2,Regional,"""Black and White"" New York Cheesecake","['Backen', 'USAoderKanada', 'Kuchen']",0.0,normal,"['250g', 'Bitterschokolade,70%', '300g', 'Schl...",0.0,,60 Min,,https://www.chefkoch.de/rezepte/36158315436795...,
3,Menüart,"""Bleib gesund""-Smoothie SuperNova","['Getränk', 'Vegetarisch', 'Frühstück', 'kalor...",4.03,simpel,"['1m.-große', 'Karotte(n)', '1', 'Apfel', '1St...",84.0,205kcal,5 Min,"[{'rating': 'rating-1', 'comment_time': '13.01...",https://www.chefkoch.de/rezepte/20464013311985...,205.0
4,Menüart,"""Bottermelk Anballerste""","['Hauptspeise', 'Europa', 'Suppe', 'gebunden',...",3.6,normal,"['500g', 'Kartoffel(n)', '1', 'Zwiebel(n)', '1...",3.0,,15 Min,"[{'rating': 'rating-3', 'comment_time': '10.11...",https://www.chefkoch.de/rezepte/33875015040182...,


In [20]:
data_com = data.dropna(subset=["comment_user",'calorie_value'])
data_com = data_com[0:1800]
data_com.head()
# data_com data_com_com['comment_user']
df_com_1 = extract_com_user(data_com)
df_com_2 = sub_cat_in_com(data_com)
df_com_new = df_com_1.merge(df_com_2, on='recipe_id', how='left')

df_com_new = df_com_new.drop(0,1)
df_com_new = df_com_new[['recipe_id','name','rating','sex','age','marriage_status','job','comment_time','calorie_value']]
df_com_new = df_com_new.set_index(["recipe_id"])
df_com_new.head()


Unnamed: 0_level_0,name,rating,sex,age,marriage_status,job,comment_time,calorie_value
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Haubndaucher,rating-5,männlich,46 Jahre,Verheiratet,Verkäufer.,10.08.2017 14:56,500
1,Lumakath,rating-4,weiblich,49 Jahre,Verheiratet,,19.01.2013 13:19,500
1,esther76,rating-4,weiblich,,Verheiratet,Busfahrerin,21.10.2012 17:56,500
3,t1n4a,rating-1,weiblich,25 Jahre,,,13.01.2020 21:05,205
3,patty89,rating-4,weiblich,,Verheiratet,,03.09.2019 13:18,205


In [21]:
# remove none value in the whole data set 
df_com_new = remove_None(df_com_new, 'age')
df_com_new = remove_None(df_com_new, 'sex')
# df_com_new = remove_None(df_com_new, 'marriage_status')
# df_com_new = remove_None(df_com_new, 'job')
df_com_new = remove_None(df_com_new, 'calorie_value')
df_com_new['sex'].value_counts()
# df_com_new['marriage_status'].value_counts()
# df_com_new['job'].value_counts()
# df_com_new

weiblich    318
männlich     53
Name: sex, dtype: int64

In [25]:
# add calorie level and age group columns in the comment user information
df_com_new['calorie_level'] = df_com_new['calorie_value'].apply(calorie_level)
df_dum_car = pd.get_dummies(df_com_new['calorie_level'])
df_com_age = df_com_new
df_com_age["age_value"] = df_com_age["age"].str.extract(pat, flags=0, expand=True)
df_com_age['age_group'] = df_com_age['age_value'].apply(age_group)
# df_com_car = pd.merge(df_com_new,df_dum_car,left_on='name', right_on=None, how='left')
# df_com_car = pd.merge(df_com_new, df_dum_car)
df_com_car = df_com_new.join(df_dum_car, how='left')

# df_com_new.reset_index(inplace = True)
df_com_car

df_age = df_com_car[['rating','sex','age_group','marriage_status','job','high_calorie','low_calorie','medium_calorie']]
# df_age['sex'].value_counts()


 - resampling the data set because the imbalance data of sex

In [26]:
df_sampling = df_com_age[['rating','sex','age_group','marriage_status','job','calorie_level','calorie_value']]
df_sampling

Unnamed: 0_level_0,rating,sex,age_group,marriage_status,job,calorie_level,calorie_value
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
43,rating-4,weiblich,30-40 Jahre,,,high_calorie,564
43,rating-5,weiblich,50-60 Jahre,Verheiratet,,high_calorie,564
61,rating-5,weiblich,40-50 Jahre,,,high_calorie,786
79,rating-4,weiblich,50-60 Jahre,Verheiratet,Hausfrau,medium_calorie,497
79,rating-4,weiblich,<30 Jahre,,schülerin,medium_calorie,497
...,...,...,...,...,...,...,...
8471,rating-5,männlich,<30 Jahre,Single,Kochazubi,high_calorie,2099
8471,rating-5,weiblich,40-50 Jahre,Verheiratet,Krankenschwester,high_calorie,2099
8471,rating-3,weiblich,30-40 Jahre,,,high_calorie,2099
8486,rating-5,weiblich,60+ Jahre,Verheiratet,Mitarbeiterin bei Chefkoch.de - Community Cont...,high_calorie,2422


In [27]:
# oversampling
# Separate input features and target
y = df_sampling['calorie_level']
X = df_sampling.drop('calorie_level', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# concatenate our training data back together
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
female = df_sampling[df_sampling['sex'] == 'weiblich']
male = df_sampling[df_sampling['sex'] == 'männlich']

# upsample minority
male_upsampled = resample(male,
                          replace=True, # sample with replacement
                          n_samples=len(female), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([female, male_upsampled])
# upsampled['sex'].value_counts()
upsampled.head()

Unnamed: 0_level_0,rating,sex,age_group,marriage_status,job,calorie_level,calorie_value
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
43,rating-4,weiblich,30-40 Jahre,,,high_calorie,564
43,rating-5,weiblich,50-60 Jahre,Verheiratet,,high_calorie,564
61,rating-5,weiblich,40-50 Jahre,,,high_calorie,786
79,rating-4,weiblich,50-60 Jahre,Verheiratet,Hausfrau,medium_calorie,497
79,rating-4,weiblich,<30 Jahre,,schülerin,medium_calorie,497


In [28]:
upsampled.reset_index(inplace = True)

In [29]:
# add the dummyset into the upsampled dataset
upsampled_dum = pd.get_dummies(upsampled['calorie_level'])
upsampled_dum.reset_index(inplace = True)

In [30]:
upsampled_new = upsampled.join(upsampled_dum, how='left')
# upsampled_new = upsampled_new.drop('calorie_level',1)
upsampled_new

Unnamed: 0,recipe_id,rating,sex,age_group,marriage_status,job,calorie_level,calorie_value,index,high_calorie,low_calorie,medium_calorie
0,43,rating-4,weiblich,30-40 Jahre,,,high_calorie,564,0,1,0,0
1,43,rating-5,weiblich,50-60 Jahre,Verheiratet,,high_calorie,564,1,1,0,0
2,61,rating-5,weiblich,40-50 Jahre,,,high_calorie,786,2,1,0,0
3,79,rating-4,weiblich,50-60 Jahre,Verheiratet,Hausfrau,medium_calorie,497,3,0,0,1
4,79,rating-4,weiblich,<30 Jahre,,schülerin,medium_calorie,497,4,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
631,2673,rating-4,männlich,60+ Jahre,,,high_calorie,5964,631,1,0,0
632,6102,rating-4,männlich,50-60 Jahre,Verheiratet,Metzger,low_calorie,259,632,0,1,0
633,5369,rating-5,männlich,50-60 Jahre,Verheiratet,,low_calorie,80,633,0,1,0
634,7799,rating-5,männlich,50-60 Jahre,Verheiratet,Pensionär,medium_calorie,448,634,0,0,1


In [31]:
# undersampling
# downsample majority
female_downsampled = resample(female,
                                replace = False, # sample without replacement
                                n_samples = len(male), # match minority n
                                random_state = 27) # reproducible results

# combine minority and downsampled majority
downsampled = pd.concat([female_downsampled, male])
downsampled

Unnamed: 0_level_0,rating,sex,age_group,marriage_status,job,calorie_level,calorie_value
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6499,rating-4,weiblich,50-60 Jahre,Verheiratet,Freiberufliche Übersetzerin,medium_calorie,363
5500,rating-5,weiblich,<30 Jahre,,,medium_calorie,461
8366,rating-4,weiblich,40-50 Jahre,Vergeben,Bürostuhlakrobat,high_calorie,959
79,rating-4,weiblich,50-60 Jahre,Verheiratet,Hausfrau,medium_calorie,497
8486,rating-5,weiblich,60+ Jahre,Verheiratet,Mitarbeiterin bei Chefkoch.de - Community Cont...,high_calorie,2422
...,...,...,...,...,...,...,...
8252,rating-4,männlich,30-40 Jahre,,,low_calorie,275
8376,rating-4,männlich,40-50 Jahre,Vergeben,,high_calorie,692
8377,rating-4,männlich,60+ Jahre,,Rentner,medium_calorie,413
8419,rating-5,männlich,40-50 Jahre,Verheiratet,,high_calorie,639


In [32]:
# add the dummyset into the downsampled dataset
downsampled_dum = pd.get_dummies(downsampled['calorie_level'])
# downsampled_dum.reset_index(inplace = True)
# downsampled.reset_index(inplace = True)
downsampled_new = downsampled.join(downsampled_dum, how='left')
# downsampled_new = downsampled_new.drop('calorie_level',1)
# downsampled_new['sex'].value_counts()
downsampled_new

Unnamed: 0_level_0,rating,sex,age_group,marriage_status,job,calorie_level,calorie_value,high_calorie,low_calorie,medium_calorie
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
79,rating-4,weiblich,50-60 Jahre,Verheiratet,Hausfrau,medium_calorie,497,0,0,1
251,rating-3,weiblich,<30 Jahre,,,low_calorie,243,0,1,0
251,rating-3,weiblich,<30 Jahre,,,low_calorie,243,0,1,0
251,rating-4,weiblich,50-60 Jahre,,"Ja, sowas hab ich nicht mehr!",low_calorie,243,0,1,0
251,rating-4,weiblich,50-60 Jahre,,"Ja, sowas hab ich nicht mehr!",low_calorie,243,0,1,0
...,...,...,...,...,...,...,...,...,...,...
8471,rating-3,weiblich,30-40 Jahre,,,high_calorie,2099,1,0,0
8471,rating-3,weiblich,30-40 Jahre,,,high_calorie,2099,1,0,0
8471,rating-5,männlich,<30 Jahre,Single,Kochazubi,high_calorie,2099,1,0,0
8471,rating-5,männlich,<30 Jahre,Single,Kochazubi,high_calorie,2099,1,0,0


#### subgroup discoverary

In [33]:
import pysubgroup as ps

target = ps.BinaryTarget('high_calorie', True)
print(target)
searchspace = ps.create_selectors(downsampled_new, ignore=['high_calorie','medium_calorie','low_calorie','rating','job','recipe_id'])
task = ps.SubgroupDiscoveryTask (
    downsampled_new, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=2, 
    qf=ps.WRAccQF())
result = ps.GeneralisingBFS().execute(task)
# result = ps.BestFirstSearch().execute(task)
# pd.set_option('display.max_rows'None)
pd.set_option('max_colwidth',100)
print(result.to_dataframe())

T: high_calorie==True
0.19083272461650838 calorie_level=='high_calorie' OR calorie_value=='1347'
0.19083272461650838 calorie_level=='high_calorie' OR calorie_value=='1426'
0.19083272461650838 calorie_level=='high_calorie' OR calorie_value=='2099'
0.19083272461650838 calorie_level=='high_calorie'
0.19083272461650838 calorie_level=='high_calorie' OR calorie_value=='2422'
discarded [0, 90, 15, 0, 0, 0, 0]
    quality                                             description
0  0.190833  calorie_level=='high_calorie' OR calorie_value=='1347'
1  0.190833  calorie_level=='high_calorie' OR calorie_value=='1426'
2  0.190833  calorie_level=='high_calorie' OR calorie_value=='2099'
3  0.190833                           calorie_level=='high_calorie'
4  0.190833  calorie_level=='high_calorie' OR calorie_value=='2422'


In [34]:
downsampled_new['calorie_value']
target = ps.NumericTarget(downsampled_new['calorie_value'])
searchspace = ps.create_selectors(downsampled_new, ignore=['high_calorie','medium_calorie','low_calorie','rating','job','recipe_id','calorie_value'])
task = ps.SubgroupDiscoveryTask (
    data=downsampled_new, 
    target=target, 
    search_space=searchspace, 
    result_set_size=5, 
    depth=3, 
    qf=ps.WRAccQF())
# result = ps.GeneralisingBFS().execute(task)
result = ps.DFSNumeric().execute(task)
# pd.set_option('display.max_rows'None)
# pd.set_option('max_colwidth',100)
# print(result.to_dataframe())




KeyError: recipe_id
79       497
251      243
251      243
251      243
251      243
        ... 
8471    2099
8471    2099
8471    2099
8471    2099
8486    2422
Name: calorie_value, Length: 148, dtype: object

In [35]:
result.to_dataframe()

Unnamed: 0,quality,description
0,0.190833,calorie_level=='high_calorie' OR calorie_value=='1347'
1,0.190833,calorie_level=='high_calorie' OR calorie_value=='1426'
2,0.190833,calorie_level=='high_calorie' OR calorie_value=='2099'
3,0.190833,calorie_level=='high_calorie'
4,0.190833,calorie_level=='high_calorie' OR calorie_value=='2422'


In [52]:

# target = ps.NumericTarget(downsampled_new['calorie_value'])
target = ps.NumericTarget('calorie_value')
searchspace = ps.create_selectors(downsampled_new, ignore=['high_calorie','medium_calorie','low_calorie','rating','job','recipe_id','calorie_value'])
task = ps.SubgroupDiscoveryTask (
    data=downsampled_new, 
    target=target, 
    search_space=searchspace, 
    result_set_size=5, 
    depth=3, 
    qf=ps.WRAccQF())
# d = downsampled_new['calorie_value']
# task.data.sort_values(target.get_attributes(), ascending=False)
# target.get_attributes()
# task.data.sort_values(task.target.get_attributes(), ascending=False)
# target.get_attributes()
# result = ps.GeneralisingBFS().execute(task)
result = ps.DFSNumeric().execute(task)
# pd.set_option('display.max_rows'None)
# pd.set_option('max_colwidth',100)
# print(result.to_dataframe())


AssertionError: 

### Association rules among the recipe attributes

- the association with "ingredient" and "regions"

In [21]:
countries = ['china', 'indien', 'deutschland','frankreich','grossbritannien','österreich','usaoderkanada','italien','spanien',
             'portugal', 'japen','schweiz','türkei', 'thailand', 'russland', 'großbritannien & irland', 'vietnam', 'korea',
            'australien', 'ägypten', 'marokko', 'niederlande']

In [22]:
df_certain_countries = get_recipe_countries(countries, data)
df_certain_countries.reset_index(inplace = True)
df_certain_countries['label'].value_counts()

deutschland      4288
italien          3035
usaoderkanada    1410
österreich       1062
frankreich       1023
spanien           675
china             667
indien            628
thailand          559
türkei            501
portugal          357
schweiz           341
marokko           174
russland          162
niederlande       108
australien         93
korea              65
vietnam            56
ägypten            51
Name: label, dtype: int64

In [23]:
# drop all the columns other than "ingredient" and "label" column
columns_drop = ['index', 'categorize', 'recipe_name', 'tags', 'avg_score', 'difficulty','rating_count', 'calorie', 
                'preparation_time','comment_user', 'recipe_url']
df_certain_countries = df_certain_countries.drop(columns_drop, axis = 1)

# preprocess the ingredient column
distinct_ingredients = ingredients_preprocess(df_certain_countries)

# One hot encoding of the ingredients
df_certain_countries['ingredient'] = df_certain_countries.ingredient.apply(convert_to_dict)
vectorizer = DictVectorizer(sparse=False)
new_df_countries = pd.DataFrame(data = vectorizer.fit_transform(df_certain_countries.ingredient.tolist()), columns = distinct_ingredients)
new_df_countries['label'] = df_certain_countries.label

# dummy for the label column
new_df_countries = pd.get_dummies(new_df_countries)

In [24]:
# apply 
new_df_countries

Unnamed: 0,bitterschokoladeoderhalbbitterkuvertüre,blütenhonig,anissamen,weißweinzumablöschen,krautsalat,butterschmalzzumanbraten,steakausderlende,crémefraîcheoderschmand,sauerkrautodereinegroßedose,fontinaoderbelpaese,...,label_portugal,label_russland,label_schweiz,label_spanien,label_thailand,label_türkei,label_usaoderkanada,label_vietnam,label_ägypten,label_österreich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
15251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
15252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
15253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0


In [25]:
test = new_df_countries.iloc[:,]
test = test.iloc[:4]

In [26]:
country_label = list(new_df_countries.iloc[:,-19:].columns)
#contry_label.remove('label_deutschland')

# record the start time
time1 = time.time()
        
# apply subgroup discovery 
target = ps.BinaryTarget ('label_usaoderkanada', True)
searchspace = ps.create_selectors(test, ignore = country_label)
task = ps.SubgroupDiscoveryTask (
    data, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=20, 
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

# record the end time
time2 = time.time()
time_diff = (time2-time1)/60
print('it took ' + str(time_diff) + 'miniutes to execute the subgroup disc')

KeyError: 'label_usaoderkanada'