In [390]:
# the library used
import numpy as np
import pandas as pd
import pysubgroup as ps
import re
from sklearn.feature_extraction import DictVectorizer
import time
import ast
import string 
import imblearn
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression



## Helper Function

In [344]:
def tags_preprocess(tags):
    """
    Turn one raw tag value into a list of lower-case tags.

    input: tag value (converted with str() first, so any type is accepted)
    output: list of lower-case tokens obtained by splitting on single spaces
    """
    # split(" ") (not split()) is kept on purpose: consecutive spaces yield
    # empty-string tokens, exactly as before.
    tokens = str(tags).split(" ")
    return list(map(str.lower, tokens))

In [421]:
def ingredients_preprocess(df):
    """
    Normalise the 'ingredient' column and collect the distinct ingredient names.

    input: dataframe with an 'ingredient' column (every cell is converted with str())
    output: [list of distinct ingredients, dataframe]; the dataframe is the SAME object
            that was passed in, with its 'ingredient' column replaced by lists of
            lower-case ingredient tokens
    function: strip quotes, square brackets and parenthesised text, split on single
              spaces and keep only tokens that start with a capital letter followed by
              at least one more letter

    The cleaned lists are written back through the frame's own index, so the function
    also works when the index is not 0..n-1 (the previous version read rows by
    position but wrote them by label).
    """
    # re.match only anchors at the start, so trailing characters (e.g. a comma)
    # stay part of the token.
    ingredient_token = re.compile('[A-Z]{1}[a-zA-Z]+')
    dataframe = df

    distinct_ingredients = []
    cleaned_rows = []
    for raw in dataframe['ingredient']:
        text = str(raw)
        text = text.replace("'", "").replace("[", "").replace("]", "")
        # remove text inside parentheses
        text = re.sub(r'\([^())]*\)', "", text)
        tokens = [tok.lower() for tok in text.split(" ") if ingredient_token.match(tok)]
        distinct_ingredients += tokens
        cleaned_rows.append(tokens)

    # Replace the whole column at once; an object Series keeps each list as one cell.
    dataframe['ingredient'] = pd.Series(cleaned_rows, index=dataframe.index, dtype=object)
    return [list(set(distinct_ingredients)), dataframe]

In [185]:
def get_recipe_countries(countries, data):
    """
    input: list of countries, dataframe with a "tags" column
    output: new dataframe holding only the recipes that carry exactly one of the
            given country tags, with that country stored in a 'label' column
    function: rows whose tags contain none, or more than one, of the countries are
              dropped

    Rows are selected by position, so the result is correct for any index (the
    previous version looked rows up by label but dropped them by position). The
    input frame is no longer modified; the 'label' column is added to the returned
    copy only.
    """
    keep_positions = []
    labels = []
    for position, raw_tags in enumerate(data["tags"]):
        tags = tags_preprocess(raw_tags)
        matches = [country for country in countries if country in tags]
        # keep only unambiguous recipes: exactly one matching country tag
        if len(matches) == 1:
            keep_positions.append(position)
            labels.append(matches[0])

    selected = data.iloc[keep_positions].copy()
    selected['label'] = labels
    return selected

In [186]:
def convert_to_dict(arr):
    """
    Helper function: map every element of an iterable to 1, e.g. ['a', 'b'] -> {'a': 1, 'b': 1}.
    Duplicates collapse into a single key.
    """
    return dict.fromkeys(arr, 1)

In [187]:
# extract comment user dataset from original dataset
def extract_com_user(data):
    """
    input: dataframe with a 'comment_user' column; each cell is a string holding a
           Python literal (a list of dicts), or one of the placeholders '[]' /
           'no comment'
    output: a new dataframe with one row per comment and an extra 'recipe_id' column
            holding the index label of the recipe the comment belongs to
    function: split the list of dictionaries in the column 'comment_user'

    Returns an empty dataframe when no row contains comments.
    """
    frames = []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for recipe_id, item in data['comment_user'].items():
        if item == '[]' or item == 'no comment':
            continue
        # literal_eval only parses literals, so the cell content is never executed
        comments = pd.DataFrame(ast.literal_eval(item))
        comments['recipe_id'] = recipe_id
        frames.append(comments)

    if not frames:
        return pd.DataFrame()
    # concatenate once instead of growing a frame inside the loop
    return pd.concat(frames)

In [188]:
def sub_cat_in_com(data):
    """
    input: dataframe with a 'calorie_value' column of strings
    output: a new dataframe with the columns 0, 'calorie_value' and 'recipe_id'
    function: strip punctuation from every calorie value and pair it with the index
              label (recipe id) of its row

    Column 0 duplicates 'calorie_value'; it is kept only so the returned frame has
    the same columns as before. Rows whose value is None are skipped together with
    their recipe id, so values and ids always stay aligned.
    """
    punct = set(string.punctuation)
    recipe_ids = []
    cleaned_values = []

    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for recipe_id, item in data['calorie_value'].items():
        if item is None:
            continue
        cleaned_values.append(''.join(ch for ch in item if ch not in punct))
        recipe_ids.append(recipe_id)

    df_sub_cat = pd.DataFrame(cleaned_values)
    df_sub_cat['calorie_value'] = cleaned_values
    df_sub_cat['recipe_id'] = recipe_ids
    return df_sub_cat

In [189]:
def _strip_punctuation_column(data, column):
    """Return data[column] with punctuation removed from every value, indexed by 'recipe_id'."""
    punct = set(string.punctuation)
    recipe_ids = []
    cleaned_values = []
    # Series.items() replaces iteritems(), which was removed in pandas 2.0.
    for recipe_id, item in data[column].items():
        if item is None:
            # skip the id together with the value so both lists stay aligned
            continue
        cleaned_values.append(''.join(ch for ch in item if ch not in punct))
        recipe_ids.append(recipe_id)
    return pd.Series(cleaned_values,
                     index=pd.Index(recipe_ids, name='recipe_id'),
                     dtype=object)


def add_recipe_info(data):
    """
    input: dataframe indexed by recipe id with the string columns 'recipe_name',
           'difficulty', 'preparation_time', 'tags' and 'ingredient'
    output: a new dataframe with those five columns, punctuation removed from every
            value, indexed by 'recipe_id'
    function: prepare the recipe attributes that are later merged into the comment
              user dataset

    The rows of the result follow the 'recipe_name' column; the other columns are
    aligned to it by recipe id.
    """
    recipe_columns = ['recipe_name', 'difficulty', 'preparation_time', 'tags', 'ingredient']

    df_recipe = pd.DataFrame()
    for column in recipe_columns:
        # the first assignment defines the index, later ones are aligned to it
        df_recipe[column] = _strip_punctuation_column(data, column)
    # make sure the index can be used as the 'recipe_id' merge key
    return df_recipe.rename_axis('recipe_id')

In [190]:
def age_group(age):
    """
    input: age value (anything int() accepts)
    output: group description
    function: divide the age value into 5 groups

    Boundaries: below 30, 30-39, 40-49, 50-60, 61 and above.
    """
    years = int(age)
    if years < 30:
        return '<30 Jahre'
    if years < 40:
        return '30-40 Jahre'
    if years < 50:
        return '40-50 Jahre'
    if years <= 60:
        return '50-60 Jahre'
    return '60+ Jahre'

In [191]:
def calorie_level(calorie):
    """
    input: calorie value (anything int() accepts)
    output: group description
    function: divide the calorie value into 3 groups

    Boundaries: below 300 is low, 300-499 is medium, 500 and above is high.
    """
    kcal = int(calorie)
    if kcal < 300:
        return 'low_calorie'
    if kcal < 500:
        return 'medium_calorie'
    return 'high_calorie'

In [192]:
def remove_None(data, name):
    """
    Helper function: return a copy of `data` without the rows whose column `name`
    holds the string 'None'. Rows are dropped by index label.
    """
    placeholder_rows = data[data[name] == 'None']
    return data.drop(index=placeholder_rows.index.tolist())

In [193]:
def add_target(data, calorie_level):
    """
    Copy the column df_dum_car[calorie_level] into df_sub_group[country] and return
    df_sub_group.

    NOTE(review): the `data` argument is never used. `df_sub_group`, `country` and
    `df_dum_car` are not parameters; they are read from the notebook's global
    namespace. `df_sub_group` and `country` are not defined in the visible part of
    the notebook, so a call presumably raises NameError -- TODO confirm whether this
    helper is still needed. The parameter `calorie_level` also shadows the helper
    function of the same name inside this function.
    """
    df_sub_group[country] = df_dum_car[calorie_level]
    return df_sub_group

## Subgroup Discovery

In this section we will use subgroup discovery to explore the association rules between attributes

- Why did we choose subgroup discovery?

Because subgroup discovery is quite powerful compared to other data mining techniques. By setting different targets with different search spaces, we can use it to dig out almost any interesting pattern that we want to explore in the dataset.


In [215]:
# read the data
# NOTE: absolute, machine-specific path; adjust it when running the notebook elsewhere
ALL_DATA_CSV = "/Users/xujingjing/Desktop/2020 Sommer/praktikum/Data/all_data.csv"
data = pd.read_csv(ALL_DATA_CSV)

In [216]:
# extract the recipe id from recipe urls: it is the fifth '/'-separated field
list_cat_no = [url.split('/')[4] for url in data['recipe_url']]

# add one column "recipe_id" into the dataset and set it as the index of dataset
data['recipe_id'] = list_cat_no
data = data.set_index(["recipe_id"])

## Association rules between comment user information and recipe attributes

- data imbalance: gender
- improve the quality of the data (preprocessing)
- sampling 
- improve the quality of the result: different search algorithms and selector choices
- if the quality is still very low, plot the t-SNE

- associations with "calorie" and comment user information

Since more than half of the job values are missing, we only explore the association of marriage_status, gender, and age with the recipe calorie here.

In [217]:
# extract the numerical value string in the column calorie
# (first decimal or integer number found in each cell)
pat = r"([-+]?\d*\.\d+|\d+)"
calorie_numbers = data["calorie"].str.extract(pat, flags=0, expand=True)
data["calorie_value"] = calorie_numbers

# keep only the rows that have both a comment_user entry and a calorie_value
required_columns = ["comment_user", 'calorie_value']
data_com = data.dropna(subset=required_columns)
len(data_com)

19058

In [218]:
# expand the corresponding recipe data for each comment_user
df_com_1 = extract_com_user(data_com)   # one row per comment
df_com_2 = sub_cat_in_com(data_com)     # calorie value per recipe
df_com_3 = add_recipe_info(data_com)    # recipe attributes per recipe

# left-join the recipe level information onto the comment rows via recipe_id
df_com_new = (
    df_com_1
    .merge(df_com_2, on='recipe_id', how='left')
    .merge(df_com_3, on='recipe_id', how='left')
)

comment_columns = ['recipe_id','recipe_name','tags','difficulty','preparation_time','ingredient','name','rating','sex','age','marriage_status','comment_time','calorie_value']
df_com_new = df_com_new[comment_columns].set_index(["recipe_id"])
df_com_new.head()

Unnamed: 0_level_0,recipe_name,tags,difficulty,preparation_time,ingredient,name,rating,sex,age,marriage_status,comment_time,calorie_value
recipe_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1883831306402031,Aufgehende Sonne,Asien Hauptspeise Nudeln Beilage Geflügel Wok Thailand,normal,30 Min,500g NudelnMieReisnudelnoderReis 400g Geflügelfleischgeschnetzelt 1 Ananas 2 OrangenoderBlutoran...,Haubndaucher,rating-5,männlich,46 Jahre,Verheiratet,10.08.2017 14:56,500
1883831306402031,Aufgehende Sonne,Asien Hauptspeise Nudeln Beilage Geflügel Wok Thailand,normal,30 Min,500g NudelnMieReisnudelnoderReis 400g Geflügelfleischgeschnetzelt 1 Ananas 2 OrangenoderBlutoran...,Lumakath,rating-4,weiblich,49 Jahre,Verheiratet,19.01.2013 13:19,500
1883831306402031,Aufgehende Sonne,Asien Hauptspeise Nudeln Beilage Geflügel Wok Thailand,normal,30 Min,500g NudelnMieReisnudelnoderReis 400g Geflügelfleischgeschnetzelt 1 Ananas 2 OrangenoderBlutoran...,esther76,rating-4,weiblich,,Verheiratet,21.10.2012 17:56,500
2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,t1n4a,rating-1,weiblich,25 Jahre,,13.01.2020 21:05,205
2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,patty89,rating-4,weiblich,,Verheiratet,03.09.2019 13:18,205


In [219]:
# remove none value in the whole data set:
# cells equal to the string 'None' are masked to NaN, then every row with a NaN is dropped
is_none_text = df_com_new.astype(object).eq('None')
df_com_no_none = df_com_new.mask(is_none_text).dropna()

# recipe_id becomes a regular column again and the rows get a fresh 0..n-1 index
df_com_new = df_com_no_none.reset_index()
df_com_new

Unnamed: 0,recipe_id,recipe_name,tags,difficulty,preparation_time,ingredient,name,rating,sex,age,marriage_status,comment_time,calorie_value
0,1883831306402031,Aufgehende Sonne,Asien Hauptspeise Nudeln Beilage Geflügel Wok Thailand,normal,30 Min,500g NudelnMieReisnudelnoderReis 400g Geflügelfleischgeschnetzelt 1 Ananas 2 OrangenoderBlutoran...,Haubndaucher,rating-5,männlich,46 Jahre,Verheiratet,10.08.2017 14:56,500
1,1883831306402031,Aufgehende Sonne,Asien Hauptspeise Nudeln Beilage Geflügel Wok Thailand,normal,30 Min,500g NudelnMieReisnudelnoderReis 400g Geflügelfleischgeschnetzelt 1 Ananas 2 OrangenoderBlutoran...,Lumakath,rating-4,weiblich,49 Jahre,Verheiratet,19.01.2013 13:19,500
2,2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,mackogreen,rating-5,weiblich,53 Jahre,Verheiratet,18.12.2017 07:03,205
3,2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,Tweetschekätzchen,rating-4,weiblich,28 Jahre,Vergeben,12.04.2017 09:55,205
4,2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,BlueLili,rating-5,weiblich,29 Jahre,Vergeben,11.06.2016 09:54,205
...,...,...,...,...,...,...,...,...,...,...,...,...,...
85068,865621192022922,Übergrillter Eisbergsalat nach Ille,Salat Gemüse Vegetarisch kalorienarm fettarm Snack EieroderKäse Überbacken Lactose LowCarb,simpel,15 Min,½ Eisbergsalat 2 Schalotten 2 Tomaten 1Kugel Mozzarellaevtlfettarm Tabasco 1Spritzer ÖlfürdieForm,Karlbig,rating-4,männlich,59 Jahre,Verheiratet,10.12.2008 17:30,511
85069,865621192022922,Übergrillter Eisbergsalat nach Ille,Salat Gemüse Vegetarisch kalorienarm fettarm Snack EieroderKäse Überbacken Lactose LowCarb,simpel,15 Min,½ Eisbergsalat 2 Schalotten 2 Tomaten 1Kugel Mozzarellaevtlfettarm Tabasco 1Spritzer ÖlfürdieForm,CJ85,rating-4,weiblich,35 Jahre,Verheiratet,07.06.2008 00:05,511
85070,865621192022922,Übergrillter Eisbergsalat nach Ille,Salat Gemüse Vegetarisch kalorienarm fettarm Snack EieroderKäse Überbacken Lactose LowCarb,simpel,15 Min,½ Eisbergsalat 2 Schalotten 2 Tomaten 1Kugel Mozzarellaevtlfettarm Tabasco 1Spritzer ÖlfürdieForm,motscheküpchen,rating-4,weiblich,39 Jahre,Verheiratet,16.01.2008 08:09,511
85071,3148171468749120,Übernacht Weizenbrötchen,Backen Vegetarisch Frühstück BrotoderBrötchen Gluten Lactose,normal,30 Min,500g WeizenmehlType550 300ml Wasser 1ELgehäuft Butterweiche 2TLgestr Salz 5g Hefefrisch Sesamode...,omaskröte,rating-5,weiblich,64 Jahre,Verheiratet,18.01.2018 12:08,493


In [221]:
# add calorie level and age group columns in the comment user information
# calorie_level() converts each calorie_value with int() and maps it to a low/medium/high bucket
df_com_new['calorie_level'] = df_com_new['calorie_value'].apply(calorie_level)
# one indicator column per calorie level, on the same index as df_com_new
df_dum_car = pd.get_dummies(df_com_new['calorie_level'])
# NOTE: plain assignment, not a copy -- df_com_age and df_com_new are the same object,
# so the two columns added below also appear in df_com_new (and therefore in df_com_car)
df_com_age = df_com_new
# reuse the number regex `pat` to pull the numeric part out of strings such as "46 Jahre"
df_com_age["age_value"] = df_com_age["age"].str.extract(pat, flags=0, expand=True)
df_com_age['age_group'] = df_com_age['age_value'].apply(age_group)
# attach the calorie level indicator columns row by row (join on the index)
df_com_car = df_com_new.join(df_dum_car, how='left')

In [242]:
# subgroup discovery on the full comment data with 'high_calorie' as the binary target;
# pysubgroup is already imported as `ps` in the import cell at the top
calorie_dummy_columns = ['high_calorie', 'low_calorie', 'medium_calorie']
data = df_com_car[['sex', 'age_group', 'high_calorie', 'low_calorie', 'medium_calorie','marriage_status']]
target = ps.BinaryTarget('high_calorie', True)
# the indicator columns are excluded from the selectors
searchspace = ps.create_selectors(data, ignore=calorie_dummy_columns)
quality_function = ps.WRAccQF()
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=quality_function,
)
result = ps.BeamSearch().execute(task)
pd.set_option('max_colwidth',100)
print(result.to_dataframe())

    quality                                                  description
0  0.004155                                     age_group=='30-40 Jahre'
1  0.004053                                     age_group=='40-50 Jahre'
2  0.003791                 age_group=='40-50 Jahre' AND sex=='weiblich'
3  0.003189                 age_group=='30-40 Jahre' AND sex=='weiblich'
4  0.003122  age_group=='40-50 Jahre' AND marriage_status=='Verheiratet'


In [250]:
# number of comment rows per calorie level (class balance of the target)
df_com_new['calorie_level'].value_counts()

high_calorie      46732
medium_calorie    19498
low_calorie       18843
Name: calorie_level, dtype: int64

#### Deal with imbalanced data

The "weiblich" : "männlich" ratio is around 5:1, so the data is quite imbalanced with regard to sex. We resample the dataset to see whether this improves the data quality and, with it, the quality of the subgroup discovery.

- upsampling

In [254]:
# NOTE: plain assignment, not a copy -- df_sampling is the same object as df_com_age
df_sampling = df_com_age
#df_sampling = df_com_age[['rating','sex','age_group','marriage_status','calorie_level','calorie_value']]

# oversampling
# Separate input features and target
y = df_sampling['calorie_level']
X = df_sampling.drop('calorie_level', axis=1)

# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# concatenate our training data back together
# NOTE(review): X, X_train, X_test, y_train and y_test are not used by the resampling
# below, which works on the full df_sampling -- TODO confirm whether the split should
# be applied before resampling or removed
X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes (taken from the full data set, not from X)
female = df_sampling[df_sampling['sex'] == 'weiblich']
male = df_sampling[df_sampling['sex'] == 'männlich']

# upsample minority
male_upsampled = resample(male,
                          replace=True, # sample with replacement
                          n_samples=len(female), # match number in majority class
                          random_state=27) # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([female, male_upsampled])
# in-place reset: the old row labels are kept in a new 'index' column,
# which the next cell drops again
upsampled.reset_index(inplace = True)
# upsampled['sex'].value_counts()
#upsampled.head()

In [255]:
# add the dummyset into the upsampled dataset
# one indicator column per calorie level, renumbered 0..n-1
upsampled_dum = pd.get_dummies(upsampled['calorie_level']).reset_index(drop=True)

# prepare for the binary target columns:
# remove the 'index' column, then attach the indicator columns row by row
upsampled = upsampled.drop(columns='index')
upsampled_new = upsampled.join(upsampled_dum, how='left')
upsampled_new

Unnamed: 0,recipe_id,recipe_name,tags,difficulty,preparation_time,ingredient,name,rating,sex,age,marriage_status,comment_time,calorie_value,calorie_level,age_value,age_group,high_calorie,low_calorie,medium_calorie
0,1883831306402031,Aufgehende Sonne,Asien Hauptspeise Nudeln Beilage Geflügel Wok Thailand,normal,30 Min,500g NudelnMieReisnudelnoderReis 400g Geflügelfleischgeschnetzelt 1 Ananas 2 OrangenoderBlutoran...,Lumakath,rating-4,weiblich,49 Jahre,Verheiratet,19.01.2013 13:19,500,high_calorie,49,40-50 Jahre,1,0,0
1,2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,mackogreen,rating-5,weiblich,53 Jahre,Verheiratet,18.12.2017 07:03,205,low_calorie,53,50-60 Jahre,0,1,0
2,2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,Tweetschekätzchen,rating-4,weiblich,28 Jahre,Vergeben,12.04.2017 09:55,205,low_calorie,28,<30 Jahre,0,1,0
3,2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,BlueLili,rating-5,weiblich,29 Jahre,Vergeben,11.06.2016 09:54,205,low_calorie,29,<30 Jahre,0,1,0
4,2046401331198519,Bleib gesundSmoothie SuperNova,Getränk Vegetarisch Frühstück kalorienarm Vegan Frucht Shake Paleo,simpel,5 Min,1mgroße Karotten 1 Apfel 1Stücke Ingwerwurzelfrischhaselnussgroß 1 OrangenSaftdavon ½ ZitronenSa...,MrsKsusha,rating-5,weiblich,27 Jahre,Vergeben,30.03.2016 11:31,205,low_calorie,27,<30 Jahre,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145373,2115461340796639,Warm Melting Chocolate Cake,Backen Vegetarisch Dessert Kuchen Festlich Eis Gluten Lactose,normal,10 Min,200g SchokoladehalbbitterealternativzartbitterundVollmilchgemixt 125g Butter 6EL Zucker 50g Mehl...,woodroof,rating-4,männlich,50 Jahre,Verheiratet,29.12.2015 16:59,486,medium_calorie,50,50-60 Jahre,0,0,1
145374,1693671277713547,Zimt Quark Sahne Torte,Backen Vegetarisch Schnell einfach Winter Torte Weihnachten Gluten Lactose,normal,30 Min,110g Butter 250g SpekulatiusGewürzspekulatiusmitoderohneMandeln 500g Magerquark 400g Frischkäsem...,Krefelder114,rating-4,männlich,52 Jahre,Verheiratet,09.02.2014 21:13,4292,high_calorie,52,50-60 Jahre,1,0,0
145375,1321161236960435,Weich gekochtes Ei Spezialrezept,Vegetarisch einfach Frühstück Basisrezepte Kinder Eier Ostern gekocht Studentenküche Paleo ketog...,simpel,6 Min,1mgroßes EiervomHuhnweißoderbraun Wasser 1Prisen Salz,gorzitze,rating-5,männlich,43 Jahre,Verheiratet,02.01.2015 06:58,75,low_calorie,43,40-50 Jahre,0,1,0
145376,190301081068391,Bagels Grundrezept,Backen Vegetarisch USAoderKanada Frühstück Snack Basisrezepte BrotoderBrötchen Gluten Lactose,normal,45 Min,330ml Milch 50g Butter 1¼Pck HefeTrockenhefe 1TL Zucker 660g Mehl 1½TL Salz 2 Eier 2EL Zuckerfür...,MitternachtsLoewe,rating-5,männlich,35 Jahre,Vergeben,25.03.2009 14:21,266,low_calorie,35,30-40 Jahre,0,1,0


In [256]:
# same subgroup discovery as on the full data, now on the upsampled data set
calorie_dummy_columns = ['high_calorie', 'low_calorie', 'medium_calorie']
data = upsampled_new[['sex', 'age_group', 'high_calorie', 'low_calorie', 'medium_calorie','marriage_status']]
target = ps.BinaryTarget('high_calorie', True)
# the indicator columns are excluded from the selectors
searchspace = ps.create_selectors(data, ignore=calorie_dummy_columns)
quality_function = ps.WRAccQF()
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=quality_function,
)
result = ps.BeamSearch().execute(task)
pd.set_option('max_colwidth',100)
print(result.to_dataframe())

    quality                                                  description
0  0.004642                                     age_group=='30-40 Jahre'
1  0.003206                 age_group=='30-40 Jahre' AND sex=='männlich'
2  0.003035  age_group=='40-50 Jahre' AND marriage_status=='Verheiratet'
3  0.002619                                     age_group=='40-50 Jahre'
4  0.001931                 age_group=='40-50 Jahre' AND sex=='weiblich'


- undersampling

In [367]:
# downsample majority: draw as many rows from `female` (without replacement)
# as there are rows in `male`; the fixed random_state makes the draw reproducible
female_downsampled = resample(
    female,
    replace=False,
    n_samples=len(male),
    random_state=27,
)

# combine minority and downsampled majority
downsampled = pd.concat([female_downsampled, male])

# add the dummyset into the downsampled dataset (joined on the original row labels)
downsampled_dum = pd.get_dummies(downsampled['calorie_level'])
downsampled_new = downsampled.join(downsampled_dum, how='left')
downsampled_new

Unnamed: 0,recipe_id,recipe_name,tags,difficulty,preparation_time,ingredient,name,rating,sex,age,marriage_status,comment_time,calorie_value,calorie_level,age_value,age_group,high_calorie,low_calorie,medium_calorie
8029,2620971411655164,Bandnudeln an PaprikaRahmSoße mit Hüttenkäse,Gemüse Hauptspeise Nudeln Sommer Pasta Vegetarisch Saucen Schnell einfach gekocht Gluten Lactose,normal,15 Min,1 Paprikaschotenrotodergelb 2 Zwiebeln 2TL Paprikapulveredelsüß 200g saureSahne 125g Hüttenkäse ...,MonCheri2,rating-4,weiblich,59 Jahre,Verheiratet,30.07.2017 12:59,715,high_calorie,59,50-60 Jahre,1,0,0
51447,1542651260601744,Megaschokoladige Schokokekse,Backen Vegetarisch USAoderKanada Weihnachten Kekse Gluten Lactose,simpel,20 Min,175g MehlType405 1EL Backpulver 125g Butter 150g Zucker 1 Eier 200g SchokoladeZartbitter,Fronsösisch_Gurrrrmeh,rating-5,weiblich,38 Jahre,Verheiratet,17.11.2013 13:33,3311,high_calorie,38,30-40 Jahre,1,0,0
5988,2580521404745111,Armer Ritter gerollt mit Erdbeeren,Vegetarisch Brotspeise Schnell einfach Süßspeise Dessert Kinder Frucht Resteverwertung Gluten La...,normal,10 Min,4Scheiben Toastbrot 6 Erdbeeren 1mgroßes Eier 3EL Milch 1TL Zimtpulver 4TL Zucker etwas Butter 4...,schaech001,rating-5,weiblich,69 Jahre,Verwitwet,16.05.2017 15:10,710,high_calorie,69,60+ Jahre,1,0,0
46494,1477681253019979,Kürbis Gorgonzola Sauce,Gemüse Nudeln Pasta Vegetarisch Saucen Dips raffiniertoderpreiswert Winter Herbst gekocht,normal,35 Min,150g KürbisseHokkaido 1 Schalotten 2 Knoblauchzehen etwas Curry 250ml Gemüsebrühe etwas Öl einig...,soni30,rating-4,weiblich,45 Jahre,Vergeben,25.10.2015 19:35,200,low_calorie,45,40-50 Jahre,0,1,0
23971,236841096461655,Fenchel Schinken Auflauf,Fleisch Gemüse Hauptspeise Auflauf Winter Herbst Gluten Lactose LowCarb,normal,20 Min,1kg Fenchel 4EL Zitronensaft Salz 80g Butter 30g Mehl ¼Liter Brühe ¼Liter Sahne 1Bund Petersilie...,schneckoline,rating-4,weiblich,42 Jahre,Vergeben,30.04.2013 22:37,572,high_calorie,42,40-50 Jahre,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85045,865621192022922,Übergrillter Eisbergsalat nach Ille,Salat Gemüse Vegetarisch kalorienarm fettarm Snack EieroderKäse Überbacken Lactose LowCarb,simpel,15 Min,½ Eisbergsalat 2 Schalotten 2 Tomaten 1Kugel Mozzarellaevtlfettarm Tabasco 1Spritzer ÖlfürdieForm,Lurato,rating-5,männlich,61 Jahre,Verheiratet,12.04.2013 21:13,511,high_calorie,61,60+ Jahre,1,0,0
85051,865621192022922,Übergrillter Eisbergsalat nach Ille,Salat Gemüse Vegetarisch kalorienarm fettarm Snack EieroderKäse Überbacken Lactose LowCarb,simpel,15 Min,½ Eisbergsalat 2 Schalotten 2 Tomaten 1Kugel Mozzarellaevtlfettarm Tabasco 1Spritzer ÖlfürdieForm,yogi100,rating-4,männlich,59 Jahre,Single,14.08.2012 23:56,511,high_calorie,59,50-60 Jahre,1,0,0
85058,865621192022922,Übergrillter Eisbergsalat nach Ille,Salat Gemüse Vegetarisch kalorienarm fettarm Snack EieroderKäse Überbacken Lactose LowCarb,simpel,15 Min,½ Eisbergsalat 2 Schalotten 2 Tomaten 1Kugel Mozzarellaevtlfettarm Tabasco 1Spritzer ÖlfürdieForm,Tickerix,rating-4,männlich,50 Jahre,Verheiratet,27.01.2011 16:04,511,high_calorie,50,50-60 Jahre,1,0,0
85064,865621192022922,Übergrillter Eisbergsalat nach Ille,Salat Gemüse Vegetarisch kalorienarm fettarm Snack EieroderKäse Überbacken Lactose LowCarb,simpel,15 Min,½ Eisbergsalat 2 Schalotten 2 Tomaten 1Kugel Mozzarellaevtlfettarm Tabasco 1Spritzer ÖlfürdieForm,sinus1,rating-5,männlich,68 Jahre,Verheiratet,16.08.2009 19:39,511,high_calorie,68,60+ Jahre,1,0,0


In [339]:
# same subgroup discovery as on the full data, now on the downsampled data set;
# pysubgroup is already imported as `ps` in the import cell at the top
calorie_dummy_columns = ['high_calorie', 'low_calorie', 'medium_calorie']
data = downsampled_new[['sex', 'age_group', 'high_calorie', 'low_calorie', 'medium_calorie','marriage_status']]
target = ps.BinaryTarget('high_calorie', True)
# the indicator columns are excluded from the selectors
searchspace = ps.create_selectors(data, ignore=calorie_dummy_columns)
quality_function = ps.WRAccQF()
task = ps.SubgroupDiscoveryTask(
    data,
    target,
    searchspace,
    result_set_size=5,
    depth=2,
    qf=quality_function,
)
result = ps.BeamSearch().execute(task)
pd.set_option('max_colwidth',100)
print(result.to_dataframe())

    quality                                                  description
0  0.005793                                     age_group=='30-40 Jahre'
1  0.003200     age_group=='30-40 Jahre' AND marriage_status=='Vergeben'
2  0.002959                 age_group=='30-40 Jahre' AND sex=='männlich'
3  0.002834                 age_group=='30-40 Jahre' AND sex=='weiblich'
4  0.002274  age_group=='40-50 Jahre' AND marriage_status=='Verheiratet'


#### Conclusion 

- the subgroup young single female seems to prefer the "low-calorie" recipes
- the subgroup married male seems to prefer the "high-calorie" recipes
- the subgroup divorced middle-age male seems to prefer the "medium-calorie" recipes

Our next step is then to discover why different calorie levels are preferred by these particular groups.

### Existing problems
- Here we remove all rows that contain NaN; would it make sense to fill the missing values with another method instead?
- the quality of downsampling is better than that of upsampling, but there are only 1040 rows left; is that enough?
- the quality is around 0.01, which is still low; how can we improve it?

### Discover why different calorie levels are preferred by these particular groups

- set the target as calorie level / subgroup, still use subgroup discovery

In [347]:
def get_distinct_tag(df):
    """
    Return the distinct tags found in df['tags'] as a list (in set order, i.e. unsorted).
    Every cell of the column is expected to be an iterable of tags.
    """
    seen = set()
    for row_tags in df['tags']:
        seen.update(row_tags)
    return list(seen)

- set target as subgroup, explore the tags of recipes, use multiclass-classifier

In [368]:
# renumber the rows 0..n-1 (the old row labels move into a new column)
downsampled_new = downsampled_new.reset_index()
# replace every tag string by its list of lower-case tags
downsampled_new['tags'] = downsampled_new['tags'].astype(object)
downsampled_new['tags'] = downsampled_new['tags'].map(tags_preprocess)

In [376]:
# prepare the data for multiclass classifier
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of downsampled_new (this caused the SettingWithCopyWarning)
df = downsampled_new[['tags', 'sex', 'age_group', 'marriage_status', 'calorie_level']].copy()
distinct_tags = get_distinct_tag(df)

# One hot encoding of the tags
df['tags'] = df.tags.apply(convert_to_dict)
vectorizer = DictVectorizer(sparse=False)
tag_matrix = vectorizer.fit_transform(df.tags.tolist())
# Label the columns with the vectorizer's own feature order. distinct_tags comes from
# a set and has an arbitrary order, so using it as column names mislabels the features.
new_df = pd.DataFrame(data = tag_matrix, columns = vectorizer.feature_names_)
new_df[['sex', 'age_group', 'marriage_status', 'calorie_level']] = df[['sex', 'age_group', 'marriage_status', 'calorie_level']]

# set the target value by combining 'sex', 'age_group', 'marriage_status'
new_df['combine'] = new_df[['sex', 'age_group', 'marriage_status']].values.tolist()
new_df['combine'] = new_df['combine'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [417]:
# multiclass classifier
# features: every column except the last five; target: the last column
clf_X = new_df.iloc[:, :-5]
y = new_df.iloc[:, -1]

clf = LogisticRegression(max_iter=1000).fit(clf_X, y)
y_pred = clf.predict(clf_X)

# accuracy, measured on the same rows the model was fitted on
print("Accuracy:",clf.score(clf_X, y))

Accuracy: 0.13319605943152454


In [408]:
def computeIDF(clf_X):
    """
    Return, per column, the number of rows divided by the column sum
    (NaN values are skipped when summing). No logarithm is applied.
    """
    column_totals = clf_X.sum(axis = 0, skipna = True)
    return len(clf_X) / column_totals

def computeTF(clf_X):
    """
    input: dataframe with one row per document and one column per term
    output: new dataframe of the same shape where every row is divided by its
            row total (rows that sum to 0 become NaN)
    function: compute the term frequency of every term per document

    The input frame is left untouched; the previous version added a temporary
    'sum' column to the caller's frame as a side effect.
    """
    row_totals = clf_X.sum(axis = 1, skipna = True)
    return clf_X.div(row_totals, axis=0)

In [409]:
# re-weight the feature matrix with tf-idf
# The idf weights are taken from the raw matrix, before the rows are normalised.
idf_row = computeIDF(clf_X)
# tf * idf, column-aligned; NaN cells are set to 0
clf_X = computeTF(clf_X).mul(idf_row).fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [410]:
# refit the multiclass classifier on the tf-idf weighted features
clf = LogisticRegression(max_iter=1000).fit(clf_X, y)
y_pred = clf.predict(clf_X)

# accuracy
print("Accuracy:",clf.score(clf_X, y))

Accuracy: 0.13061208010335917


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [418]:
# One column per class: the 10 features with the largest coefficients,
# strongest first. A class named 'overlap' is left out.
top_features = {}
for class_idx, class_name in enumerate(clf.classes_):
    if class_name == 'overlap':
        continue
    strongest = np.argsort(-clf.coef_[class_idx])[:10]
    top_features[class_name] = [vectorizer.feature_names_[ing_id] for ing_id in strongest]
df = pd.DataFrame(top_features)
df

Unnamed: 0,"['männlich', '30-40 Jahre', 'Geschieden']","['männlich', '30-40 Jahre', 'Single']","['männlich', '30-40 Jahre', 'Vergeben']","['männlich', '30-40 Jahre', 'Verheiratet']","['männlich', '40-50 Jahre', 'Geschieden']","['männlich', '40-50 Jahre', 'Single']","['männlich', '40-50 Jahre', 'Vergeben']","['männlich', '40-50 Jahre', 'Verheiratet']","['männlich', '40-50 Jahre', 'Verwitwet']","['männlich', '50-60 Jahre', 'Geschieden']",...,"['weiblich', '50-60 Jahre', 'Verwitwet']","['weiblich', '60+ Jahre', 'Geschieden']","['weiblich', '60+ Jahre', 'Single']","['weiblich', '60+ Jahre', 'Vergeben']","['weiblich', '60+ Jahre', 'Verheiratet']","['weiblich', '60+ Jahre', 'Verwitwet']","['weiblich', '<30 Jahre', 'Single']","['weiblich', '<30 Jahre', 'Vergeben']","['weiblich', '<30 Jahre', 'Verheiratet']","['weiblich', '<30 Jahre', 'Verwitwet']"
0,auflauf,usaoderkanada,crosscooking,grillen,spezial,ungarn,kaffeeteeoderkakao,südafrika,wok,warm,...,crosscooking,raffiniertoderpreiswert,eieroderkäse,großbritannien,japan,salatdressing,halloween,ungarn,eieroderkäse,frankreich
1,nudeln,pasta,afrika,reisodernudelsalat,marokko,innereien,klar,skandinavien,studentenküche,eintopf,...,eis,festlich,türkei,früchte,haltbarmachen,kartoffel,skandinavien,backenodersüßspeise,schweiz,vegan
2,frankreich,longdrink,punsch,marokko,backen,finnland,bowle,klöße,raffiniertoderpreiswert,früchte,...,geheimrezept,grillen,belgien,kinder,einlagen,torte,kekse,japan,marokko,brotoderbrötchen
3,dips,vietnam,russland,amerika,schweiz,konfiserie,gewürzeölessigpasten,australien,pasta,mittlererundnaherosten,...,basisrezepte,weihnachten,ernährungskonzepte,flambieren,klar,griechenland,russland,korea,türkei,europa
4,überbacken,großbritannien,silvester,flambieren,römertopf,thailand,schweden,spanien,nudeln,geheimrezept,...,marinieren,diabetiker,marinieren,raffiniertoderpreiswert,früchte,frittieren,diabetiker,nudeln,meeresfrüchte,frühstück
5,dünsten,reisgetreide,ungarn,kaffee,deutschland,norwegen,portugal,innereien,gemüse,gekocht,...,überbacken,usaoderkanada,kuchen,backen,torte,dessert,reisodernudelsalat,mexiko,ägypten,backen
6,raffiniertoderpreiswert,flambieren,polen,kakao,paleo,vietnam,eier,frittieren,schnell,shake,...,frühling,kuchen,osteuropa,fleisch,flambieren,beilage,mikrowelle,frankreich,backenodersüßspeise,gluten
7,tarte,krustentierodermuscheln,wild,tee,fleisch,krustentieroderfisch,wildgeflügel,lateinamerika,gluten,fleisch,...,festlich,deutschland,snack,türkei,fisch,camping,asien,mehlspeisen,finnland,vegetarisch
8,studentenküche,überbacken,usaoderkanada,punsch,suppe,meeresfrüchte,grundrezepte,crosscooking,einfach,kartoffeln,...,fisch,pasta,mittlererundnaherosten,norwegen,meeresfrüchte,brotoderbrötchen,dips,dips,ernährungskonzepte,korea
9,gebunden,mehlspeisen,asien,silvester,afrika,fondue,vietnam,vietnam,hauptspeise,kuchen,...,hauptspeise,winter,aufstrich,ketogen,informempfehlung,ungarn,dänemark,großbritannien,mehlspeisen,gewürzeölessigpasten


- set target as subgroup, explore the ingredients of recipes, use multiclass-classifier

In [424]:
# prepare the data for multi-class classifier
# ingredients_preprocess writes the cleaned ingredient lists back into the
# frame it receives, so it is given a copy: downsampled_new keeps its raw
# 'ingredient' values and this cell can be re-run.
distinct_ingredients, df_2 = ingredients_preprocess(downsampled_new.copy())

# One hot encoding of the ingredients
df_2['ingredient'] = df_2.ingredient.apply(convert_to_dict)
vectorizer = DictVectorizer(sparse=False)
# Label the columns with the vectorizer's own feature order. distinct_ingredients
# is derived from a set, so its order need not match the encoded matrix columns.
new_df_2 = pd.DataFrame(data = vectorizer.fit_transform(df_2.ingredient.tolist()),
                        index = df_2.index,
                        columns = vectorizer.feature_names_)
new_df_2[['sex', 'age_group', 'marriage_status', 'calorie_level']] = df_2[['sex', 'age_group', 'marriage_status', 'calorie_level']]

# set the target value by combining 'sex', 'age_group', 'marriage_status'
# (each row becomes the string form of the 3-element list)
new_df_2['combine'] = new_df_2[['sex', 'age_group', 'marriage_status']].values.tolist()
new_df_2['combine'] = new_df_2['combine'].astype(str)

In [427]:
# multiclass classifier: predict the demographic subgroup ('combine') from the ingredient features
# Select features/target by column name instead of by position (:-5 / -1), so
# the cell does not silently break if the column order of new_df_2 changes.
non_feature_cols = ['sex', 'age_group', 'marriage_status', 'calorie_level', 'combine']
clf_X = new_df_2.drop(columns=non_feature_cols)
y = new_df_2['combine']

clf = LogisticRegression(max_iter=1000)
clf.fit(clf_X, y)
y_pred = clf.predict(clf_X)

# accuracy (measured on the same data the model was fitted on, not on held-out data)
print("Accuracy:",clf.score(clf_X, y))

Accuracy: 0.3457283591731266


In [429]:
# One column per class: the 20 features with the largest coefficients,
# strongest first. A class named 'overlap' is left out.
top_features = {}
for class_idx, class_name in enumerate(clf.classes_):
    if class_name == 'overlap':
        continue
    strongest = np.argsort(-clf.coef_[class_idx])[:20]
    top_features[class_name] = [vectorizer.feature_names_[ing_id] for ing_id in strongest]
df = pd.DataFrame(top_features)
df

Unnamed: 0,"['männlich', '30-40 Jahre', 'Geschieden']","['männlich', '30-40 Jahre', 'Single']","['männlich', '30-40 Jahre', 'Vergeben']","['männlich', '30-40 Jahre', 'Verheiratet']","['männlich', '40-50 Jahre', 'Geschieden']","['männlich', '40-50 Jahre', 'Single']","['männlich', '40-50 Jahre', 'Vergeben']","['männlich', '40-50 Jahre', 'Verheiratet']","['männlich', '40-50 Jahre', 'Verwitwet']","['männlich', '50-60 Jahre', 'Geschieden']",...,"['weiblich', '50-60 Jahre', 'Verwitwet']","['weiblich', '60+ Jahre', 'Geschieden']","['weiblich', '60+ Jahre', 'Single']","['weiblich', '60+ Jahre', 'Vergeben']","['weiblich', '60+ Jahre', 'Verheiratet']","['weiblich', '60+ Jahre', 'Verwitwet']","['weiblich', '<30 Jahre', 'Single']","['weiblich', '<30 Jahre', 'Vergeben']","['weiblich', '<30 Jahre', 'Verheiratet']","['weiblich', '<30 Jahre', 'Verwitwet']"
0,zuckerambestenbraunerrohrzuckeresgehtaberauchnormaler,parmesanfrischgeriebener,brüheklarinstant,karotteninscheibengeschnitten,honig,fusilli,schokoladehalbbitter,schweinefleischlachsbraten,sahneoderfettarmersahneersatz,sahnegeschlagen,...,brennnesselndiejungenblätter,bacongewürfeltoderanderespeckwürfel,gemüsezwiebeln,zitronenschaleabgerieben,gelierzucker,tomatenstückig,nudelnspaghetti,paprikamark,puddingpulver,hefefrisch
1,salzundpfeffer,speckinfeinenstreifenoderwürfeln,kräuterlingezumstreuen,speckgewürfelter,kartoffelnmehligegeschälte,hefefrischoder,mandelngehackte,reisdietassefasst150ml,tortellonimitricottaspinatfüllungausdemkühlregal,schnittlauch,...,sahne,sauerkraut,basilikum,wasserheißes,kräutergemischte,tortillas,gnocchiausdemkühlregal,kiwis,tomatenkleine,wasserlauwarmes
2,spargelweißer,butterfürdiepfanne,schokoladetoblerone,zuckerbraunoderweiß,kardamomodertonkabohnenpulver,currypulvergelb,honigmelonen,schweinenackenoderlammfleisch,schmelzkäsefettarm,naturjoghurt,...,rinderbraten,crèmefraîchefettarm,semmelnwürfel,schokoladeweiße,zwetschgen,butterschmalzzumausbacken,zuckerbrauner,weinbrandoderrum,haferflockenzarte,mehl
3,nudelnmakkaronigegart250grohgewicht,blätterteigquadratisch,zuckerfeiner,wasserca,senfsüßer,kuvertürevollmilch,sahneauchfettreduzierte,kuvertüredunkle,brokkoli,chilischotenrot,...,sambaloelekoderscharfewürzpaste,quark20,zitronenschalegeriebene,milch15,kartoffelnkleine,honigflüssiger,schokoladevollmilchgerieben,meersalzgrob,wasserkalt,salz
4,tomatenausderdose,pfefferweißfrischgemahlen,petersiliefeingeschnitten,pfifferlingeoderchampignonsgeschnitten,zitronensaftund1pktcitroback,zwiebelngrobgewürfelt,suppengrün,hackfleischhalbundhalb,muskat,weißweinessig,...,schinkenspeckmagerdünngeschnitten,kartoffelnmehlige,schinkenserrano,bohnenkraut,schnittlauchgehackter,gelatine,schnittlauchinröllchengeschnitten,schokoladebittere,vanilleschoten,sambaloelekoderpeperoncinonachgeschmack
5,rahmspinat1tkpack,sojasaucehelle,kartoffelnmöglichstgleichgroßeundmehligkochend,mehlundevtleinwenigstärkemehl,zitronenunbehandeltoder,dillfrischodertk,maiskolbendose,champignonsinscheibengeschnitten,gemüsebrühe,chilipulver,...,salzundpfefferpfefferprisezucker,kräutersalz,zitronenabrieb,schnittlauch,weizenmehltyp550,pfefferschwarzerausdermühle,tomatenreif,mandelngeschälteundgemahlene,maultaschen,fischhornhechtca400g
6,knoblauchfeingehackter,milchfrische,bierschwarzbierersatzweisepilsodermalzbier,frischhefe,bacongewürfelt,bratwürstenürnberger,heidelbeeren,lachsgeräucherter,olivenöl,butterodermargarine,...,fischstäbchen,schnittlauch,sauerteigweizen,rotwein,schlagsahnegekühlt,himbeerentk,kartoffelnfestkochende,kuvertürezartbitterkuvertüre,himbeeren,lauchfeingewürfelt
7,zwiebelnfeingehackte,kekseamarettinis,chilisaucethaischarfundsüß,dijonsenf,blätterteig,brötchenalt,kartoffelnklein,steaksnacken,milch,vanilleschotendasmark,...,paprikaschotengemischt,knoblauchzehen,zucchinica600gramm,rosenkohl,piment,gewürzmischungschnitzelgewürz,gemüsebrüheinstant,backpulverweinsteinbackpulver,mandelngemahlen,graupengerstengraupen
8,frischkäsemitkräutern,eierbio,nutellaoderanderenussnougatcreme,kasselerausgelöstes,puderzucker,zwiebelngroß,eierganzfrischundzimmerwarm,gemüsegemischterbsenundmöhren,mehl,eigelb,...,camemberts,zitronensaft,gewürzmischungfürbrot,muskatgerieben,currythaicurry,paprikaschotenodererbsenkarottengrünebohnen,hokkaidokürbisse,vollmilchschokolade,spätzle,semmelbröselbis3el
9,pflanzenöl,keksebutterkekseevtlvollkornkekse,kartoffelnkleinefestkochendezbsiglinde,rum54,honigflüssig,pfefferschwarzer,ananasfrisch,roastbeef,salz,brühe,...,eigelb,mehl,mehlevtlmehrbiszu500g,teigkuchenteignachwahl,zitronenunbehandeltdieschale,schweinefleischgeschnetzeltes,buttermilch,sekttrocken,mohnzbmohnfixodermohnback,schweinefiletsca600g


- set target as calorie, explore the tags of recipes, use multiclass-classifier

In [414]:
# multiclass classifier: predict 'calorie_level' from the tag features
# Select features/target by column name instead of by position (:-5 / -2), so
# the cell does not silently break if the column order of new_df changes.
non_feature_cols = ['sex', 'age_group', 'marriage_status', 'calorie_level', 'combine']
clf_X = new_df.drop(columns=non_feature_cols)
y = new_df['calorie_level']

clf = LogisticRegression(max_iter=1000)
clf.fit(clf_X, y)
y_pred = clf.predict(clf_X)

# accuracy (measured on the same data the model was fitted on, not on held-out data)
print("Accuracy:",clf.score(clf_X, y))

Accuracy: 0.6926679586563308


In [416]:
# One column per class: the 10 features with the largest coefficients,
# strongest first. A class named 'overlap' is left out.
# NOTE(review): the names are looked up in `vectorizer`, which must be the one
# fitted on the tags. The ingredient section further up rebinds `vectorizer`,
# so in a top-to-bottom run this lookup would use the ingredient vocabulary -
# confirm the intended execution order.
top_features = {}
for class_idx, class_name in enumerate(clf.classes_):
    if class_name == 'overlap':
        continue
    strongest = np.argsort(-clf.coef_[class_idx])[:10]
    top_features[class_name] = [vectorizer.feature_names_[ing_id] for ing_id in strongest]
df = pd.DataFrame(top_features)
df

Unnamed: 0,high_calorie,low_calorie,medium_calorie
0,lateinamerika,ägypten,finnland
1,likör,wildgeflügel,bowle
2,haltbarmachen,fondue,spezial
3,wursten,amerika,korea
4,afrika,babynahrung,norwegen
5,belgien,vorspeise,reisodernudelsalat
6,konfiserie,polen,reisgetreide
7,kekse,einlagen,amerika
8,australien,eier,crosscooking
9,wild,usaoderkanada,shake


### Association rules among the recipe attributes

- the association between "ingredient" and "regions"

In [None]:
# Region tags whose recipes are compared in this section.
# 'japan' replaces the misspelt 'japen', which cannot match the 'japan' tag
# that occurs in the recipe data.
# NOTE(review): both 'grossbritannien' and 'großbritannien & irland' are listed,
# while the tag tables above show 'großbritannien' - confirm which spellings
# get_recipe_countries actually matches.
countries = ['china', 'indien', 'deutschland','frankreich','grossbritannien','österreich','usaoderkanada','italien','spanien',
             'portugal', 'japan','schweiz','türkei', 'thailand', 'russland', 'großbritannien & irland', 'vietnam', 'korea',
            'australien', 'ägypten', 'marokko', 'niederlande']

In [None]:
# get_recipe_countries is defined elsewhere in the notebook; presumably it keeps
# the recipes of the listed countries - verify. Its result has a 'label' column.
# reset_index() is chained instead of using inplace=True, so whatever object
# the helper returns is not modified behind the caller's back.
df_certain_countries = get_recipe_countries(countries, data).reset_index()
# number of recipes per label
df_certain_countries['label'].value_counts()

In [None]:
# drop all the columns other than "ingredient" and "label" column
columns_drop = ['index', 'categorize', 'recipe_name', 'tags', 'avg_score', 'difficulty','rating_count', 'calorie', 
                'preparation_time','comment_user', 'recipe_url']
df_certain_countries = df_certain_countries.drop(columns_drop, axis = 1)

# preprocess the ingredient column
# ingredients_preprocess returns [distinct ingredient list, processed dataframe];
# both parts must be unpacked - using the 2-element return value itself as
# column labels cannot work.
distinct_ingredients, df_certain_countries = ingredients_preprocess(df_certain_countries)

# One hot encoding of the ingredients
df_certain_countries['ingredient'] = df_certain_countries.ingredient.apply(convert_to_dict)
vectorizer = DictVectorizer(sparse=False)
# Label the columns with the vectorizer's own feature order. distinct_ingredients
# is derived from a set, so its order need not match the encoded matrix columns.
new_df_countries = pd.DataFrame(data = vectorizer.fit_transform(df_certain_countries.ingredient.tolist()),
                                index = df_certain_countries.index,
                                columns = vectorizer.feature_names_)
new_df_countries['label'] = df_certain_countries.label

# dummy for the label column (creates one 'label_<country>' column per label)
new_df_countries = pd.get_dummies(new_df_countries)

In [None]:
# preview the encoded frame: first rows only instead of rendering the whole table
new_df_countries.head()

In [None]:
# small sample for trying things out: the first 4 rows of the encoded frame
test = new_df_countries.iloc[:4]

In [None]:
# The dummy columns created by pd.get_dummies for 'label' all carry the
# 'label_' prefix; collecting them by name avoids the hard-coded count (-19),
# which breaks as soon as the number of distinct labels changes.
country_label = [col for col in new_df_countries.columns if str(col).startswith('label_')]

# record the start time
time1 = time.time()

# apply subgroup discovery
# Target: recipes labelled 'usaoderkanada'. The label dummies are excluded from
# the search space so the target is described by ingredients only.
target = ps.BinaryTarget ('label_usaoderkanada', True)
searchspace = ps.create_selectors(test, ignore = country_label)
task = ps.SubgroupDiscoveryTask (
    # The task must run on the same one-hot frame the selectors were built from:
    # the raw `data` frame has neither the 'label_usaoderkanada' column nor the
    # ingredient columns.
    test,
    target,
    searchspace,
    result_set_size=5,
    depth=20,
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

# record the end time
time2 = time.time()
time_diff = (time2-time1)/60
print('it took ' + str(time_diff) + 'miniutes to execute the subgroup disc')