In [1]:
# the library used
import numpy as np
import pandas as pd
import pysubgroup as ps
import re
from sklearn.feature_extraction import DictVectorizer
import time
import ast
import string 
import imblearn
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

## Helper Function

In [2]:
def tags_preprocess(tags):
    """
    input: tag string
    output: list of individual tags in the given tag string
    function: preprocess a single tag string 
    """
    #tags = tags.replace("'","")
    #tags = tags.replace(" ","")
    #tags = tags.replace("[","")
    #tags = tags.replace("]","")
    tags = str(tags)
    tags = tags.split(" ")
    tags = [x.lower() for x in tags]
    return tags

In [3]:
def ingredients_preprocess(df):
    """
    input: dataframe
    output: list of distinct ingredient list
    function: preprocess the ingredient columns and return a list of distinct ingredients
    """
    distinct_ingredients = []
    dataframe = df
    
    for i in range(len(dataframe)):
        ingredients = dataframe.iloc[i]['ingredient']
        
        r = re.compile('[A-Z]{1}[a-zA-Z]+')
        ingredients = str(ingredients)
        #ingredients = ''.join(i for i in ingredients if not i.isdigit())
        ingredients = ingredients.replace("'","")
        #ingredients = ingredients.replace(" ","")
        ingredients = ingredients.replace("[","")
        ingredients = ingredients.replace("]","")
        # remove text inside parentheses
        ingredients = re.sub(r'\([^())]*\)',"", ingredients)
        ingredients = ingredients.split(" ")
        ingredients = list(filter(r.match, ingredients))
        ingredients = [x.lower() for x in ingredients]
        distinct_ingredients += ingredients
        dataframe.at[i, 'ingredient'] = ingredients
        #dataframe.set_value(i, 'ingredient', ingredients)
        
    return [list(set(distinct_ingredients)),dataframe]

In [4]:
def get_recipe_countries(countries, data):
    """
    input: list of countries, dataframe
    output: selected dataframe whose recipes is from these countries
    function: select the rows in dataframe whose "tag" value contain one country tag
    """
    # add a new column class 
    drop_index = []
    for i in range(len(data)):
        tags = data.loc[i]["tags"]
        tags = tags_preprocess(tags)
        
        country_same =[l for l in countries if l in tags]
            
        if len(country_same) == 1:
            data.at[i, 'label'] = country_same[0]
        if len(country_same) == 0:
            drop_index.append(i)
        if len(country_same) > 1:
            drop_index.append(i)
            #data.at[i, 'label'] = 'overlap'
            
    # drop the columns which has no season tags
    data = data.drop(data.index[drop_index])
    return data

In [5]:
def convert_to_dict(arr):
    """
    Helper function to convect an array of ingredients to a dictionary
    """
    d={}
    for a in arr:
        d[a]=1
    return d

In [6]:
# extract comment user dataset from original dataset
def extract_com_user(data):
    """
    input: dataframe
    output: a new dataframe with all the comment user information
    function: spilt the dictionary of the column 'comment user' in original dataset
    """
    df_com = pd.DataFrame()
    for index, item in data['comment_user'].iteritems():
        if (item != '[]'):
            if (item != 'no comment'):
                array = ast.literal_eval(item)
                df_array = pd.DataFrame(array)
                df_array['recipe_id'] = index
                df_com = pd.concat([df_com,df_array])
    return df_com

In [7]:
def sub_cat_in_com(data):
    """
    input: dataframe
    output: a new dataframe with multi colunms
    function: add one subcategory of recipes to comment user dataset 
    """    
    punct = set(string.punctuation) 
    list_sub_cat = []
    
    df_com2 = pd.DataFrame()
    for index, item in data['calorie_value'].iteritems(): 
        if (item != None):
            list_sub = list(item)
            list_sub = ''.join(x for x in list_sub if x not in punct)
            list_sub_cat.append(list_sub)
    df_sub_cat = pd.DataFrame(list_sub_cat)
    df_sub_cat['calorie_value'] = df_sub_cat
    
    df_sub_cat['recipe_id'] = data['calorie_value'].index     
    df_com2 = pd.concat([df_com2,df_sub_cat])
    return df_com2

In [8]:
def add_recipe_info(data):
    """
    input: dataframe
    output: a new dataframe with multi colunms
    function: add one subcategory of recipes to comment user dataset 
    """    
    punct = set(string.punctuation) 
    # add recipe name in comment users data
    list_recipe_name = []
    df_recipe = pd.DataFrame()
    for index, item in data['recipe_name'].iteritems(): 
        if (item != None):
            list_name = list(item)
            list_name = ''.join(x for x in list_name if x not in punct)
            list_recipe_name.append(list_name)

    df_name = pd.DataFrame(list_recipe_name)
    df_name['recipe_id'] = data['recipe_name'].index 
    df_name = df_name.set_index(["recipe_id"])
    df_name
    df_recipe['recipe_name'] = df_name[0]

    # add recipe difficulty in comment users data
    list_recipe_diff = []
    df_recipe_diff = pd.DataFrame()
    for index, item in data['difficulty'].iteritems(): 
        if (item != None):
            list_diff = list(item)      
            list_diff = ''.join(x for x in list_diff if x not in punct)   
            list_recipe_diff.append(list_diff)

    df_diff = pd.DataFrame(list_recipe_diff)
    df_diff['recipe_id'] = data['difficulty'].index 
    df_diff = df_diff.set_index(["recipe_id"])
    df_recipe['difficulty'] = df_diff[0]

    # add recipe preparation_time in comment users data
    list_recipe_pre = []
    df_recipe_pre = pd.DataFrame()
    for index, item in data['preparation_time'].iteritems(): 
        if (item != None):
            list_pre = list(item)

            list_pre = ''.join(x for x in list_pre if x not in punct)

            list_recipe_pre.append(list_pre)

    df_pre = pd.DataFrame(list_recipe_pre)
    df_pre['recipe_id'] = data['preparation_time'].index 
    df_pre = df_pre.set_index(["recipe_id"])
    df_recipe['preparation_time'] = df_pre[0]

    # add recipe tags in comment users data
    list_recipe_tags = []
    df_recipe_tags = pd.DataFrame()
    for index, item in data['tags'].iteritems(): 
        if (item != None):
            list_tags = list(item)      
            list_tags = ''.join(x for x in list_tags if x not in punct)   
            list_recipe_tags.append(list_tags)


    df_tags = pd.DataFrame(list_recipe_tags)
    df_tags['recipe_id'] = data['tags'].index 
    df_tags = df_tags.set_index(["recipe_id"])
    df_recipe['tags'] = df_tags[0]

    # add recipe ingredient in comment users data
    list_recipe_ingredient = []
    df_recipe_ingredient = pd.DataFrame()
    for index, item in data['ingredient'].iteritems(): 
        if (item != None):
            list_ingredient = list(item) 
            list_ingredient = ''.join(x for x in list_ingredient if x not in punct)
            list_recipe_ingredient.append(list_ingredient)

    df_recipe_ingredient = pd.DataFrame(list_recipe_ingredient)
    df_recipe_ingredient['recipe_id'] = data['ingredient'].index 
    df_recipe_ingredient = df_recipe_ingredient.set_index(["recipe_id"])
    df_recipe['ingredient'] = df_recipe_ingredient[0]
    return df_recipe

In [9]:
def age_group(age):   
    """
    input: age value
    output: group description
    function: divide age value uinto 5 groups 
    """   
    bucket = None
    age = int(age)    
    if age <= 30:
        bucket = '<30 Jahre'     
    if age in range(30, 46):
        bucket = '30-45 Jahre'
    if age in range(45, 61):
        bucket = '45-60 Jahre'
    if age >= 61:
        bucket = '60+ Jahre'
    return bucket

In [10]:
def calorie_level(calorie):   
    """
    input: calorie value
    output: group description
    function: divide calorie value into 3 groups 
    """   
    bucket = None
    calorie = int(calorie)    
    if calorie <= 300:
        bucket = 'low_calorie'    
    if calorie in range(300, 500):
        bucket = 'medium_calorie'        
    if calorie >= 500:
        bucket = 'high_calorie'      
    return bucket

In [11]:
def remove_None(data, name):
    """
    Helper function to remove None value in one column
    """ 
    y = data[data[name] == 'None']
    index_n = y.index.tolist()
    data = data.drop(index = index_n)
    return data

In [12]:
def add_target(data, calorie_level):
    df_sub_group[country] = df_dum_car[calorie_level]
    return df_sub_group

In [13]:
def pre_time_group(pre_time):   
    pre_time = int(pre_time)    
    if pre_time <= 20:
        bucket = '<20 Min'    
    if pre_time in range(20, 31):
        bucket = '20-30 Min'                
    if pre_time >= 31:
        bucket = '30+ Min'
    return bucket

## Subgroup Discovery

In this section we will use subgroup discovery to explore the association rules between attributes

- why we choose subgroup discovery?

because we find out that subgroup discovery is quite powerful compared to other data mining techniques. As long as we set differnt target with different search space, we can use use subgroup discovery technique to dig many interesting patterns that we want to explore from the dataset. 


In [104]:
# read the data
# data = pd.read_csv("all_data.csv")
data = pd.read_csv("/Users/xujingjing/Desktop/2020 Sommer/praktikum/Data/all_data.csv")

In [15]:
# extract the recipe id from recipe urls
list_cat_no = []
i = 0
for item in data['recipe_url']:
    list_cat_no.append(item.split('/')[4])
    
# add one column "recipe_id" into the dataset and set it as the index of dataset
data['recipe_id'] = list_cat_no
data = data.set_index(["recipe_id"])

## Association rules between comment user information and recipe attributes

### associations with "calorie" and comment user information

since the job has more than half non values, so here we only explore the the association between  marriage_status, gender, and age with recipe calorie

In [17]:
# extract the numerical value string in the column calorie 
pat = r"([-+]?\d*\.\d+|\d+)"
data["calorie_value"] = data["calorie"].str.extract(pat, flags=0, expand=True)

# drop all rows with nan value in both columns comment_user and calorie_value
data_com = data.dropna(subset=["comment_user",'calorie_value'])
len(data_com)

19058

In [18]:
# expand the corresponding recipe data for each comment_user
df_com_1 = extract_com_user(data_com)
df_com_2 = sub_cat_in_com(data_com)
df_com_3 = add_recipe_info(data_com)
df_com_new = df_com_1.merge(df_com_2, on='recipe_id', how='left')
df_com_new = df_com_new.merge(df_com_3, on='recipe_id', how='left')

df_com_new = df_com_new[['recipe_id','recipe_name','tags','difficulty','preparation_time','ingredient','name','rating','sex','age','marriage_status','comment_time','calorie_value']]
df_com_new = df_com_new.set_index(["recipe_id"])

In [20]:
# remove none value in the whole data set 
df_com_no_none = df_com_new.mask(df_com_new.astype(object).eq('None')).dropna()
df_com_no_none = df_com_no_none.reset_index()

In [21]:
# calculate and add calorie level , preparation time and age group columns in the comment user information
df_com_no_none["pretime_value"] = df_com_no_none["preparation_time"].str.extract(pat, flags=0, expand=True)
df_com_no_none['pre_time_group'] = df_com_no_none['pretime_value'].apply(pre_time_group)

df_com_no_none['calorie_level'] = df_com_no_none['calorie_value'].apply(calorie_level)
df_dum_calorie = pd.get_dummies(df_com_no_none['calorie_level'])

df_com_no_none["age_value"] = df_com_no_none["age"].str.extract(pat, flags=0, expand=True)
df_com_no_none['age_group'] = df_com_no_none['age_value'].apply(age_group)

df_com_dum_calorie = df_com_no_none.join(df_dum_calorie, how='left')

In [24]:
df_com_dum_calorie['age_group'].value_counts()

30-45 Jahre    43292
45-60 Jahre    26865
60+ Jahre       7498
<30 Jahre       7418
Name: age_group, dtype: int64

In [25]:
df_com_dum_calorie['sex'].value_counts()

weiblich    72689
männlich    12384
Name: sex, dtype: int64

In [26]:
df_com_dum_calorie['calorie_level'].value_counts()

high_calorie      46732
medium_calorie    19498
low_calorie       18843
Name: calorie_level, dtype: int64

In [27]:
df_com_dum_calorie['marriage_status'].value_counts()

Verheiratet    46127
Vergeben       28338
Single          8703
Verwitwet       1470
Geschieden       435
Name: marriage_status, dtype: int64

#### Subgroup discovery before up/down sampling

the target is calorie and features are the sex,age,marriage status of comment users 

In [53]:
data = df_com_dum_calorie[['sex', 'age_group', 'high_calorie', 'low_calorie', 'medium_calorie','marriage_status']]
print("the length of dataset:", len(data))
target = ps.BinaryTarget('low_calorie', True)
searchspace = ps.create_selectors(data, ignore=['high_calorie', 'low_calorie', 'medium_calorie'])
task = ps.SubgroupDiscoveryTask (
    data, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=2, 
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)
pd.set_option('max_colwidth',100)
print(result.to_dataframe())

the length of dataset: 85073
    quality                                   description
0  0.002450    age_group=='60+ Jahre' AND sex=='weiblich'
1  0.002306  age_group=='45-60 Jahre' AND sex=='weiblich'
2  0.002248                        age_group=='60+ Jahre'
3  0.002194                      age_group=='45-60 Jahre'
4  0.002010                               sex=='weiblich'


#### Deal with imbalanced data

"high_calorie": "medium_calorie": "low_calorie" ratio is around 2.5:1:1, the data is quite imblanced with regard to the calorie level, we want to try to resample the dataset to help improve the data quality. We want to see if we can improve the quality of subgroup discovery

- upsampling: upsample "medium_calorie" and "low_calorie" samples to make them equal to the number of "high_calorie"

In [37]:
df_sampling = df_com_dum_calorie[['rating','sex','age_group','marriage_status','calorie_level','pre_time_group','difficulty','tags','ingredient']]

In [39]:
# oversampling
# Separate input features and target
y = df_sampling['calorie_level']
X = df_sampling.drop('calorie_level', axis=1)

# setting up testing and training sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

# concatenate our training data back together
# X = pd.concat([X_train, y_train], axis=1)

# separate minority and majority classes
high_calorie = df_sampling[df_sampling['calorie_level'] == 'high_calorie']
low_calorie = df_sampling[df_sampling['calorie_level'] == 'low_calorie']
medium_calorie = df_sampling[df_sampling['calorie_level'] == 'medium_calorie']


# upsample minority
low_upsampled = resample(low_calorie,
                          replace=True, # sample with replacement
                          n_samples=len(high_calorie), # match number in majority class
                          random_state=27) # reproducible results

medium_upsampled = resample(medium_calorie,
                          replace=True,
                          n_samples=len(high_calorie), 
                          random_state=27) 


# combine majority and upsampled minority
upsampled = pd.concat([high_calorie, low_upsampled])
upsampled = pd.concat([upsampled, medium_upsampled])
upsampled.reset_index(inplace = True)
print("After upsampling:")
upsampled['calorie_level'].value_counts()
# upsampled.head()

After upsampling:


high_calorie      46732
medium_calorie    46732
low_calorie       46732
Name: calorie_level, dtype: int64

In [40]:
# add the dummyset into the upsampled dataset
upsampled_dum = pd.get_dummies(upsampled['calorie_level'])
upsampled_dum.reset_index(inplace = True)
upsampled_dum = upsampled_dum.drop('index', axis=1)

# prepare for the binary target columns
# upsampled = upsampled.drop('index', axis=1)
upsampled_new = upsampled.join(upsampled_dum, how='left')
# upsampled_new = upsampled_new.drop('calorie_level',1)
# upsampled_new['sex'].value_counts()

In [43]:
data = upsampled_new[['sex', 'age_group', 'high_calorie', 'low_calorie', 'medium_calorie','marriage_status']]
target = ps.BinaryTarget('medium_calorie', True)
searchspace = ps.create_selectors(data, ignore=['high_calorie', 'low_calorie', 'medium_calorie'])
task = ps.SubgroupDiscoveryTask (
    data, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=2, 
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)
pd.set_option('max_colwidth',100)
print(result.to_dataframe())

    quality                                                  description
0  0.001933                                              sex=='männlich'
1  0.001555                                     age_group=='45-60 Jahre'
2  0.001448  age_group=='45-60 Jahre' AND marriage_status=='Verheiratet'
3  0.001431           marriage_status=='Verheiratet' AND sex=='männlich'
4  0.000965                 age_group=='45-60 Jahre' AND sex=='männlich'


- undersampling : downsample "high-calorie" data samples to make it equal to the number of "low_calorie" or "medium_calorie"

In [45]:
# downsample majority
high_downsampled = resample(high_calorie,
                                replace = False, # sample without replacement
                                n_samples = len(low_calorie), # match minority n
                                random_state = 27) # reproducible results

medium_downsampled = resample(medium_calorie,
                                replace = False,
                                n_samples = len(low_calorie),
                                random_state = 27)

# combine minority and downsampled majority
downsampled = pd.concat([high_downsampled, low_calorie])
downsampled = pd.concat([downsampled, medium_downsampled])

# add the dummyset into the downsampled dataset
downsampled_dum = pd.get_dummies(downsampled['calorie_level'])
# downsampled_dum.reset_index(inplace = True)
# downsampled.reset_index(inplace = True)
downsampled_new = downsampled.join(downsampled_dum, how='left')
print("After downsampling:")
downsampled_new['calorie_level'].value_counts()

After downsampling:


high_calorie      18843
medium_calorie    18843
low_calorie       18843
Name: calorie_level, dtype: int64

In [107]:
data = downsampled_new[['sex', 'age_group', 'high_calorie', 'low_calorie', 'medium_calorie','marriage_status']]
target = ps.BinaryTarget('medium_calorie', True)
searchspace = ps.create_selectors(data, ignore=['high_calorie', 'low_calorie', 'medium_calorie'])
task = ps.SubgroupDiscoveryTask (
    data, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=2, 
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)
pd.set_option('max_colwidth',100)
print(result.to_dataframe())

    quality                                                  description
0  0.002353                                              sex=='männlich'
1  0.001238                                     age_group=='45-60 Jahre'
2  0.001150  age_group=='45-60 Jahre' AND marriage_status=='Verheiratet'
3  0.001067              marriage_status=='Vergeben' AND sex=='männlich'
4  0.001056           marriage_status=='Verheiratet' AND sex=='männlich'


- Explore the male and female comment users separately

In [408]:
# df_male = df_com_dum_calorie[df_com_dum_calorie['sex'] == 'männlich']
# data = df_male[['age_group', 'high_calorie', 'low_calorie', 'medium_calorie','marriage_status']]
# target = ps.BinaryTarget('low_calorie', True)
# searchspace = ps.create_selectors(data, ignore=['high_calorie', 'low_calorie', 'medium_calorie'])
# task = ps.SubgroupDiscoveryTask (
#     data, 
#     target, 
#     searchspace, 
#     result_set_size=5, 
#     depth=2, 
#     qf=ps.WRAccQF())
# result = ps.BeamSearch().execute(task)
# pd.set_option('max_colwidth',100)
# print(result.to_dataframe())

In [409]:
# df_female = df_com_dum_calorie[df_com_dum_calorie['sex'] == 'weiblich']
# data = df_female[['age_group', 'high_calorie', 'low_calorie', 'medium_calorie','marriage_status']]
# target = ps.BinaryTarget('low_calorie', True)
# searchspace = ps.create_selectors(data, ignore=['high_calorie', 'low_calorie', 'medium_calorie'])
# task = ps.SubgroupDiscoveryTask (
#     data, 
#     target, 
#     searchspace, 
#     result_set_size=5, 
#     depth=2, 
#     qf=ps.WRAccQF())
# result = ps.BeamSearch().execute(task)
# pd.set_option('max_colwidth',100)
# print(result.to_dataframe())



- the subgroup 30-45 years-old female who are married or in relationship seem to prefer the "high-calorie" recipes
- the subgroup married 45-60 years-old male or the erdly who are married seem to prefer the "medium-calorie" recipes
- the subgroup over 60 years old female or widowed female seem to prefer the "low-calorie" recipes

then our next step is to discover why differnt calorie-level is preferred by these particular groups

### Existing problems
- Here we remove all rows which have nan, does it make sense to use other fill nan methods? 
- After resampling the quality is still low, how can we then to improve the quality?
- we have downsampled to deal with the imbalance problem in target "calorie-level", but "sex", "age-group", "marriage_status" are still very imbalanced, should we also deal with that?

## Discover why differnt calorie-level is preferred by these particular groups

#### The target is calorie level and features is the preparation time and difficulty of recipes.

In [54]:
df_com_dum_calorie['pre_time_group'].value_counts()

20-30 Min    45209
<20 Min      28338
30+ Min      11526
Name: pre_time_group, dtype: int64

In [63]:
data = downsampled_new[['high_calorie', 'low_calorie', 'medium_calorie','pre_time_group','difficulty']]
target = ps.BinaryTarget('medium_calorie', True)
searchspace = ps.create_selectors(data, ignore=['high_calorie', 'low_calorie', 'medium_calorie'])
task = ps.SubgroupDiscoveryTask (
    data, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=2, 
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)
pd.set_option('max_colwidth',100)
print(result.to_dataframe())

    quality                                           description
0  0.014641                                  difficulty=='normal'
1  0.013344  difficulty=='normal' AND pre_time_group=='20-30 Min'
2  0.007701                           pre_time_group=='20-30 Min'
3  0.002253    difficulty=='normal' AND pre_time_group=='<20 Min'
4  0.001285                                 difficulty=='pfiffig'


conclusion:

- 'high_calorie' recipes are high possibly with normal difficulty, takes more than 20 mininutes to cook
- 'low_calorie' recipes are high possibly with simple difficulty, takes less than 20 mininutes to cook
- 'medium_calorie' recipes are high possibly with normal or pfiffig difficulty, takes less than 20-30 mininutes to cook

- the target is the attributes of users and features is the preparation time and difficulty of recipes.

In [64]:
new_data = downsampled_new[['sex','age_group','marriage_status','pre_time_group','difficulty','calorie_level']]

#  set the target value by combining 'sex', 'age_group', 'marriage_status'
new_data['combine'] = new_data[['sex', 'age_group', 'marriage_status']].values.tolist()
new_data['combine'] = new_data['combine'].astype(str)

# one hot embedding of target value
combine_dum = pd.get_dummies(new_data['combine'])
new_data = new_data.join(combine_dum, how='left')
# combine_dum
# new_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [65]:
# discover the relationship bewteen all the comment user groups and the preparation time and difficulty of recipes

data = new_data[['pre_time_group','difficulty']]
i =0
for col in combine_dum:
    column_name = col
    column_name = str(column_name)
    data[column_name] = new_data[column_name]

    target = ps.BinaryTarget(column_name, True)
    searchspace = ps.create_selectors(data, ignore=[column_name])
    task = ps.SubgroupDiscoveryTask (
        data, 
        target, 
        searchspace, 
        result_set_size=5, 
        depth=2, 
        qf=ps.WRAccQF())
    result = ps.BeamSearch().execute(task)
    pd.set_option('max_colwidth',100)
    result = result.to_dataframe()
    result = result.rename(columns={'description':column_name})
    data = data.drop(column_name,1)
    i = i+1
    print('----------------------------------------------------------------------')
    print("number of target combination is:", i)
    print(result)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


----------------------------------------------------------------------
number of target combination is: 1
    quality             ['männlich', '30-45 Jahre', 'Geschieden']
0  0.000042  difficulty=='normal' AND pre_time_group=='20-30 Min'
1  0.000026                                  difficulty=='normal'
2  0.000023                           pre_time_group=='20-30 Min'
3  0.000000                                               Dataset
----------------------------------------------------------------------
number of target combination is: 2
    quality                 ['männlich', '30-45 Jahre', 'Single']
0  0.000351                             pre_time_group=='<20 Min'
1  0.000297    difficulty=='simpel' AND pre_time_group=='<20 Min'
2  0.000136                                  difficulty=='normal'
3  0.000115  difficulty=='normal' AND pre_time_group=='20-30 Min'
4  0.000054    difficulty=='normal' AND pre_time_group=='<20 Min'
--------------------------------------------------------------

Conclusion:

- 30-45 years-old female who are married likes "simple" or "pfiffig" food, which takes no more than 30 minuntes; 30-45 years-old female who are in relationship likes "simple" or "pfiffig" food, which takes more than 20 minuntes.
- both married female or male who are in age groups '45-60' or '60 +' seem to prefer recipes that take more time (more than 20 minintes)
- widowed female seems to prefer recipes which are simpler and take less time

- set target as subgroup, explore the tags of recipes, use multiclass-classifier

In [66]:
def get_distinct_tag(df):
    distinct_tags = []
    for i in range(len(df)):
        distinct_tags += df.iloc[i]['tags']
    return list(set(distinct_tags))

In [67]:
downsampled_new = downsampled_new.reset_index()
downsampled_new['tags'] = downsampled_new['tags'].astype(object)
for i in range(len(downsampled_new)):
    tags = downsampled_new.iloc[i]['tags']
    tags = tags_preprocess(tags)
    downsampled_new.at[i, 'tags'] = tags

In [69]:
# prepare the data for multiclass classifier
df = downsampled_new[['tags', 'sex', 'age_group', 'marriage_status', 'calorie_level']]
distinct_tags = get_distinct_tag(df)

# One hot encoding of the ingredients
df['tags'] = df.tags.apply(convert_to_dict)
vectorizer = DictVectorizer(sparse=False)
new_df = pd.DataFrame(data = vectorizer.fit_transform(df.tags.tolist()), columns = distinct_tags)
new_df[['sex', 'age_group', 'marriage_status', 'calorie_level']] = df[['sex', 'age_group', 'marriage_status', 'calorie_level']]

# set the target value by combining 'sex', 'age_group', 'marriage_status'
new_df['combine'] = new_df[['sex', 'age_group', 'marriage_status']].values.tolist()
new_df['combine'] = new_df['combine'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [81]:
# multiclass classifer
clf_X = new_df.iloc[:, :-5]
y = new_df.iloc[:, -1]

clf = LogisticRegression(max_iter=1000)
clf.fit(clf_X, y)
y_pred = clf.predict(clf_X)

# accuracy 
print("Accuracy:",clf.score(clf_X, y))

Accuracy: 0.2436802349236675


In [71]:
def computeIDF(clf_X):
    num_document = len(clf_X)
    idf_row = clf_X.sum(axis = 0, skipna = True) 
    idf_row = len(clf_X)/idf_row
    return idf_row

def computeTF(clf_X):
    clf_X['sum'] = clf_X.sum(axis = 1, skipna = True)
    clf_X = clf_X.div(clf_X['sum'], axis=0)
    clf_X = clf_X.drop(['sum'],axis = 1)
    return clf_X

In [72]:
# use td-idf to preprocess the data
idf_row = computeIDF(clf_X)
clf_X = computeTF(clf_X)
clf_X = clf_X*idf_row
clf_X = clf_X.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [73]:
# multiclass classifier after preprocessing with td-idf
clf = LogisticRegression(max_iter=1000)
clf.fit(clf_X, y)
y_pred = clf.predict(clf_X)

# accuracy 
print("Accuracy:",clf.score(clf_X, y))

Accuracy: 0.2428488032691185


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [82]:
df = pd.DataFrame()
for i in range(len(clf.classes_)):
    if clf.classes_[i] != 'overlap':
        df[clf.classes_[i]] = [vectorizer.feature_names_[ing_id] for ing_id in np.argsort(-clf.coef_[i])[:10]]
df

Unnamed: 0,"['männlich', '30-45 Jahre', 'Geschieden']","['männlich', '30-45 Jahre', 'Single']","['männlich', '30-45 Jahre', 'Vergeben']","['männlich', '30-45 Jahre', 'Verheiratet']","['männlich', '45-60 Jahre', 'Geschieden']","['männlich', '45-60 Jahre', 'Single']","['männlich', '45-60 Jahre', 'Vergeben']","['männlich', '45-60 Jahre', 'Verheiratet']","['männlich', '45-60 Jahre', 'Verwitwet']","['männlich', '60+ Jahre', 'Geschieden']",...,"['weiblich', '45-60 Jahre', 'Verwitwet']","['weiblich', '60+ Jahre', 'Geschieden']","['weiblich', '60+ Jahre', 'Single']","['weiblich', '60+ Jahre', 'Vergeben']","['weiblich', '60+ Jahre', 'Verheiratet']","['weiblich', '60+ Jahre', 'Verwitwet']","['weiblich', '<30 Jahre', 'Single']","['weiblich', '<30 Jahre', 'Vergeben']","['weiblich', '<30 Jahre', 'Verheiratet']","['weiblich', '<30 Jahre', 'Verwitwet']"
0,spezial,usaoderkanada,punsch,punsch,fleisch,innereien,eier,portugal,pilze,aufstrich,...,kuchen,kuchen,kuchen,römertopf,bowle,salatdressing,dänemark,weißrussland,finnland,beilage
1,kartoffeln,meeresfrüchte,silvester,flambieren,eintopf,backenodersüßspeise,wursten,china,braten,babynahrung,...,resteverwertung,eintopf,belgien,thailand,longdrink,griechenland,longdrink,japan,australien,gluten
2,brotoderbrötchen,frankreich,crosscooking,crosscooking,marokko,eieroderkäse,klar,weißrussland,fingerfood,raffiniertoderpreiswert,...,schmoren,kalorienarm,russland,flambieren,polen,einlagen,getränk,marokko,dänemark,vegan
3,schweiz,eier,afrika,grillen,kuchen,grillen,kaffeeteeoderkakao,wild,klar,großbritannien,...,trennkost,marokko,eieroderkäse,krustentierodermuscheln,haltbarmachen,tee,nudeln,krustentieroderfisch,mittlererundnaherosten,tarte
4,suppe,wild,russland,amerika,frankreich,vietnam,usaoderkanada,wursten,china,eieroderkäse,...,crosscooking,dips,weihnachten,mikrowelle,cocktail,kaffee,halloween,konfiserie,eieroderkäse,vollwert
5,paleo,amerika,römertopf,frittieren,warm,weißrussland,pizza,römertopf,ei,europa,...,österreich,osteuropa,mittlererundnaherosten,luxemburg,gewürzeölessigpasten,kakao,mikrowelle,ungarn,fondue,kalorienarm
6,mexiko,afrika,mexiko,lateinamerika,mittlererundnaherosten,wok,korea,russland,basisrezepte,fisch,...,reisgetreide,afrika,ernährungskonzepte,salat,portugal,grundrezepte,osteuropa,nudeln,mehlspeisen,warm
7,winter,eieroderkäse,usaoderkanada,skandinavien,früchte,norwegen,früchte,kalt,kinder,ei,...,indien,usaoderkanada,haltbarmachen,basisrezepte,marokko,blanchieren,südafrika,mexiko,informempfehlung,fettarm
8,party,norwegen,gewürzeölessigpasten,russland,einlagen,dünsten,wildgeflügel,klar,resteverwertung,überbacken,...,fingerfood,wok,creme,norwegen,grundrezepte,gewürzeölessigpasten,reisodernudelsalat,großbritannien,ernährungskonzepte,vorspeise
9,kinder,pizza,ungarn,kaffee,geheimrezept,wursten,frankreich,blanchieren,party,fettarm,...,braten,ostern,südafrika,großbritannien,wild,meeresfrüchte,diabetiker,vietnam,belgien,sommer


In [None]:
the subgroup 30-45 years-old female who are married or in relationship seem to prefer the "high-calorie" recipes
the subgroup married 45-60 years-old male or the erdly who are married seem to prefer the "medium-calorie" recipes
the subgroup over 60 years old female or widowed female seem to prefer the "low-calorie" recipes

In [83]:
a_1 =['weiblich', '30-45 Jahre', 'Verheiratet']
a_1 = str(a_1)
a_2 = ['weiblich', '30-45 Jahre', 'Vergeben']
a_2 = str(a_2)
df[[a_1,a_2]]

Unnamed: 0,"['weiblich', '30-45 Jahre', 'Verheiratet']","['weiblich', '30-45 Jahre', 'Vergeben']"
0,skandinavien,mexiko
1,babynahrung,usaoderkanada
2,konfiserie,konfiserie
3,russland,schweden
4,australien,kekse
5,torte,dänemark
6,fondue,babynahrung
7,niederlande,wildgeflügel
8,gebunden,niederlande
9,polen,torte


In [87]:
b_1 = ['männlich', '45-60 Jahre', 'Verheiratet']
b_1 = str(b_1)
columns = list(df.columns)
cols_b = [l for l in columns if "Verheiratet" in l and "60+ Jahre" in l]
cols_b.append(b_1)
df[cols_b]

Unnamed: 0,"['männlich', '60+ Jahre', 'Verheiratet']","['weiblich', '60+ Jahre', 'Verheiratet']","['männlich', '45-60 Jahre', 'Verheiratet']"
0,wursten,bowle,portugal
1,schweiz,longdrink,china
2,punsch,polen,weißrussland
3,lammoderziege,haltbarmachen,wild
4,fondue,cocktail,wursten
5,backenodersüßspeise,gewürzeölessigpasten,römertopf
6,osteuropa,portugal,russland
7,festlich,marokko,kalt
8,luxemburg,grundrezepte,klar
9,silvester,wild,blanchieren


In [88]:
cols_c = [l for l in columns if "weiblich" in l and "60+ Jahre" in l]
cols_c += [l for l in columns if "weiblich" in l and "Verwitwet" in l]
df[cols_c]

Unnamed: 0,"['weiblich', '60+ Jahre', 'Geschieden']","['weiblich', '60+ Jahre', 'Single']","['weiblich', '60+ Jahre', 'Vergeben']","['weiblich', '60+ Jahre', 'Verheiratet']","['weiblich', '60+ Jahre', 'Verwitwet']","['weiblich', '30-45 Jahre', 'Verwitwet']","['weiblich', '45-60 Jahre', 'Verwitwet']","['weiblich', '60+ Jahre', 'Verwitwet'].1","['weiblich', '<30 Jahre', 'Verwitwet']"
0,kuchen,kuchen,römertopf,bowle,salatdressing,pilze,kuchen,salatdressing,beilage
1,eintopf,belgien,thailand,longdrink,griechenland,camping,resteverwertung,griechenland,gluten
2,kalorienarm,russland,flambieren,polen,einlagen,nudeln,schmoren,einlagen,vegan
3,marokko,eieroderkäse,krustentierodermuscheln,haltbarmachen,tee,deutschland,trennkost,tee,tarte
4,dips,weihnachten,mikrowelle,cocktail,kaffee,studentenküche,crosscooking,kaffee,vollwert
5,osteuropa,mittlererundnaherosten,luxemburg,gewürzeölessigpasten,kakao,braten,österreich,kakao,kalorienarm
6,afrika,ernährungskonzepte,salat,portugal,grundrezepte,europa,reisgetreide,grundrezepte,warm
7,usaoderkanada,haltbarmachen,basisrezepte,marokko,blanchieren,gemüse,indien,blanchieren,fettarm
8,wok,creme,norwegen,grundrezepte,gewürzeölessigpasten,schnell,fingerfood,gewürzeölessigpasten,vorspeise
9,ostern,südafrika,großbritannien,wild,meeresfrüchte,einfach,braten,meeresfrüchte,sommer


- set target as subgroup, explore the ingredients of recipes, use multiclass-classifier

In [110]:
# prepare the data for multi-class classifier
distinct_ingredients, df_2 = ingredients_preprocess(downsampled_new)

# One hot encoding of the ingredients
df_2['ingredient'] = df_2.ingredient.apply(convert_to_dict)
vectorizer = DictVectorizer(sparse=False)
new_df_2 = pd.DataFrame(data = vectorizer.fit_transform(df_2.ingredient.tolist()), columns = distinct_ingredients)
new_df_2[['sex', 'age_group', 'marriage_status', 'calorie_level']] = df_2[['sex', 'age_group', 'marriage_status', 'calorie_level']]

# set the target value by combining 'sex', 'age_group', 'marriage_status'
new_df_2['combine'] = new_df_2[['sex', 'age_group', 'marriage_status']].values.tolist()
new_df_2['combine'] = new_df_2['combine'].astype(str)

In [90]:
# multiclass classifer
clf_X = new_df_2.iloc[:, :-5]
y = new_df_2.iloc[:, -1]

clf = LogisticRegression(max_iter=1000)
clf.fit(clf_X, y)
y_pred = clf.predict(clf_X)

# accuracy 
print("Accuracy:",clf.score(clf_X, y))

Accuracy: 0.3580640025473651


In [96]:
df = pd.DataFrame()
for i in range(len(clf.classes_)):
    if clf.classes_[i] != 'overlap':
        df[clf.classes_[i]] = [vectorizer.feature_names_[ing_id] for ing_id in np.argsort(-clf.coef_[i])[:10]]
df

Unnamed: 0,"['männlich', '30-45 Jahre', 'Geschieden']","['männlich', '30-45 Jahre', 'Single']","['männlich', '30-45 Jahre', 'Vergeben']","['männlich', '30-45 Jahre', 'Verheiratet']","['männlich', '45-60 Jahre', 'Geschieden']","['männlich', '45-60 Jahre', 'Single']","['männlich', '45-60 Jahre', 'Vergeben']","['männlich', '45-60 Jahre', 'Verheiratet']","['männlich', '45-60 Jahre', 'Verwitwet']","['männlich', '60+ Jahre', 'Geschieden']",...,"['weiblich', '45-60 Jahre', 'Verwitwet']","['weiblich', '60+ Jahre', 'Geschieden']","['weiblich', '60+ Jahre', 'Single']","['weiblich', '60+ Jahre', 'Vergeben']","['weiblich', '60+ Jahre', 'Verheiratet']","['weiblich', '60+ Jahre', 'Verwitwet']","['weiblich', '<30 Jahre', 'Single']","['weiblich', '<30 Jahre', 'Vergeben']","['weiblich', '<30 Jahre', 'Verheiratet']","['weiblich', '<30 Jahre', 'Verwitwet']"
0,kartoffeln,speckinfeinenstreifenoderwürfeln,kartoffelnmöglichstgleichgroßeundmehligkochend,schweinefleischlachsbraten,sahnegeschlagen,ingwer,brötchen,speckmagerer,petersilie,gewürzgurken,...,gewürze,zitronensaft,portwein,milchziegenmindestens32fett,zwetschgenoderpflaumen,joghurtgriechischer,schokoladenachwahl,milch35fett,kartoffelnvorwiegendfestkochendodermehlig,margarinevegane
1,karotten,schmelzkäse,butterflöckchen,chiliflockenodercayennepfeffer,chilischotenrot,goudamittelalter,safranpulver,schmalzzumfrittierenoderöl,chilisaucescharfe,fischfilets,...,wirsing,puderzucker50gbuntezuckerstreusel,mais,hirse,wasserkaltesoder1ei,thunfischineigenemsaft,melonengaliamelonedasfruchtfleischabwiegen,eisvanilleeisevtl,vanillegemahlene,vollkornweizenmehl
2,bacongewürfelt,basilikumfrisch,schinkengekochter,bourbonvanillezucker,schnittlauch,toastbrotweißbrot,backpulverodertrockenhefe,ingwerfrischer,saucehoisinsauce,muskat,...,haferflockenzarte,joghurttürkischensüzme,pfefferschwarzfrischgemahlen,essigweißweinessig,mangosreife,kartoffelnkleinedrillinge,gnocchiausdemkühlregal,vanilleextrakt,spekulatius,salzgroboderfeinzumbestreuen
3,sesam,sauerteig,fettzumfrittieren,paprikaschotenrotgewürfelt,vanilleschotendasmark,saucenbinderbrauner,honigmelonen,mehlzumbestäubenderarbeitsfläche,ingwerfeingehackt,frühlingszwiebeln,...,zitronenpfeffer,hackfleischvomschweinambestenthüringermett,tomatengewürfelt,zitronenabgeriebeneschaleunbehandelt,stachelbeeren,petersilieoderschnittlauch,kräuterfrischenachbeliebenschnittlauchpetersiliekressedillliebstöckeletc,kiwis,mehlevtlmehrbiszu500g,wasserfürdielauge
4,tomatenmark,knoblauchfeingewürfelt,oreganogetrocknet,honigflüssigen,eigelb,majorangerebbelt,eierleichtverquirlt,erdbeerenfrisch,sojasaucedunkle,mozzarella,...,salatgurken,knoblauchzehengeschält,lebkuchengewürz,zwiebelnodermehr,ananasfrisch,sardellenfilets,naturjoghurtfettarm150g,naturjoghurt35oder15,puddingpulver,weizenmehlvollkorn
5,kasseler,butterfürdiepfanne,brüheklarinstant,lasagneplatten,kartoffelnmehligegeschälte,tomatenfeste,sellerie,handkäse,knoblauchzehenfeingehackt,kirschenevtlschwarzkirschen,...,kräuter,paprikapulveredelsüß,thunfischfiletausderdose,kohlrabi,remoulade,pellkartoffelnvomvortag,hokkaidokürbisse,schokoladebittere,honigoder,natron
6,gemüsefond,safran,mehl405ambestenbackmehl,rum54,apfelinkleinenstücken,eieroder2,schinkenluftgetrocknetzbparmaersatzweisefrühstücksspeck,himbeerengefrorene,wasser,zuckeroderpuderzuckerzumbestäuben,...,brennnesselndiejungenblätter,mehltype405,kräuterquarkmager,stärkemehl,currythaicurry,olivenschwarzeentsteint,avocadosreif,reisjasminoderbasmatireis,champignonshalbiert,meersalz
7,hefe,blattspinattkeinmalgrobgehackt,kartoffelnkleinefestkochendezbsiglinde,karotteninscheibengeschnitten,margarinereformhaus,lebernvomschweine,butterinflöckchen,portweinroter,lauch,broterindetrockenevondunklembrotzerbröckelt,...,chilischotengemahlenoderfrischgehackt,hefefrische,olivenölkaltgepresstes,rosenkohlfrischodertiefgekühlt,paprikaschotengemischt,feigenfrisch,avocadosreife,muskatnussfrischgerieben,kuvertürezartbitter,hefefrisch
8,kartoffelngeviertelt,reisrisottoreisarborioodervialone,kräuterlingezumstreuen,zwiebelnlila,senfsüßer,bier,tomaten400g,steaksrindoderschwein,champignonskleinefrische,aromacitrobackevtl,...,ajvarscharfodermild,knoblauchzehen,trockenhefeoder30gfrischehefe,bierkannruhigeinpilssein,orangendavon1unbehandelt,pastanachwahl,schokoriegelsnickersschokonussriegel,buttervanillearomaoptional,tomatenkleine,wasserlauwarm
9,rahmspinat1tkpack,rinderhackfleisch,frühstücksspeckfeingewürfelt,kakao,mett,graupen,eiervomhuhnweißoderbraun,bandnudelnanderegehennatürlichauch,blattpetersiliegehackte,kondensmilchdosenmilchkaffeesahne,...,peperonimittelscharfvomtürkenladen,petersilie,fetakäseoderanderenschafskäsenachbelieben,salzleichtgehäuft,kondensmilchmilchmädchengezuckerte,olivenölgutes,likörcappucinooptionaleisschmecktmitaberbesser,milchvollmilch,farfalleschmetterlingsnudeln,grünkerndinkelgrobgeschrotet


In [97]:
a_1 =['weiblich', '30-45 Jahre', 'Verheiratet']
a_1 = str(a_1)
a_2 = ['weiblich', '30-45 Jahre', 'Vergeben']
a_2 = str(a_2)
df[[a_1,a_2]]

Unnamed: 0,"['weiblich', '30-45 Jahre', 'Verheiratet']","['weiblich', '30-45 Jahre', 'Vergeben']"
0,fetakäsezerbröckelt,salzundpfefferundknoblauchgewürz
1,keksebutterkekse1dominostein,doppelrahmfrischkäsemitkräutern
2,erbsenfrisch,maronengegarte
3,butterwarme,feldsalatalternativanderekleinblättrigesalatsorten
4,kräuternachgeschmack,tomatencocktailrispentomaten
5,zitronenschaleunbehandelte,fetakäsegewürfeltinöleingelegte
6,zuckerfein,kondensmilch10
7,olivenölkaltgepresst,reiswein
8,saftkarotten,puderzuckeroderzimtzucker
9,haselnüssegemahlene,mandelmilchmandeldrinkalternativhafermilchoä


In [109]:
b_1 = ['männlich', '45-60 Jahre', 'Verheiratet']
b_1 = str(b_1)
columns = list(df.columns)
cols_b = [l for l in columns if "Verheiratet" in l and "60+ Jahre" in l]
cols_b.append(b_1)
df[cols_b]

KeyError: "None of [Index(['['männlich', '45-60 Jahre', 'Verheiratet']'], dtype='object')] are in the [columns]"

In [99]:
cols_c = [l for l in columns if "weiblich" in l and "60+ Jahre" in l]
cols_c += [l for l in columns if "weiblich" in l and "Verwitwet" in l]
df[cols_c]

Unnamed: 0,"['weiblich', '60+ Jahre', 'Geschieden']","['weiblich', '60+ Jahre', 'Single']","['weiblich', '60+ Jahre', 'Vergeben']","['weiblich', '60+ Jahre', 'Verheiratet']","['weiblich', '60+ Jahre', 'Verwitwet']","['weiblich', '30-45 Jahre', 'Verwitwet']","['weiblich', '45-60 Jahre', 'Verwitwet']","['weiblich', '60+ Jahre', 'Verwitwet'].1","['weiblich', '<30 Jahre', 'Verwitwet']"
0,zitronensaft,portwein,milchziegenmindestens32fett,zwetschgenoderpflaumen,joghurtgriechischer,maultaschen,gewürze,joghurtgriechischer,margarinevegane
1,puderzucker50gbuntezuckerstreusel,mais,hirse,wasserkaltesoder1ei,thunfischineigenemsaft,crèmefraîcheoderschmand,wirsing,thunfischineigenemsaft,vollkornweizenmehl
2,joghurttürkischensüzme,pfefferschwarzfrischgemahlen,essigweißweinessig,mangosreife,kartoffelnkleinedrillinge,salzundpfefferweißer,haferflockenzarte,kartoffelnkleinedrillinge,salzgroboderfeinzumbestreuen
3,hackfleischvomschweinambestenthüringermett,tomatengewürfelt,zitronenabgeriebeneschaleunbehandelt,stachelbeeren,petersilieoderschnittlauch,paprikaschoten,zitronenpfeffer,petersilieoderschnittlauch,wasserfürdielauge
4,knoblauchzehengeschält,lebkuchengewürz,zwiebelnodermehr,ananasfrisch,sardellenfilets,gemüsebrüheinstant,salatgurken,sardellenfilets,weizenmehlvollkorn
5,paprikapulveredelsüß,thunfischfiletausderdose,kohlrabi,remoulade,pellkartoffelnvomvortag,schnittlauch,kräuter,pellkartoffelnvomvortag,natron
6,mehltype405,kräuterquarkmager,stärkemehl,currythaicurry,olivenschwarzeentsteint,champignons,brennnesselndiejungenblätter,olivenschwarzeentsteint,meersalz
7,hefefrische,olivenölkaltgepresstes,rosenkohlfrischodertiefgekühlt,paprikaschotengemischt,feigenfrisch,lauch,chilischotengemahlenoderfrischgehackt,feigenfrisch,hefefrisch
8,knoblauchzehen,trockenhefeoder30gfrischehefe,bierkannruhigeinpilssein,orangendavon1unbehandelt,pastanachwahl,wasser,ajvarscharfodermild,pastanachwahl,wasserlauwarm
9,petersilie,fetakäseoderanderenschafskäsenachbelieben,salzleichtgehäuft,kondensmilchmilchmädchengezuckerte,olivenölgutes,semmelbröselodermehlfürdieform,peperonimittelscharfvomtürkenladen,olivenölgutes,grünkerndinkelgrobgeschrotet


- set target as calorie, explore the tags of recipes, use multiclass-classifier

In [100]:
# multiclass classifer
clf_X = new_df.iloc[:, :-5]
y = new_df.iloc[:, -2]

clf = LogisticRegression(max_iter=1000)
clf.fit(clf_X, y)
y_pred = clf.predict(clf_X)

# accuracy 
print("Accuracy:",clf.score(clf_X, y))

Accuracy: 0.6494011923083727


In [101]:
df = pd.DataFrame()
for i in range(len(clf.classes_)):
    if clf.classes_[i] != 'overlap':
        df[clf.classes_[i]] = [vectorizer.feature_names_[ing_id] for ing_id in np.argsort(-clf.coef_[i])[:10]]
df

Unnamed: 0,high_calorie,low_calorie,medium_calorie
0,ananasringe,apfelmusgehtauchohne,amarantpops
1,apfelmusevtlselbstgemacht,apfelmusausdemglas,apfelessigoderweinessig
2,ananasoderfrische,acetobalsamicohell,agavendicksaftoderhonig
3,ananasfrisch,apfelinstücken,apfelgehackt
4,aalegeräuchert,amarettoca,ananassaftdose
5,apfelca80g,anisgemahlen,apfelentkerntundkleingewürfelt
6,apfelmus,acetobalsamicoacetodimodena,ananasinstückenausderdose
7,anstellgut,ajvarscharfodermild,anstellgut
8,anissternanis,ananasinstücken,acetobalsamicoacetodimodena
9,ajvarpaprikapasteausdemglas,ajvaroderpikantetomatensauce,ananasinstücken


### Association rules among the recipe attributes

- the association with "ingredient" and "regions"

In [105]:
countries = ['china', 'indien', 'deutschland','frankreich','grossbritannien','österreich','usaoderkanada','italien','spanien',
             'portugal', 'japen','schweiz','türkei', 'thailand', 'russland', 'großbritannien & irland', 'vietnam', 'korea',
            'australien', 'ägypten', 'marokko', 'niederlande']

In [None]:
df_certain_countries = get_recipe_countries(countries, data)
df_certain_countries.reset_index(inplace = True)
df_certain_countries['label'].value_counts()

In [None]:
# drop all the columns other than "ingredient" and "label" column
columns_drop = ['index', 'categorize', 'recipe_name', 'tags', 'avg_score', 'difficulty','rating_count', 'calorie', 
                'preparation_time','comment_user', 'recipe_url']
df_certain_countries = df_certain_countries.drop(columns_drop, axis = 1)

# preprocess the ingredient column
distinct_ingredients = ingredients_preprocess(df_certain_countries)

# One hot encoding of the ingredients
df_certain_countries['ingredient'] = df_certain_countries.ingredient.apply(convert_to_dict)
vectorizer = DictVectorizer(sparse=False)
new_df_countries = pd.DataFrame(data = vectorizer.fit_transform(df_certain_countries.ingredient.tolist()), columns = distinct_ingredients)
new_df_countries['label'] = df_certain_countries.label

# dummy for the label column
new_df_countries = pd.get_dummies(new_df_countries)

In [None]:
# apply 
new_df_countries

In [None]:
test = new_df_countries.iloc[:,]
test = test.iloc[:4]

In [None]:
country_label = list(new_df_countries.iloc[:,-19:].columns)
#contry_label.remove('label_deutschland')

# record the start time
time1 = time.time()
        
# apply subgroup discovery 
target = ps.BinaryTarget ('label_usaoderkanada', True)
searchspace = ps.create_selectors(test, ignore = country_label)
task = ps.SubgroupDiscoveryTask (
    data, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=20, 
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

# record the end time
time2 = time.time()
time_diff = (time2-time1)/60
print('it took ' + str(time_diff) + 'miniutes to execute the subgroup disc')