In [1]:
# the library used
import numpy as np
import pandas as pd
import pysubgroup as ps
import re
from sklearn.feature_extraction import DictVectorizer
import time

## Helper Function

In [2]:
def tags_preprocess(tags):
    """
    input: tag string
    output: list of individual tags in the given tag string
    function: preprocess a single tag string 
    """
    tags = tags.replace("'","")
    tags = tags.replace(" ","")
    tags = tags.replace("[","")
    tags = tags.replace("]","")
    tags = tags.split(",")
    tags = [x.lower() for x in tags]
    return tags

In [3]:
def ingredients_preprocess(dataframe):
    """
    input: dataframe
    output: list of distinct ingredient list
    function: preprocess the ingredient columns and return a list of distinct ingredients
    """
    distinct_ingredients = []
    
    for i in range(len(dataframe)):
        ingredients = dataframe.iloc[i]['ingredient']
        
        r = re.compile('[A-Z]{1}[a-zA-Z]+')
        ingredients = str(ingredients)
        #ingredients = ''.join(i for i in ingredients if not i.isdigit())
        ingredients = ingredients.replace("'","")
        ingredients = ingredients.replace(" ","")
        ingredients = ingredients.replace("[","")
        ingredients = ingredients.replace("]","")
        # remove text inside parentheses
        ingredients = re.sub(r'\([^())]*\)',"", ingredients)
        ingredients = ingredients.split(",")
        ingredients = list(filter(r.match, ingredients))
        ingredients = [x.lower() for x in ingredients]
        distinct_ingredients += ingredients
        dataframe.iloc[i]['ingredient'] = ingredients
        
    return list(set(distinct_ingredients))

In [4]:
def get_recipe_countries(countries, data):
    """
    input: list of countries, dataframe
    output: selected dataframe whose recipes is from these countries
    function: select the rows in dataframe whose "tag" value contain one country tag
    """
    # add a new column class 
    drop_index = []
    for i in range(len(data)):
        tags = data.loc[i]["tags"]
        tags = tags_preprocess(tags)
        
        country_same =[l for l in countries if l in tags]
            
        if len(country_same) == 1:
            data.at[i, 'label'] = country_same[0]
        if len(country_same) == 0:
            drop_index.append(i)
        if len(country_same) > 1:
            drop_index.append(i)
            #data.at[i, 'label'] = 'overlap'
            
    # drop the columns which has no season tags
    data = data.drop(data.index[drop_index])
    return data

In [5]:
def convert_to_dict(arr):
    """
    Helper function to convect an array of ingredients to a dictionary
    """
    d={}
    for a in arr:
        d[a]=1
    return d

## Subgroup Discovery

In this section we will use subgroup discovery to explore the association rules between attributes

- why we choose subgroup discovery?

because we find out that subgroup discovery is quite powerful compared to other data mining techniques. As long as we set differnt target with different search space, we can use use subgroup discovery technique to dig almost all interesting pattern that we want to explore from the dataset. 


In [6]:
# read the data
data = pd.read_csv("/Users/xujingjing/Desktop/2020 Sommer/praktikum/Data/all_data.csv")

In [7]:
data.head()

Unnamed: 0,categorize,recipe_name,tags,avg_score,difficulty,ingredient,rating_count,calorie,preparation_time,comment_user,recipe_url
0,Menüart,"""A bis Z""-Salat","['Salat', 'Gemüse', 'Sommer', 'Vegetarisch', '...",2.67,simpel,"['1m.-große', 'Zucchini', '1', 'Apfel,säuerlic...",1.0,,15 Min,"[{'rating': 'rating-3', 'comment_time': '24.09...",https://www.chefkoch.de/rezepte/23694213760282...
1,Menüart,"""Aufgehende Sonne""","['Asien', 'Hauptspeise', 'Nudeln', 'Beilage', ...",3.6,normal,"['500g', 'Nudeln(Mie-),ReisnudelnoderReis', '4...",3.0,500kcal,30 Min,"[{'rating': 'rating-5', 'comment_time': '10.08...",https://www.chefkoch.de/rezepte/18838313064020...
2,Regional,"""Black and White"" New York Cheesecake","['Backen', 'USAoderKanada', 'Kuchen']",0.0,normal,"['250g', 'Bitterschokolade,70%', '300g', 'Schl...",0.0,,60 Min,,https://www.chefkoch.de/rezepte/36158315436795...
3,Menüart,"""Bleib gesund""-Smoothie SuperNova","['Getränk', 'Vegetarisch', 'Frühstück', 'kalor...",4.03,simpel,"['1m.-große', 'Karotte(n)', '1', 'Apfel', '1St...",84.0,205kcal,5 Min,"[{'rating': 'rating-1', 'comment_time': '13.01...",https://www.chefkoch.de/rezepte/20464013311985...
4,Menüart,"""Bottermelk Anballerste""","['Hauptspeise', 'Europa', 'Suppe', 'gebunden',...",3.6,normal,"['500g', 'Kartoffel(n)', '1', 'Zwiebel(n)', '1...",3.0,,15 Min,"[{'rating': 'rating-3', 'comment_time': '10.11...",https://www.chefkoch.de/rezepte/33875015040182...


### Association rules between comment user information and recipe attributes

### Association rules among the recipe attributes

- the association with "ingredient" and "regions"

In [8]:
countries = ['china', 'indien', 'deutschland','frankreich','grossbritannien','österreich','usaoderkanada','italien','spanien',
             'portugal', 'japen','schweiz','türkei', 'thailand', 'russland', 'großbritannien & irland', 'vietnam', 'korea',
            'australien', 'ägypten', 'marokko', 'niederlande']

In [9]:
df_certain_countries = get_recipe_countries(countries, data)
df_certain_countries.reset_index(inplace = True)
df_certain_countries['label'].value_counts()

deutschland      4288
italien          3035
usaoderkanada    1410
österreich       1062
frankreich       1023
spanien           675
china             667
indien            628
thailand          559
türkei            501
portugal          357
schweiz           341
marokko           174
russland          162
niederlande       108
australien         93
korea              65
vietnam            56
ägypten            51
Name: label, dtype: int64

In [10]:
# drop all the columns other than "ingredient" and "label" column
columns_drop = ['index', 'categorize', 'recipe_name', 'tags', 'avg_score', 'difficulty','rating_count', 'calorie', 
                'preparation_time','comment_user', 'recipe_url']
df_certain_countries = df_certain_countries.drop(columns_drop, axis = 1)

# preprocess the ingredient column
distinct_ingredients = ingredients_preprocess(df_certain_countries)

# One hot encoding of the ingredients
df_certain_countries['ingredient'] = df_certain_countries.ingredient.apply(convert_to_dict)
vectorizer = DictVectorizer(sparse=False)
new_df_countries = pd.DataFrame(data = vectorizer.fit_transform(df_certain_countries.ingredient.tolist()), columns = distinct_ingredients)
new_df_countries['label'] = df_certain_countries.label

# dummy for the label column
new_df_countries = pd.get_dummies(new_df_countries)

In [12]:
new_df_countries

Unnamed: 0,mehlundbutterfürdieform,wildschwein,menge:1/3einertasse,kasselernackenbratenohneknochen,weizen-grieß,rochenflügel,champignonsoder6normalgroße,ingwer-stück,putenkeuleca.1,klöße,...,label_portugal,label_russland,label_schweiz,label_spanien,label_thailand,label_türkei,label_usaoderkanada,label_vietnam,label_ägypten,label_österreich
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
15251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
15252,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
15253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0


In [98]:
test = new_df_countries.iloc[:,]
test = test.iloc[:4]

In [99]:
country_label = list(new_df_countries.iloc[:,-19:].columns)
#contry_label.remove('label_deutschland')

# record the start time
time1 = time.time()
        
# apply subgroup discovery 
target = ps.BinaryTarget ('label_usaoderkanada', True)
searchspace = ps.create_selectors(test, ignore = country_label)
task = ps.SubgroupDiscoveryTask (
    data, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=20, 
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

# record the end time
time2 = time.time()
time_diff = (time2-time1)/60
print('it took ' + str(time_diff) + 'miniutes to execute the subgroup disc')

KeyError: 'label_usaoderkanada'

#### Existing problems with subgroup
- one-versus-other multiplier classifier
- would tf-idf preprocessed feature improve the quality?
- another problem: for more than 6000 features the pysubgroup runs pretty slow

In [88]:
import pysubgroup as ps

# Load the example dataset
from pysubgroup.tests.DataSets import get_titanic_data
data = get_titanic_data()

target = ps.BinaryTarget ('Survived', True)
searchspace = ps.create_selectors(data, ignore=['Survived'])
task = ps.SubgroupDiscoveryTask (
    data, 
    target, 
    searchspace, 
    result_set_size=5, 
    depth=2, 
    qf=ps.WRAccQF())
result = ps.BeamSearch().execute(task)

In [89]:
print(result.to_dataframe())

    quality                       description
0  0.132150                     Sex=='female'
1  0.101331        Parch==0 AND Sex=='female'
2  0.079142    Sex=='female' AND SibSp: [0:1[
3  0.077663  Cabin.isnull() AND Sex=='female'
4  0.071746   Embarked=='S' AND Sex=='female'


In [90]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
151,152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22.0,1,0,113776,66.6000,C2,S
152,153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.0500,,S
153,154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5000,,S
154,155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S
