In [1]:
import re
import warnings
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd
from scipy import stats
# Silence DeprecationWarning and FutureWarning for the whole notebook.
# NOTE(review): this also hides pandas FutureWarnings about behaviour changes
# (e.g. chained in-place operations), so such problems will not surface as warnings.
warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", FutureWarning)

In [2]:
# Load the recipe table and the per-review table.
df = pd.read_csv('recipes.csv')
reviews = pd.read_csv('reviews.csv')

# Per-recipe review summary: number of reviews, mean rating and the raw list
# of ratings. All three are computed in ONE groupby so they are aligned by
# RecipeId rather than stitched together by positional index.
# The frame is deliberately not called `stats`: that name is the `scipy.stats`
# module imported at the top of the notebook, and rebinding it would shadow it.
review_stats = (
    reviews.groupby('RecipeId')['Rating']
    .agg(N_reviews='size', avg_rating='mean', all_ratings=list)
    .reset_index()
)

# Left merge keeps every recipe; recipes without any review get NaN in
# N_reviews and are removed by the >= 50 filter on the next line.
df = df.merge(review_stats, on='RecipeId', how='left')
df = df.loc[df['N_reviews'] >= 50]

In [3]:
# Matches one double-quoted element of an R character vector, allowing
# backslash-escaped characters (e.g. \") inside the quotes.
_R_QUOTED_ITEM = re.compile(r'"((?:[^"\\]|\\.)*)"')


def parse_ingredients(ingredient_string):
    """Parse an R-style character vector string into a list of strings.

    The expected input looks like ``c("item one", "item two")``.

    Args:
        ingredient_string: The raw cell value. Anything that is not a string
            (for example NaN for a missing cell) is treated as "no items".

    Returns:
        A list with one string per quoted element, in order. Commas inside a
        quoted element are preserved. ``character(0)`` (how R prints an empty
        character vector), an empty string and non-string input all give
        ``[]``. Unquoted entries that sit next to quoted ones (such as a bare
        ``NA``) are dropped. If the value contains no quoted element at all,
        it is split on ``', '`` as a fallback.
    """
    if not isinstance(ingredient_string, str):
        return []

    text = ingredient_string.strip()
    if text == 'character(0)':
        return []
    # Only strip the c( ... ) wrapper when it is actually there, so a bare
    # single value does not lose its first and last characters.
    if text.startswith('c(') and text.endswith(')'):
        text = text[2:-1]

    quoted_items = _R_QUOTED_ITEM.findall(text)
    if quoted_items:
        # Undo the escaping of embedded double quotes.
        return [item.replace('\\"', '"') for item in quoted_items]

    # No quoted elements: fall back to a plain comma split, skipping blanks.
    return [part.strip() for part in text.split(', ') if part.strip()]


# Both columns were parsed with identical logic, so a single parser is applied
# to each of them instead of defining the same function twice.
df['parsed_ingredients'] = df['RecipeIngredientParts'].apply(parse_ingredients)
df['parsed_instructions'] = df['RecipeInstructions'].apply(parse_ingredients)

In [4]:
# Recipe categories treated as not plant-based; every recipe filed under one
# of them is removed from the working frame.
categories_not_plant_based = [
    'Chicken', 'Meat', 'Pork', 'Chicken Breast', 'Cheese',
    'Poultry', 'Cheesecake', 'Steak', 'Crab',
]
in_excluded_category = df['RecipeCategory'].isin(categories_not_plant_based)
df = df.loc[~in_excluded_category]

In [5]:
# Flatten the per-recipe ingredient lists into one long list.
all_ingredients = []
for recipe_ingredients in df['parsed_ingredients']:
    all_ingredients.extend(recipe_ingredients)

# Tally how often each ingredient string occurs across all recipes.
ingredient_counts = Counter(all_ingredients)

# The 500 most frequent ingredients, as (ingredient, count) tuples.
most_common_500 = ingredient_counts.most_common(500)

In [6]:
# Ingredient names treated as not plant-based.
# The `plant_based` flag computed in the next cell tests each parsed ingredient
# against this list with `in`, i.e. exact, case-sensitive string equality, so
# every spelling variant that should count has to be listed explicitly.
# NOTE(review): presumably hand-picked from the most frequent ingredients
# (`most_common_500`) -- confirm how the list was curated.
non_vegan_ingredients = [
 'butter',
 'eggs',
 'milk',
 'egg',
 'parmesan cheese',
 'unsalted butter',
 'sour cream',
 'honey',
 'cream cheese',
 'mayonnaise',
 'cheddar cheese',
 'chicken broth',
 'bacon',
 'heavy cream',
 'Worcestershire sauce',
 'ground beef',
 'buttermilk',
 'mozzarella cheese',
 'shortening',
 'cheese',
 'half-and-half',
 'evaporated milk',
 'shrimp',
 'sharp cheddar cheese',
 'skim milk',
 'monterey jack cheese',
 'feta cheese',
 'sweetened condensed milk',
 'plain yogurt',
 'ham',
 'swiss cheese',
 'beef broth',
 'chicken',
 'lean ground beef',
 'chicken breasts',
 'boneless skinless chicken breasts',
 'Cool Whip',
 'ricotta cheese',
 'red wine',
 'vanilla ice cream',
 'heavy whipping cream',
 'low sodium chicken broth',
 'fish sauce',
 'hard-boiled eggs',
 'blue cheese',
 'salmon fillets',
 'cottage cheese',
 'ground turkey',
 'nonfat milk',
 'yogurt',
 'graham cracker crumbs',
 'half-and-half cream',
 'romano cheese',
 'cooked ham',
 'tuna',
 'Velveeta cheese',
 'large shrimp',
 'provolone cheese',
 'parmigiano-reggiano cheese',
 'white chocolate chips',
 'unflavored gelatin',
 'ground pork',
 'medium shrimp',
 'prosciutto',
 'beef',
 'boneless skinless chicken breast',
 'low-fat milk',
 'chicken thighs',
 '2% low-fat milk',
 'gruyere cheese',
 'Miracle Whip',
 'ghee',
 'creme fraiche',
 'hamburger',
 'salmon',
 'sausage',
 'light sour cream',
 'oyster sauce',
 'light mayonnaise',
 'Italian sausage',
 'boneless skinless chicken breast halves',
 'turkey',
 '1% low-fat milk',
 'chicken breast',
 'monterey jack pepper cheese',
 'boneless skinless chicken thighs',
'yoghurt',
 'fresh mozzarella cheese',
 # NOTE(review): 'sage' is a herb -- confirm it is meant to mark a recipe as
 # not plant-based (possibly a slip next to 'sausage').
 'sage',
 'lamb',
 'low-fat cheddar cheese',
 'beef bouillon cubes',
 'smoked salmon',
 'reduced-sodium chicken broth',
 'fat-free cool whip',
 'reduced-fat cream cheese']

In [7]:
# A recipe counts as plant-based when none of its parsed ingredients appears
# in `non_vegan_ingredients` (exact string match).
df['plant_based'] = df['parsed_ingredients'].apply(
    lambda ingredients: not any(item in non_vegan_ingredients for item in ingredients)
)

In [8]:
# RecipeCategory values kept by the `isin(categories)` filter applied two cells below.
categories = ['Meal']

In [9]:
# Fold several source categories into coarser labels.
category_merges = {
    'Lunch/Snacks': 'Meal',
    'One Dish Meal': 'Meal',
    '< 60 Mins': 'Meal',
    '< 30 Mins': 'Meal',
    '< 15 Mins': 'Meal',
    'Stew': 'Soup',
}

# Assign the result back instead of calling `.replace(..., inplace=True)` on
# `df['RecipeCategory']`: the in-place form operates on an intermediate Series
# and, under pandas Copy-on-Write, leaves `df` itself unchanged. None of the
# target labels is also a source label, so one dict-based replace gives the
# same mapping as the six sequential replacements.
df = df.assign(RecipeCategory=df['RecipeCategory'].replace(category_merges))

In [10]:
# Keep only recipes in a whitelisted category that also have a mean rating.
in_kept_category = df['RecipeCategory'].isin(categories)
has_avg_rating = df['avg_rating'].notna()
df = df.loc[in_kept_category & has_avg_rating]

In [11]:
# Tokenize the names into words
df['Name_Tokens'] = df['Name'].apply(lambda x: set(x.split()))

# Generate all possible pairs of rows
pairs = list(combinations(df.index, 2))

# Calculate the intersection size for each pair
pair_overlaps = []
for (i, j) in pairs:
    overlap_size = len(df.loc[i, 'Name_Tokens'].intersection(df.loc[j, 'Name_Tokens']))
    pair_overlaps.append(((i, j), overlap_size))

# Sort the pairs based on the size of the intersection (highest first)
pair_overlaps.sort(key=lambda x: x[1], reverse=True)

In [12]:
# Keep the 30000 highest-overlap pairs. Note that the variable names say
# "1000" but the slice below is the actual cut-off.
top_1000_pairs = pair_overlaps[:30000]

# Tabulate them (nothing is displayed here): one row per pair, holding the
# (label, label) tuple and the overlap size.
top_1000_pairs_df = pd.DataFrame(
    data=top_1000_pairs,
    columns=['Pair', 'Overlap_Size'],
)

In [13]:
def make_decription(row):
    """Build the three-line text description of one recipe.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing the string
            'Name' and the lists of strings 'parsed_ingredients' and
            'parsed_instructions'.

    Returns:
        The name, an "Ingredients: ..." line (comma-separated, ending with a
        period) and an "Instructions: ..." line (space-separated), joined by
        newlines.
    """
    title = row['Name']
    ingredients_line = 'Ingredients: ' + ", ".join(row['parsed_ingredients']) + '.'
    instructions_line = 'Instructions: ' + " ".join(row['parsed_instructions'])
    return '\n'.join([title, ingredients_line, instructions_line])
# Build the text description for every recipe; axis=1 hands each row to the function.
df['description'] = df.apply(make_decription, axis = 1)

In [14]:
# Keep this import local to the cell: it guarantees `stats` refers to
# scipy.stats here even if the name was rebound earlier in the notebook.
from scipy import stats

cnt = 0          # number of pairs that pass both rating filters
condition = 0    # condition code of the most recently accepted pair
conditions = []  # condition code of every accepted pair, in order
list_pairs = []  # one metadata dict per accepted pair

# Condition code by how many of the two recipes are plant-based:
# both -> 1, exactly one -> 2, neither -> 3.
condition_by_plant_count = {2: 1, 1: 2, 0: 3}

for first_label, second_label in top_1000_pairs_df['Pair']:
    item1 = df.loc[first_label]
    item2 = df.loc[second_label]

    # Filter 1: the mean ratings must differ by more than half a point.
    rating_gap = np.abs(item1['avg_rating'] - item2['avg_rating'])
    if not (rating_gap > 0.5):
        continue

    # Filter 2: the difference must be significant at the 5% level according
    # to an independent two-sample t-test on the raw ratings.
    p_value = stats.ttest_ind(item1['all_ratings'], item2['all_ratings'])[1]
    if not (p_value < 0.05):
        continue

    cnt += 1

    plant_count = int(bool(item1['plant_based'])) + int(bool(item2['plant_based']))
    condition = condition_by_plant_count[plant_count]
    conditions.append(condition)

    # Key order fixes the column order of the resulting table.
    list_pairs.append({
        'id_1': item1['RecipeId'],
        'id_2': item2['RecipeId'],
        'plant_based_1': item1['plant_based'],
        'plant_based_2': item2['plant_based'],
        'text_1': item1['description'],
        'text_2': item2['description'],
        'actual_score_1': item1['avg_rating'],
        'actual_score_2': item2['avg_rating'],
        'condition': condition,
    })

In [15]:
# One row per accepted pair; the columns are the keys of the pair dicts.
df_pairs = pd.DataFrame(list_pairs)

In [16]:
# Keep at most the first 500 accepted pairs, in the order they were collected.
df_pairs = df_pairs.head(500)

In [17]:
# Write the selected pairs to disk.
# NOTE(review): `reset_index()` adds the old index as an `index` column and
# `to_csv` (default index=True) writes the new index as well, so the file
# carries two index-like columns -- confirm that downstream readers expect this.
df_pairs.reset_index().to_csv('pairs_metadata.csv')