In [20]:
import ast
import csv
import pickle
import random
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk import word_tokenize, pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

In [3]:
# data files
# All paths are relative, so the notebook must be run from the directory
# that contains the `data/` folder.
# Only the RAW_* and interactions_* files are read by the cells visible in
# this notebook; the PP_* paths are defined but not used in those cells.
pp_recipes_file = "data/PP_recipes.csv"
pp_users_file = "data/PP_users.csv"
raw_interactions_file = "data/RAW_interactions.csv"
raw_recipes_file = "data/RAW_recipes.csv"
interactions_train_file = "data/interactions_train.csv"
interactions_validation_file = "data/interactions_validation.csv"
interactions_test_file = "data/interactions_test.csv"
# Disabled path for the ingredient map pickle (not loaded anywhere in the visible cells).
# ingr_map_file = "data/ingr_map.pkl"

In [4]:
# Load the raw interactions table (comma-separated, pandas' default delimiter).
df = pd.read_csv(raw_interactions_file)

# Sanity check: row count, then the first record as a plain dict.
first_row = df.iloc[0]
print(df.shape[0])
print(first_row.to_dict())

df.head()

1132367
{'user_id': 38094, 'recipe_id': 40893, 'date': '2003-02-17', 'rating': 4, 'review': 'Great with a salad. Cooked on top of stove for 15 minutes.Added a shake of cayenne and a pinch of salt.  Used low fat sour cream.  Thanks.'}


Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."


In [6]:
# for each (user_id, recipe_id) pair, record the review
# The three columns are walked in lockstep rather than with DataFrame.iterrows(),
# which builds a Series object for every row and is very slow on a table this size.
# If a (user_id, recipe_id) pair occurs more than once, the last occurrence wins.
# Review values are stored exactly as pandas loaded them (no string conversion here).
reviews_per_user_item = {
    (int(user_id), int(recipe_id)): review
    for user_id, recipe_id, review in zip(df['user_id'], df['recipe_id'], df['review'])
}


In [8]:
# Load the raw recipes table (comma-separated, pandas' default delimiter).
# NOTE: this rebinds `df`, replacing the interactions frame loaded earlier.
df = pd.read_csv(raw_recipes_file)

# Sanity check: row count, then the first record as a plain dict.
first_row = df.iloc[0]
print(df.shape[0])
print(first_row.to_dict())

df.head()

231637
{'name': 'arriba   baked winter squash mexican style', 'id': 137739, 'minutes': 55, 'contributor_id': 47892, 'submitted': '2005-09-16', 'tags': "['60-minutes-or-less', 'time-to-make', 'course', 'main-ingredient', 'cuisine', 'preparation', 'occasion', 'north-american', 'side-dishes', 'vegetables', 'mexican', 'easy', 'fall', 'holiday-event', 'vegetarian', 'winter', 'dietary', 'christmas', 'seasonal', 'squash']", 'nutrition': '[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]', 'n_steps': 11, 'steps': "['make a choice and proceed with recipe', 'depending on size of squash , cut into half or fourths', 'remove seeds', 'for spicy squash , drizzle olive oil or melted butter over each cut squash piece', 'season with mexican seasoning mix ii', 'for sweet squash , drizzle melted honey , butter , grated piloncillo over each cut squash piece', 'season with sweet mexican spice mix', 'bake at 350 degrees , again depending on size , for 40 minutes up to an hour , until a fork can easily pierce the skin', 

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8


In [9]:
# for each recipe_id, record the steps (as a string)
# also record other features: n_steps, n_ingredients, minutes, n_recipe_words

recipes_per_item = {}   # recipe_id -> all steps joined into one space-separated string
features_per_item = {}  # recipe_id -> [n_steps, n_ingredients, minutes, n_recipe_words]

# Iterate the needed columns in lockstep; DataFrame.iterrows() would build a
# Series per row, which is needlessly slow for a table of this size.
for recipe_id, minutes, n_steps, n_ingredients, steps_repr in zip(
        df['id'], df['minutes'], df['n_steps'], df['n_ingredients'], df['steps']):
    recipe_id = int(recipe_id)

    # The 'steps' cell holds the text of a Python list literal. ast.literal_eval
    # parses literals only, so unlike eval() it cannot execute code that might
    # be embedded in the CSV file.
    steps_text = ' '.join(ast.literal_eval(steps_repr))

    # Word count ignores commas and splits on whitespace.
    n_recipe_words = len(steps_text.replace(',', '').split())

    recipes_per_item[recipe_id] = steps_text
    features_per_item[recipe_id] = [int(n_steps), int(n_ingredients), int(minutes), n_recipe_words]
    

In [10]:
# Load the training split of the interactions (comma-separated, pandas' default).
# NOTE: this rebinds `df`, replacing the recipes frame loaded earlier.
df = pd.read_csv(interactions_train_file)

# Sanity check: row count, then the first record as a plain dict.
first_row = df.iloc[0]
print(df.shape[0])
print(first_row.to_dict())

df.head()

698901
{'user_id': 2046, 'recipe_id': 4684, 'date': '2000-02-25', 'rating': 5.0, 'u': 22095, 'i': 44367}


Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723


In [11]:
# select 20000 entries (i.e. (user_id, recipe_id) pairs) as training set
n_training = 20000

# Build (user_id, recipe_id, rating) triples in file order. The columns are
# zipped directly instead of using DataFrame.iterrows(), which creates a Series
# per row and is very slow on the full training table.
u_i_r = [
    (int(user_id), int(recipe_id), float(rating))
    for user_id, recipe_id, rating in zip(df['user_id'], df['recipe_id'], df['rating'])
]

# set the seed so the shuffle, and therefore the selected sample, is repeatable
random.seed(66)

random.shuffle(u_i_r)

# The first n_training triples of the shuffled list form the training sample.
train_u_i_r = u_i_r[:n_training]

In [16]:
# collect all user_ids and recipe_ids in training set, and map them to contiguous integers starting from 1
# in validation/test set, if a user_id/recipe_id is not seen in training set, map it to 0
# (index 0 is never assigned below, so it is free to act as the "unseen" index)

train_user_ids = {user_id for user_id, _item_id, _rating in train_u_i_r}
train_item_ids = {item_id for _user_id, item_id, _rating in train_u_i_r}

# Sort before enumerating so the id -> index assignment is fully determined by
# the ids themselves instead of by the iteration order of a set.
user_id_to_u = {user_id: u for u, user_id in enumerate(sorted(train_user_ids), start=1)}
item_id_to_i = {item_id: i for i, item_id in enumerate(sorted(train_item_ids), start=1)}

# calculate average rating on training set
# also calculate average rating per user/item on training set
# (all of the dictionaries below are keyed by the raw ids, not the mapped indices)
ratings_per_user = defaultdict(list)
ratings_per_item = defaultdict(list)

for user_id, item_id, rating in train_u_i_r:
    ratings_per_user[user_id].append(rating)
    ratings_per_item[item_id].append(rating)

avg_rating_per_user = {
    user_id: np.mean(ratings) for user_id, ratings in ratings_per_user.items()
}
avg_rating_per_item = {
    item_id: np.mean(ratings) for item_id, ratings in ratings_per_item.items()
}

# Global mean over the training sample.
avg_rating = np.mean([rating for _user_id, _item_id, rating in train_u_i_r])

In [17]:
# Training-sample summary, one value per line: the global mean rating, the
# number of recipes with a per-recipe average, and the number of users with a
# per-user average.
print(avg_rating, len(avg_rating_per_item), len(avg_rating_per_user), sep='\n')

4.5703
15521
6922


In [29]:
# Header row for the generated CSV files (written first by write_data_to_csv).
# The order must stay in sync with the tuple that generate_dataset appends for
# each interaction, because rows are written positionally.
columns = [
    'u', 'i', 'rating', 'user_avg_rating', 'recipe_avg_rating',
    'n_steps', 'n_ingredients', 'minutes', 'n_recipe_words',
    'n_review_words', 'n_positive', 'n_negative', 'n_exclamation',
    'recipe_tokens', 'review_tokens'
]

In [43]:
# NOTE(review): this import sits mid-notebook rather than in the import cell at
# the top; consider moving it there so dependencies are visible up front.
from transformers import DistilBertTokenizer

# Tokenizer used by generate_dataset to encode recipe and review text.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Initialize the VADER sentiment analyzer
# NOTE(review): presumably requires the NLTK `vader_lexicon` data (and `punkt`
# for word_tokenize) to be installed beforehand; confirm on a fresh environment.
sia = SentimentIntensityAnalyzer()

def count_positive_negative(sentence):
    """Count the positively and negatively scored words in a sentence.

    The sentence is split with ``word_tokenize`` and every token is scored on
    its own with the module-level analyzer ``sia``. A token counts as positive
    when its ``compound`` score is above zero and as negative when it is below
    zero; tokens scoring exactly zero are ignored.

    Args:
        sentence: text to analyse.

    Returns:
        A ``(positive_count, negative_count)`` tuple of ints.
    """
    # One compound score per token, each token scored in isolation.
    compound_scores = [
        sia.polarity_scores(token)['compound'] for token in word_tokenize(sentence)
    ]

    n_positive_words = sum(1 for score in compound_scores if score > 0)
    n_negative_words = sum(1 for score in compound_scores if score < 0)

    return n_positive_words, n_negative_words

def generate_dataset(u_i_r):
    """Build one feature row per (user_id, recipe_id, rating) triple.

    Every row is a tuple whose fields follow the order of the module-level
    ``columns`` list: mapped user index, mapped recipe index, rating, the
    user's and the recipe's average rating, recipe features, review features,
    and the tokenizer output for the recipe text and the review text.

    Ids that are absent from the training-derived lookups fall back to index 0
    (for ``u`` / ``i``) and to the global ``avg_rating`` (for the averages).

    Args:
        u_i_r: iterable of ``(user_id, recipe_id, rating)`` triples using the
            raw ids.

    Returns:
        A list of 15-field tuples, one per input triple, in input order.

    Raises:
        KeyError: if a recipe id is missing from ``recipes_per_item`` or
            ``features_per_item``, or the (user_id, recipe_id) pair is missing
            from ``reviews_per_user_item``.
    """
    dataset = []
    for u_id, i_id, rating in u_i_r:
        # 0 is the index reserved for users/recipes not seen in training.
        u = user_id_to_u.get(u_id, 0)
        i = item_id_to_i.get(i_id, 0)

        # NOTE(review): when called on the training triples these averages were
        # computed from the same triples, so they include the row's own rating;
        # confirm this is intended and not unwanted target leakage.
        user_avg_rating = avg_rating_per_user.get(u_id, avg_rating)
        recipe_avg_rating = avg_rating_per_item.get(i_id, avg_rating)

        recipe = recipes_per_item[i_id]
        n_steps, n_ingredients, minutes, n_recipe_words = features_per_item[i_id]

        raw_review = reviews_per_user_item[(u_id, i_id)]
        # An empty review cell is loaded by pandas as NaN. str(NaN) is the
        # literal word "nan", which would be counted as one word and tokenized
        # as text, so a missing review is treated as an empty string instead.
        review = '' if pd.isna(raw_review) else str(raw_review)

        # Word count ignores commas and splits on whitespace.
        n_review_words = len(review.replace(',', '').split())
        n_positive, n_negative = count_positive_negative(review)
        n_exclamation = review.count('!')

        # Both texts are truncated to at most 512 tokens.
        recipe_tokens = tokenizer.encode(recipe, truncation=True, max_length=512)
        review_tokens = tokenizer.encode(review, truncation=True, max_length=512)

        dataset.append((u, i, rating, user_avg_rating, recipe_avg_rating,
                        n_steps, n_ingredients, minutes, n_recipe_words,
                        n_review_words, n_positive, n_negative, n_exclamation,
                        recipe_tokens, review_tokens))

    return dataset

In [None]:
# build training set
# One feature tuple per sampled training interaction. Each row is scored with
# the sentiment analyzer word by word and tokenized twice, so this cell does
# the heavy lifting for the 20000-row sample.
train_set = generate_dataset(train_u_i_r)

In [31]:
# utility function to write csv file

def write_data_to_csv(csv_file_name, datasets, header=None):
    """Write a header row followed by the rows of several datasets to a CSV file.

    The file is created or overwritten and is always written as UTF-8.

    Args:
        csv_file_name: path of the CSV file to write.
        datasets: iterable of datasets; each dataset is an iterable of rows
            (sequences of field values). Datasets are written back to back in
            the order given.
        header: optional sequence of column names for the first row. When
            omitted, the module-level ``columns`` list is used.
    """
    if header is None:
        # Keep the original behaviour: fall back to the notebook-wide column list.
        header = columns

    # newline='' lets the csv module control line endings itself.
    with open(csv_file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)

        writer.writerow(header)

        for dataset in datasets:
            writer.writerows(dataset)

In [32]:
# Persist the training rows; train_set is wrapped in a list because
# write_data_to_csv expects an iterable of datasets.
write_data_to_csv('train.csv', [train_set])

In [39]:
# Load the validation split of the interactions (comma-separated, pandas' default).
# NOTE: this rebinds `df`, replacing the training frame loaded earlier.
df = pd.read_csv(interactions_validation_file)

# Sanity check: row count, then the first record as a plain dict.
first_row = df.iloc[0]
print(df.shape[0])
print(first_row.to_dict())

df.head()

7023
{'user_id': 76535, 'recipe_id': 33627, 'date': '2005-02-15', 'rating': 4.0, 'u': 5, 'i': 177317}


Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,76535,33627,2005-02-15,4.0,5,177317
1,160497,75307,2005-10-24,4.0,23,170785
2,930021,100961,2008-11-30,4.0,31,165555
3,58439,154105,2007-03-24,4.0,44,177453
4,628951,14525,2008-02-16,5.0,45,142367


In [44]:
# generate validation set
# (user_id, recipe_id, rating) triples in file order, using the raw ids.
valid_u_i_r = [
    (int(user_id), int(recipe_id), float(rating))
    for user_id, recipe_id, rating in zip(df['user_id'], df['recipe_id'], df['rating'])
]

# Same feature pipeline as the training rows, then written to its own file.
valid_set = generate_dataset(valid_u_i_r)
write_data_to_csv('valid.csv', [valid_set])

In [47]:
# check user_avg_rating (row field 3) and item_avg_rating (row field 4):
# print the number of distinct values each takes over the validation rows.
# A count of 1 would suggest every row received the same value, e.g. the
# global-average fallback used for ids unseen in training.
for field_index in (3, 4):
    print(len({row[field_index] for row in valid_set}))

128
1


In [49]:
# Load the test split of the interactions (comma-separated, pandas' default).
# NOTE: this rebinds `df`, replacing the validation frame loaded earlier.
df = pd.read_csv(interactions_test_file)

# Sanity check: row count, then the first record as a plain dict.
first_row = df.iloc[0]
print(df.shape[0])
print(first_row.to_dict())

df.head()

12455
{'user_id': 8937, 'recipe_id': 44551, 'date': '2005-12-23', 'rating': 4.0, 'u': 2, 'i': 173538}


Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,8937,44551,2005-12-23,4.0,2,173538
1,56680,126118,2006-10-07,4.0,16,177847
2,349752,219596,2008-04-12,0.0,26,89896
3,628951,82783,2007-11-13,2.0,45,172637
4,92816,435013,2013-07-31,3.0,52,177935


In [50]:
# generate test set
# (user_id, recipe_id, rating) triples in file order, using the raw ids.
test_u_i_r = [
    (int(user_id), int(recipe_id), float(rating))
    for user_id, recipe_id, rating in zip(df['user_id'], df['recipe_id'], df['rating'])
]

# Same feature pipeline as the training rows, then written to its own file.
test_set = generate_dataset(test_u_i_r)
write_data_to_csv('test.csv', [test_set])

In [52]:
# check user_avg_rating (row field 3) and item_avg_rating (row field 4):
# print the number of distinct values each takes over the test rows.
# A count of 1 would suggest every row received the same value, e.g. the
# global-average fallback used for ids unseen in training.
for field_index in (3, 4):
    print(len({row[field_index] for row in test_set}))

126
1
