In [110]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [132]:
# Load the interactions table; the path is relative to the notebook's working directory.
df_interactions = pd.read_csv('../data/interactions.csv')

In [114]:
# Non-null count per column; identical counts across columns mean no column has missing values.
df_interactions.count()

user_id      1124532
recipe_id    1124532
rating       1124532
dtype: int64

In [133]:
# Load the revised interactions export (the source of the `date` column merged in below).
# NOTE(review): this file is written by a cell near the end of this notebook, so a fresh
# top-to-bottom run only works if the file already exists on disk.
df_original_interactions = pd.read_csv("../data/revised_interactions.csv")

In [120]:
# Non-null count per column; a column with a lower count than the others contains missing values.
df_original_interactions.count()

user_id      1124532
recipe_id    1124532
rating       1124532
review       1124365
date         1124532
dtype: int64

In [134]:
# Attach each interaction's `date` from the revised export, matching on the
# (user_id, recipe_id) pair.
# Any `date` column left over from a previous run of this cell is dropped first, so
# re-running the cell does not produce `date_x` / `date_y` duplicates.
# NOTE(review): this is an inner join — unmatched rows are dropped and duplicated key
# pairs would multiply rows; compare the row count before and after to confirm neither happened.
df_merged = df_interactions.drop(columns=["date"], errors="ignore").merge(
    df_original_interactions[["user_id", "recipe_id", "date"]],
    on=["user_id", "recipe_id"],
)
df_interactions = df_merged

In [135]:
# Sanity check after the merge: non-null count per column, including the newly attached `date`.
df_interactions.count()

user_id      1124532
recipe_id    1124532
rating       1124532
date         1124532
dtype: int64

In [136]:
# Load the per-recipe popularity table; it is joined onto the reviews by `recipe_id` further down.
df_recipe_popularity = pd.read_csv('../data/recipe_popularity.csv')

In [137]:
# Number of distinct (non-null) users that appear in the interactions.
df_interactions[["user_id"]].nunique()

user_id    224715
dtype: int64

In [77]:
# The 100 users with the most ratings, most active first.
(
    df_interactions
    .groupby(by=["user_id"])
    .count()
    .sort_values(by="rating", ascending=False)
    .reset_index()
    .head(100)
)

Unnamed: 0,user_id,recipe_id,rating
0,6043f4dcb8458600693a374d,7665,7665
1,60cbbe3bed6fcb006949d796,5587,5587
2,60cbbe00348edc006a22c826,4621,4621
3,60cbbdb8ed6fcb006949d771,4073,4073
4,60cbbd7c7f82d100681b3386,3912,3912
...,...,...,...
95,2875f5879dad4118aff113d0,1010,1010
96,64bbfd442dd8434f8bd71234,994,994
97,05f01b5ce879451c9deb2c37,991,991
98,ed5ae65ae48d424a8bfc4332,982,982


In [138]:
# Number of ratings per user, aggregated once and reused for both orderings below
# (the original ran the same groupby/count over the full table twice).
ratings_per_user = df_interactions.groupby(["user_id"])["rating"].count()

# User ids ordered from most to fewest ratings.
df_top_reviewers = ratings_per_user.sort_values(ascending=False).reset_index()["user_id"]
# Optional subsampling knob, currently disabled: keep only the N most active users.
#df_top_reviewers = df_top_reviewers.head(250)

# User ids ordered from fewest to most ratings.
df_bottom_reviewers = ratings_per_user.sort_values(ascending=True).reset_index()["user_id"]
# Optional subsampling knob, currently disabled: keep only the N least active users.
#df_bottom_reviewers = df_bottom_reviewers.head(5000)

In [139]:
# Union of the two reviewer selections, each user id kept once.
# With both `.head(...)` limits above commented out, this is every user.
df_reviewers = pd.concat(
    [df_top_reviewers, df_bottom_reviewers],
).drop_duplicates()

In [140]:
# Keep only interactions written by the selected reviewers and renumber the rows from 0.
df_reviews = (
    df_interactions[df_interactions["user_id"].isin(df_reviewers)]
    .reset_index(drop=True)
)

In [141]:
# Non-null count per column after the reviewer filter; compare with the pre-filter counts to see how many rows were kept.
df_reviews.count()

user_id      1124532
recipe_id    1124532
rating       1124532
date         1124532
dtype: int64

In [51]:
# Non-null count per column of the popularity table (one count per column, so gaps would show as a lower number).
df_recipe_popularity.count()

recipe_id              231316
submitted              231316
rating_count           231316
rating_sum             231316
rating_avg             231316
rating_weighted_avg    231316
rank                   231316
dtype: int64

In [142]:
# Enrich every review with its recipe's popularity statistics.
# Left join: every review row is kept even when its recipe has no popularity entry.
df_joined = pd.merge(
    df_reviews,
    df_recipe_popularity[
        [
            "recipe_id",
            "rating_count",
            "rating_sum",
            "rating_avg",
            "rating_weighted_avg",
            "rank",
        ]
    ],
    how="left",
    on=["recipe_id"],
)

In [143]:
# Non-null count per column after the left join; popularity columns with a lower count than
# `recipe_id` would indicate reviews whose recipe had no popularity row.
df_joined.count()

user_id                1124532
recipe_id              1124532
rating                 1124532
date                   1124532
rating_count           1124532
rating_sum             1124532
rating_avg             1124532
rating_weighted_avg    1124532
rank                   1124532
dtype: int64

In [144]:
# Prefix every user id with "auth0|" using vectorized string operations instead of a
# per-row Python lambda.
# An already-present "auth0|" prefix is stripped first, so re-running this cell never
# produces "auth0|auth0|...".
df_joined["user_id"] = "auth0|" + (
    df_joined["user_id"].astype(str).str.replace(r"^auth0\|", "", regex=True)
)

In [145]:
# Preview the first rows to confirm the "auth0|" prefix and the joined popularity columns.
df_joined.head()

Unnamed: 0,user_id,recipe_id,rating,date,rating_count,rating_sum,rating_avg,rating_weighted_avg,rank
0,auth0|37a18c3939764ac9b4d72bda,b355f2ab-a8e3-4c69-833c-3ac31f5b0022,4,2017-09-04,701,3013,4.298146,4.303293,214372
1,auth0|794e065d91d74983bbec1e2d,5d063835-3353-4072-8742-754a2b6931bf,4,2007-02-05,4,17,4.25,4.342316,176924
2,auth0|b17d1cd98ba34cb89cf03827,4fd380ff-2998-418b-aa0f-7bf262452e5b,5,2010-03-06,3,10,3.333333,4.311412,206756
3,auth0|bc88bec56fa54115ae355987,b4a5a1b7-3015-4a5b-ae59-9f79fb6d7bb5,5,2008-04-17,51,243,4.764706,4.505832,619
4,auth0|bfa42caef31f42f2871ebd8d,4f128c81-e2ab-449c-91ca-194ab31b2d6e,5,2009-04-16,14,64,4.571429,4.37919,22109


In [146]:
# Random 80/20 split of the joined interactions.
# A fixed `random_state` makes the split — and therefore the train/test CSVs exported
# from it — reproducible across kernel restarts.
train, test = train_test_split(df_joined, test_size=0.2, random_state=42)

In [147]:
# Export both splits as CSV; `index=False` keeps the DataFrame index out of the files.
train.to_csv('../data/ml_dotnet_interactions_train.csv', index=False)
test.to_csv('../data/ml_dotnet_interactions_test.csv', index=False)

In [75]:
# Export only the user/recipe/rating triple.
# NOTE(review): the following cell writes the full `df_joined` to this same path, so on a
# top-to-bottom run this three-column file is overwritten.
df_joined[['user_id', 'recipe_id', 'rating']].to_csv('../data/new_ml_dotnet_interactions.csv', index=False)

In [109]:
# Export every column of `df_joined`.
# NOTE(review): same destination as the preceding three-column export; this write replaces it.
df_joined.to_csv('../data/new_ml_dotnet_interactions.csv', index=False)

In [15]:
# Export the user/recipe/rating triple from `df_interactions`.
# NOTE(review): the last cell of this notebook writes `df_revised_interactions` to this same
# path; whichever cell runs last determines the file's contents.
df_interactions[['user_id', 'recipe_id', 'rating']].to_csv('../data/ml_dotnet_interactions.csv', index=False)

In [12]:
# All interactions of one specific (prefixed) user id.
# NOTE(review): this only matches once `df_interactions["user_id"]` carries the "auth0|"
# prefix, which is applied by a cell that appears *below* this one — confirm run order.
df_tmp = df_interactions[df_interactions["user_id"] == 'auth0|6043f4dcb8458600693a374d']

In [14]:
# Non-null count per column for the single-user slice, i.e. how many interactions that user has.
df_tmp.count()

user_id      7665
recipe_id    7665
rating       7665
dtype: int64

In [10]:
# Prefix every user id with "auth0|" using vectorized string operations instead of a
# per-row Python lambda.
# An already-present "auth0|" prefix is stripped first, so re-running this cell never
# produces "auth0|auth0|...".
df_interactions["user_id"] = "auth0|" + (
    df_interactions["user_id"].astype(str).str.replace(r"^auth0\|", "", regex=True)
)

In [19]:
# Ten users with the most interactions.
# The second reset_index() keeps each user's position in the alphabetically grouped
# frame as an `index` column next to the new 0..9 row labels.
df_interactions_count_by_user_id = (
    df_interactions
    .groupby(by=['user_id'])['recipe_id']
    .count()
    .reset_index()
    .sort_values(by=['recipe_id'], ascending=False)
    .reset_index()
    .head(10)
)

In [20]:
# Display the ten most active users computed in the previous cell.
df_interactions_count_by_user_id

Unnamed: 0,index,user_id,recipe_id
0,84420,6043f4dcb8458600693a374d,7665
1,178441,cb75fcf8511e495f84088d67,5587
2,85651,61b19ce00265408e90f00ff1,4621
3,29352,217fafc0c55a469d8c9db938,4073
4,123195,8cad7584ce2d4915a8501cc0,3912
5,66752,4c05437200ee4c72a8b87e8a,3346
6,67894,4d506f1bab294bf8b0b1e1e0,3286
7,130526,94fb03af3a3a486a9330f736,3105
8,191512,da508cd928924cbaa0f5e540,3014
9,152894,ae99544697e74e8d9f70ae49,2890


In [33]:
# Re-key nine of the ten most active users to replacement ids.
# - One dict-based replace on `user_id` substitutes all ids in a single pass, instead of
#   ten chained whole-frame scans that also searched every other column.
# - The original chain's first step mapped '6043f4dcb8458600693a374d' to itself (a no-op),
#   so that id is simply left untouched here.
# - `assign` returns a new frame, so `df_interactions` itself is not modified.
df_revised_interactions = df_interactions.assign(
    user_id=df_interactions["user_id"].replace(
        {
            'cb75fcf8511e495f84088d67': '60cbbe3bed6fcb006949d796',
            '61b19ce00265408e90f00ff1': '60cbbe00348edc006a22c826',
            '217fafc0c55a469d8c9db938': '60cbbdb8ed6fcb006949d771',
            '8cad7584ce2d4915a8501cc0': '60cbbd7c7f82d100681b3386',
            '4c05437200ee4c72a8b87e8a': '60cbbd3fd2fc940071e41a2e',
            '4d506f1bab294bf8b0b1e1e0': '60cbbd04348edc006a22c7ce',
            '94fb03af3a3a486a9330f736': '60cbbcd3348edc006a22c7b6',
            'da508cd928924cbaa0f5e540': '60cbbc6c348edc006a22c794',
            'ae99544697e74e8d9f70ae49': '60c90b265d89a500699d0111',
        }
    )
)

In [34]:
# Ten users with the most interactions after the id replacement; the counts should match
# the pre-replacement top ten, only under the new ids.
df_revised_interactions_count_by_user_id = (
    df_revised_interactions
    .groupby(by=['user_id'])['recipe_id']
    .count()
    .reset_index()
    .sort_values(by=['recipe_id'], ascending=False)
    .reset_index()
    .head(10)
)

In [35]:
# Display the revised top ten; the frame was already cut to ten rows in the previous cell,
# so head(10) shows all of it.
df_revised_interactions_count_by_user_id.head(10)

Unnamed: 0,index,user_id,recipe_id
0,84417,6043f4dcb8458600693a374d,7665
1,84881,60cbbe3bed6fcb006949d796,5587
2,84880,60cbbe00348edc006a22c826,4621
3,84879,60cbbdb8ed6fcb006949d771,4073
4,84878,60cbbd7c7f82d100681b3386,3912
5,84877,60cbbd3fd2fc940071e41a2e,3346
6,84876,60cbbd04348edc006a22c7ce,3286
7,84875,60cbbcd3348edc006a22c7b6,3105
8,84874,60cbbc6c348edc006a22c794,3014
9,84860,60c90b265d89a500699d0111,2890


In [36]:
# Persist the re-keyed interactions with all columns.
# This is the file that the cell near the top of the notebook loads as `df_original_interactions`.
df_revised_interactions.to_csv('../data/revised_interactions.csv', index=False)

In [40]:
# Export only the user/recipe/rating triple of the re-keyed interactions.
# NOTE(review): an earlier cell writes `df_interactions` to this same path; whichever cell
# runs last determines the file's contents.
df_revised_interactions[['user_id', 'recipe_id', 'rating']].to_csv('../data/ml_dotnet_interactions.csv', index=False)