In [None]:
# df_1 = only contains user ids for people who have watched and rated a movie
# df_2 = contains all movie ids

In [3]:
import datetime
import os
import pprint
import tempfile
from typing import Dict, Text

import numpy as np
import pandas as pd
import seaborn as sns
import surprise
import tensorflow as tf
import tensorflow_recommenders as tfrs

import tensorflow_datasets as tfds

In [4]:
# Load the raw engagement events from the CSV in the working directory,
# then show the frame as the cell output.
df_raw = pd.read_csv(filepath_or_buffer="lovevery_mle_homework_content_engagement.csv")
df_raw

Unnamed: 0,user_id,content_id,event_timestamp,event_type
0,1438eb1414,f260f3f3ce,1.642347e+09,viewed
1,5380d76552,f260f3f3ce,1.642538e+09,viewed
2,f600b96610,f260f3f3ce,1.643294e+09,engaged
3,395ee54b4e,bce874342a,1.642697e+09,viewed
4,dcbb8d2a04,f260f3f3ce,1.641336e+09,viewed
...,...,...,...,...
484072,3638877804,a842a97d1a,1.644180e+09,viewed
484073,e1903bb0ba,5c57c25574,1.642185e+09,viewed
484074,f5fa01d9b5,58bb410bc8,1.643659e+09,engaged
484075,0090363528,5c57c25574,1.640085e+09,viewed


## Make Engaged DataFrame
* Every record in the raw data is either a view or an engagement.
* The rate at which a user engages with the content they are shown serves as an implicit rating.
* As seen in the pivot table of user_id vs. content_id, not every content/user_id combination has been presented. These missing combinations are what the model tries to predict.

In [5]:
# Count how many "viewed" events each (user_id, content_id) pair has. Returns 197,562 records
# Every pair present in df_raw is kept; a pair with no "viewed" events gets a count of 0.
# The boolean flag is summed with the built-in groupby aggregation rather than a
# per-group Python lambda, which is slow with this many groups.
df_viewed = (
    df_raw.assign(viewed_count=df_raw["event_type"].eq("viewed"))
    .groupby(["user_id", "content_id"], as_index=False)["viewed_count"]
    .sum()
)

In [6]:
# Count how many "engaged" events each (user_id, content_id) pair has. Returns 197,562 records
# Every pair present in df_raw is kept; a pair with no "engaged" events gets a count of 0.
# The boolean flag is summed with the built-in groupby aggregation rather than a
# per-group Python lambda, which is slow with this many groups.
df_engaged = (
    df_raw.assign(engaged_count=df_raw["event_type"].eq("engaged"))
    .groupby(["user_id", "content_id"], as_index=False)["engaged_count"]
    .sum()
)

In [7]:
# Combine the per-pair view counts and engagement counts into one frame,
# matching rows on the (user_id, content_id) key.
df = df_viewed.merge(df_engaged, how="inner", on=["user_id", "content_id"])

In [9]:
# A pair can have more "engaged" events than "viewed" events, so neither count
# alone is the total. The number of times a user has seen the content is the
# sum of the two columns.
df["shown_count"] = df["engaged_count"].add(df["viewed_count"])

In [10]:
# Share of a pair's shown events that were engagements (engaged / shown).
# This ratio is used as a proxy for a rating.
df["engaged_pct"] = df["engaged_count"].div(df["shown_count"])

In [14]:
# Inspect the pairs ordered from least shown to most shown.
df.sort_values("shown_count", ascending=True)

Unnamed: 0,user_id,content_id,viewed_count,engaged_count,shown_count,engaged_pct
0,000a544834,34caa29b86,1,0,1,0.000000
98451,82216a2c7d,bd1201fb2a,0,1,1,1.000000
98444,82216a2c7d,86091a3dd6,0,1,1,1.000000
98440,82216a2c7d,639ae113f9,0,1,1,1.000000
98438,82216a2c7d,5149306338,1,0,1,0.000000
...,...,...,...,...,...,...
70255,5e0fb13f54,8b53482986,20,9,29,0.310345
163060,d39bca4a9c,0e5f24c27b,35,0,35,0.000000
163065,d39bca4a9c,99c6cf7932,19,32,51,0.627451
136619,b104fe8d85,563c1fdf8f,24,33,57,0.578947


In [19]:
# Ratings frame for the model: engaged_pct plays the role of the rating.
# NOTE: this rebinds df_engaged, which earlier held the per-pair engaged counts.
df_engaged = df[["user_id", "content_id", "engaged_pct"]]
# Number of distinct content ids in the ratings frame.
df_engaged["content_id"].nunique(dropna=False)

583

## Make Content_id Dataframe for Model
Includes the ids of all content.

In [15]:
# One row per distinct content_id found in the raw events, in order of first
# appearance. The single column keeps the default label 0.
df_content = pd.DataFrame(pd.unique(df_raw["content_id"]))

In [16]:
# Display the content-id frame as the cell output.
df_content

Unnamed: 0,0
0,f260f3f3ce
1,bce874342a
2,bbf2753b99
3,a9b6c93c3a
4,0c93cc1338
...,...
578,2ca0fce09f
579,5c57c25574
580,58bb410bc8
581,a842a97d1a
