In [2]:
import pandas as pd

In [3]:
# Load the raw event log. Per the rendered output, each row is one event with
# user_id, content_id, event_timestamp and event_type ("viewed" / "engaged").
# NOTE(review): event_timestamp looks like Unix epoch seconds — confirm.
df_raw = pd.read_csv(filepath_or_buffer="content_engagement.csv")
df_raw

Unnamed: 0,user_id,content_id,event_timestamp,event_type
0,1438eb1414,f260f3f3ce,1.642347e+09,viewed
1,5380d76552,f260f3f3ce,1.642538e+09,viewed
2,f600b96610,f260f3f3ce,1.643294e+09,engaged
3,395ee54b4e,bce874342a,1.642697e+09,viewed
4,dcbb8d2a04,f260f3f3ce,1.641336e+09,viewed
...,...,...,...,...
484072,3638877804,a842a97d1a,1.644180e+09,viewed
484073,e1903bb0ba,5c57c25574,1.642185e+09,viewed
484074,f5fa01d9b5,58bb410bc8,1.643659e+09,engaged
484075,0090363528,5c57c25574,1.640085e+09,viewed


# Build Datasets

## Make Content_id Dataframe for Model
Includes the IDs of all content that appears in the raw data.

In [None]:
# Build a one-column lookup table holding every distinct content_id found in
# the raw events (in order of first appearance), for loading into the model.
# Naming the column at construction avoids renaming the default column `0`.
df_content = pd.DataFrame({"content_id": df_raw["content_id"].unique()})

# Persist the table; index=False keeps the positional index out of the file.
df_content.to_csv("content.csv", index=False)

# The preview goes last: a notebook cell only renders its final expression, so
# a `.head()` placed before `to_csv` is evaluated but never displayed.
df_content.head()

## Make ratings/engaged DataFrames for loading into models
* Every record in the raw data is either a view or an engagement.
* The rate at which a user engages with content they see is used as a proxy for an explicit rating.
* As seen in the pivot table of user_id vs. content_id, not all content & user_id combinations have been presented. These unseen combinations are what the model tries to predict.
* All users in the dataset have engaged with at least one piece of content.

In [4]:
# Count how many "viewed" events each (user_id, content_id) pair has.
# One row per pair is returned (197,562 records), including pairs whose
# viewed_count is 0 because they only have "engaged" events.
#
# A 0/1 indicator column summed with the built-in groupby `sum` replaces
# `groupby().apply(lambda ...)`, which invokes a Python callback once per group.
# The explicit int64 cast keeps the count an integer column.
df_viewed = (
    df_raw.assign(viewed_count=df_raw["event_type"].eq("viewed").astype("int64"))
    .groupby(["user_id", "content_id"], as_index=False)["viewed_count"]
    .sum()
)

In [5]:
# Count how many "engaged" events each (user_id, content_id) pair has.
# One row per pair is returned (197,562 records), including pairs whose
# engaged_count is 0 because they only have "viewed" events.
#
# A 0/1 indicator column summed with the built-in groupby `sum` replaces
# `groupby().apply(lambda ...)`, which invokes a Python callback once per group.
# The explicit int64 cast keeps the count an integer column.
df_engaged = (
    df_raw.assign(engaged_count=df_raw["event_type"].eq("engaged").astype("int64"))
    .groupby(["user_id", "content_id"], as_index=False)["engaged_count"]
    .sum()
)

In [6]:
# Put the per-pair view and engagement counts side by side, matching rows on
# the (user_id, content_id) key.
df = df_viewed.merge(df_engaged, how="inner", on=["user_id", "content_id"])

In [7]:
# Views and engagements are logged as separate events, so engaged_count can
# exceed viewed_count. The total number of times a user was shown a piece of
# content is therefore the sum of the two counters.
df["shown_count"] = df["viewed_count"].add(df["engaged_count"])

In [8]:
# Share of impressions that were engagements; stands in for an explicit rating.
# Despite the "_pct" name the value is a fraction between 0 and 1.
df["engaged_pct"] = df["engaged_count"].div(df["shown_count"])

In [9]:
# Inspect the pairs ordered from least to most impressions.
df.sort_values("shown_count")

Unnamed: 0,user_id,content_id,viewed_count,engaged_count,shown_count,engaged_pct
0,000a544834,34caa29b86,1,0,1,0.000000
98451,82216a2c7d,bd1201fb2a,0,1,1,1.000000
98444,82216a2c7d,86091a3dd6,0,1,1,1.000000
98440,82216a2c7d,639ae113f9,0,1,1,1.000000
98438,82216a2c7d,5149306338,1,0,1,0.000000
...,...,...,...,...,...,...
70255,5e0fb13f54,8b53482986,20,9,29,0.310345
163060,d39bca4a9c,0e5f24c27b,35,0,35,0.000000
163065,d39bca4a9c,99c6cf7932,19,32,51,0.627451
136619,b104fe8d85,563c1fdf8f,24,33,57,0.578947


In [10]:
# engaged_pct -> ratings: keep only the key columns plus the rating proxy.
df_ratings = df.loc[:, ["user_id", "content_id", "engaged_pct"]]

# Number of distinct content ids that received a rating.
df_ratings["content_id"].nunique(dropna=False)

583

In [42]:
# Inspect the ratings ordered from lowest to highest engagement fraction.
df_ratings.sort_values("engaged_pct")

Unnamed: 0,user_id,content_id,engaged_pct
0,000a544834,34caa29b86,0.0
102574,871f8b4bb4,5a5ad99eed,0.0
102573,871f8b4bb4,5149306338,0.0
102571,871f8b4bb4,304b40d2ce,0.0
102570,871f8b4bb4,264877c95d,0.0
...,...,...,...
24709,202b6476c8,713c7bbbd1,1.0
134296,add81aadd2,7e5b252782,1.0
134297,add81aadd2,86091a3dd6,1.0
134273,add81aadd2,1226a12aab,1.0


In [16]:
# Implicit feedback: number of "engaged" events per (user_id, content_id).
# Only pairs with at least one engagement appear (120,242 combinations).
engaged_events = df_raw.loc[df_raw["event_type"] == "engaged"]
df_implicit = (
    engaged_events
    .groupby(["user_id", "content_id"])
    .size()
    .reset_index(name="count")
)

# Attach the engagement fraction computed for every user/content pair; the
# inner join keeps only the pairs that have engagements.
df_ratings_2 = df_implicit.merge(
    df_ratings,
    how="inner",
    on=["user_id", "content_id"],
)

# Persist the result; index=False keeps the positional index out of the file.
df_ratings_2.to_csv('ratings.csv', index=False)

# EDA on features

In [22]:
# Number of distinct users in the raw event log.
df_raw["user_id"].nunique(dropna=False)

7252

In [24]:
# Number of distinct users in the ratings table, for comparison with the raw
# user count.
df_ratings["user_id"].nunique(dropna=False)

7252

In [25]:
# Show only the raw events whose type is "engaged".
df_raw.loc[df_raw["event_type"].eq("engaged")]

Unnamed: 0,user_id,content_id,event_timestamp,event_type
2,f600b96610,f260f3f3ce,1.643294e+09,engaged
7,89f6e598dd,f260f3f3ce,1.640744e+09,engaged
9,8e977d13e6,f260f3f3ce,1.643512e+09,engaged
11,aacec99e3b,bce874342a,1.640243e+09,engaged
12,2bc1d7645d,f260f3f3ce,1.643375e+09,engaged
...,...,...,...,...
484054,55831c2909,a842a97d1a,1.643135e+09,engaged
484057,71991c6693,a842a97d1a,1.643847e+09,engaged
484059,9f20828599,58bb410bc8,1.641692e+09,engaged
484061,085cc8db5a,58bb410bc8,1.641873e+09,engaged


In [31]:
# Total number of events (views + engagements) per (user_id, content_id) pair.
(
    df_raw.groupby(["user_id", "content_id"])
    .size()
    .rename("count")
    .reset_index()
)

Unnamed: 0,user_id,content_id,count
0,000a544834,34caa29b86,1
1,000a544834,3d0e786812,1
2,000a544834,5c7ee2dd80,2
3,000a544834,6f7ca40e85,2
4,000a544834,745115bd62,1
...,...,...,...
197557,fff5c815f7,b11fec4c92,1
197558,fff5c815f7,b2f77c9143,3
197559,fff5c815f7,bed9cfbff5,1
197560,fff5c815f7,c29f53db23,1


In [None]:
# df_ratings has 197,562 unique combinations of user_id & content_id,
# but only 120,242 of those combinations include at least one engagement.

In [40]:
#df_raw.groupby(['user_id', 'content_id']).apply(lambda x: x['event_type']=='engaged')

In [41]:
# Absolute number of events of each type; set normalize=True for proportions.
df_raw["event_type"].value_counts(normalize=False)

viewed     292033
engaged    192044
Name: event_type, dtype: int64