In [295]:
from datetime import datetime
import json
import pandas as pd
import numpy as np

In [296]:
df_activities = pd.read_json('activities.json')
movies = pd.read_pickle('full_movies.pkl')

## User/Content Matrices

In [299]:
# Variables for adjusting formula

clicks = 0.3
like = 10
dislike = -10

# current formula is clicks * 0.3 +/-10 (+10 for like and -10 for dislike)

In [300]:
# getting likes/dislikes
likes_dislikes = df_activities.loc[np.where((df_activities['activity'] == 'Like') ^ (df_activities['activity'] == 'Dislike'))]

# latest activity at the bottom
likes_dislikes.sort_values('datetime', inplace = True)

# filtering out all like/dislike that is not at the bottom
latest = likes_dislikes.groupby(['user_id', 'content_id']).last()


# getting the right shape
latest.drop(columns = 'datetime', inplace = True)

unstacked = latest.unstack()

unstacked.columns = unstacked.columns.droplevel()

unstacked.replace({'Dislike':dislike, 'Like':like}, inplace = True)

unstacked.fillna(0, inplace = True)

unstacked

content_id,0,1,3,4,5,7,8,9,13,16,...,480,483,486,487,491,492,493,494,496,498
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,-10.0,0.0,0.0,-10.0,0.0,0.0,0.0,-10.0,0.0,...,-10.0,0.0,0.0,0.0,10.0,0.0,-10.0,-10.0,10.0,-10.0
2,0.0,0.0,0.0,0.0,0.0,10.0,10.0,-10.0,0.0,-10.0,...,10.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,-10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,10.0,0.0,0.0,-10.0,0.0,0.0,0.0,0.0
5,10.0,0.0,10.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0


In [301]:
# turning it into sparse matrix

likes_matrix = np.zeros((5, 501))

users = [i -1 for i in list(pivot_plays.index)]
cols = list(unstacked.columns)

for i in users:
    for j in cols:
        likes_matrix[i, j] = unstacked.loc[i +1, j]
        
likes_matrix

array([[  0., -10.,   0., ..., -10.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0., -10.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [ 10.,   0.,   0., ...,   0.,   0.,   0.]])

In [302]:
# creating sparse matrix for plays

plays = df_activities[df_activities['activity'] == 'Play']

grouped_plays = plays.groupby(['user_id', 'content_id']).count()

pivot_plays = grouped_plays.pivot_table(index='user_id', columns = 'content_id', values = 'activity')

pivot_plays.fillna(0, inplace = True)

plays_matrix = np.zeros((5, 501))

users = [i for i in range(5)]
cols = list(pivot_plays.columns)

for i in users:
    for j in cols:
        plays_matrix[i, j] = pivot_plays.loc[i+1, j] * clicks
        
plays_matrix

array([[0. , 0. , 0. , ..., 0. , 0. , 0.3],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [303]:
scores_matrix = plays_matrix + likes_matrix

scores_df = pd.DataFrame(scores_matrix)

scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,500
0,0.0,-10.0,0.0,0.0,0.0,-10.0,0.0,0.0,0.0,0.0,...,10.0,0.0,-10.0,-10.0,0.0,10.0,0.0,-10.0,0.0,0.3
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,10.3,-10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,-10.0,0.0,0.0,0.0,0.0,...,0.0,-10.0,0.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10.0,0.0,0.0,10.0,-10.0,0.0,0.0,0.0,0.0,0.0,...,0.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Stuff below is probably garbage but not scrapping it yet to be safe

In [285]:
unstacked.columns = unstacked.columns.droplevel()

unstacked

ValueError: Cannot remove 1 levels from an index with 1 levels: at least one level must be left.

In [223]:
unstacked.replace({'Dislike':-10, 'Like':10}, inplace = True)

In [224]:
pivot_plays

content_id,0,3,7,8,19,21,34,37,39,43,...,460,465,466,478,480,483,486,487,491,496
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [225]:
pivot_plays.fillna(0, inplace = True)
unstacked.fillna(0, inplace = True)

In [226]:
likes = unstacked

likes

content_id,0,1,3,4,5,7,8,9,13,16,...,480,483,486,487,491,492,493,494,496,498
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,-10.0,0.0,0.0,-10.0,0.0,0.0,0.0,-10.0,0.0,...,-10.0,0.0,0.0,0.0,10.0,0.0,-10.0,-10.0,10.0,-10.0
2,0.0,0.0,0.0,0.0,0.0,10.0,10.0,-10.0,0.0,-10.0,...,10.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,-10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,10.0,0.0,0.0,-10.0,0.0,0.0,0.0,0.0
5,10.0,0.0,10.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0


In [227]:
pivot_plays

content_id,0,3,7,8,19,21,34,37,39,43,...,460,465,466,478,480,483,486,487,491,496
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [228]:
plays_matrix = np.zeros((5, 500))

cols = list(pivot_plays.columns)

list(pivot_plays.index)

users = [i -1 for i in list(pivot_plays.index)]

users

[0, 1, 2, 3, 4]

In [230]:
plays_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [231]:
unstacked

content_id,0,1,3,4,5,7,8,9,13,16,...,480,483,486,487,491,492,493,494,496,498
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,-10.0,0.0,0.0,-10.0,0.0,0.0,0.0,-10.0,0.0,...,-10.0,0.0,0.0,0.0,10.0,0.0,-10.0,-10.0,10.0,-10.0
2,0.0,0.0,0.0,0.0,0.0,10.0,10.0,-10.0,0.0,-10.0,...,10.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,-10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,10.0,0.0,0.0,-10.0,0.0,0.0,0.0,0.0
5,10.0,0.0,10.0,-10.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,10.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0


In [232]:
likes_matrix = np.zeros((5, 500))

cols = list(unstacked.columns)

for i in users:
    for j in cols:
        likes_matrix[i, j] = unstacked.loc[i +1, j]

In [234]:
plays_matrix = plays_matrix * 0.3

plays_matrix

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0.3, 0. , 0. , ..., 0. , 0. , 0. ]])

In [235]:
plays = np.array(pivot_plays)

likes = np.array(unstacked)

In [236]:
likes_matrix

array([[  0., -10.,   0., ...,   0., -10.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [  0., -10.,   0., ...,   0.,   0.,   0.],
       [  0.,   0.,   0., ...,   0.,   0.,   0.],
       [ 10.,   0.,   0., ...,   0.,   0.,   0.]])

In [237]:
scores = plays_matrix + likes_matrix

scores

array([[  0. , -10. ,   0. , ...,   0. , -10. ,   0. ],
       [  0. ,   0. ,   0. , ...,   0. ,   0. ,   0. ],
       [  0. , -10. ,   0. , ...,   0. ,   0. ,   0. ],
       [  0. ,   0. ,   0. , ...,   0. ,   0. ,   0. ],
       [ 10.3,   0. ,   0. , ...,   0. ,   0. ,   0. ]])

In [189]:
np.unique(scores)

array([-2.,  0.,  2.,  4.])

In [190]:
scores_df = pd.DataFrame(scores, columns = [i for i in range(500)])

In [191]:
scores_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,-0.0,-0.0,0.0,2.0,0.0,-0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,-0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,0.0,0.0,2.0,-0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [198]:
scores_v2 = np.where(scores < 0, -5, scores)

np.unique(scores_v2)

array([-5.,  0.,  2.,  4.])

In [197]:
temp = pd.DataFrame(np.where(scores < 0, -5, scores))

temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,-0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,-0.0,-0.0,0.0,2.0,0.0,-0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,-0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2.0,0.0,0.0,2.0,-0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
np.unique()

In [77]:
pivot_rating = latest.pivot_table(index='user_id', columns = 'content_id', values = 'activity')

pivot_rating

  pivot_rating = latest.pivot_table(index='user_id', columns = 'content_id', values = 'activity')


content_id
user_id
1
2
3
4
5


In [None]:
df_activities['content_id']

In [42]:
df_activities

Unnamed: 0,content_id,activity,user_id,datetime
0,374,Dislike,2,2023-03-19 13:39:37.899777
1,366,Like,3,2023-03-19 13:39:37.900176
2,316,Like,4,2023-03-19 13:39:37.900207
3,167,Dislike,2,2023-03-19 13:39:37.900227
4,22,Play,3,2023-03-19 13:39:37.900244
...,...,...,...,...
495,424,Play,2,2023-03-19 13:39:37.912798
496,355,Play,5,2023-03-19 13:39:37.912815
497,83,Dislike,4,2023-03-19 13:39:37.912844
498,247,Play,2,2023-03-19 13:39:37.912863


In [43]:
df_activities.dtypes

content_id             int64
activity              object
user_id                int64
datetime      datetime64[ns]
dtype: object

In [38]:
df_activities.sort_values(by = 'datetime')


TypeError: '<' not supported between instances of 'str' and 'Timestamp'

In [35]:
df_act

Unnamed: 0,content_id,activity,user_id,datetime
0,374,Dislike,2,2023-03-19 13:39:37.899777
1,366,Like,3,2023-03-19 13:39:37.900176
2,316,Like,4,2023-03-19 13:39:37.900207
3,167,Dislike,2,2023-03-19 13:39:37.900227
4,22,Play,3,2023-03-19 13:39:37.900244
...,...,...,...,...
496,355,Play,5,2023-03-19 13:39:37.912815
497,83,Dislike,4,2023-03-19 13:39:37.912844
498,247,Play,2,2023-03-19 13:39:37.912863
499,488,Play,3,2023-03-19 13:39:37.912892


In [66]:
df_activities.loc[len(df_activities)] = [374, "Like", 2, "2023-03-19 13:39:37.899778"]

TypeError: unhashable type: 'list'

In [8]:
Likes = df_activities[df_activities['activity'] == 'Like']

In [12]:
grouped_likes = Likes.groupby(['user_id', 'content_id']).count()

In [28]:
pivot_likes = grouped_likes.pivot_table(index='user_id', columns = 'content_id', values = 'activity')

In [25]:
pivot_plays

content_id,0,3,7,8,19,21,34,37,39,43,...,460,465,466,478,480,483,486,487,491,496
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,1.0,,,,1.0,,...,1.0,1.0,,1.0,,,,,1.0,1.0
2,,,1.0,1.0,,,,,,1.0,...,,,1.0,,1.0,,,1.0,,
3,,,,,,1.0,1.0,,,,...,,,,1.0,,,,,,
4,,,,,,,,1.0,,,...,,1.0,,,,,1.0,,,
5,1.0,1.0,,,,,,,,,...,,,,,,1.0,,1.0,,


In [29]:
Dislikes = df_activities[df_activities['activity'] == 'Dislike']