In [15]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from collections import defaultdict

In [16]:
# Location of the sampled ratings table, kept in one named place.
DATASET_PATH = 'dataset/sample_df.csv'
data = pd.read_csv(DATASET_PATH)

In [17]:
# Show the loaded ratings frame as this cell's output.
data

Unnamed: 0,item,user,rating,timestamp
0,0972683275,A1MNILX1J1NWV7,4.0,1464825600
1,0972683275,A1JPB4BJHNCSWR,5.0,1409184000
2,0972683275,A2A3B0JX2JRS6S,5.0,1379462400
3,0972683275,A14O9P3DV5EHKO,3.0,1359331200
4,0972683275,A2BLGQ2SCSKCCY,5.0,1358812800
...,...,...,...,...
111242,B01H4OYJNI,A2QIFKUBK053ZC,5.0,1531008000
111243,B01H4YTHWG,A2SOSDOBI6AHJI,5.0,1537920000
111244,B01H4ZLZLQ,A3CHC46D65IH4H,4.0,1536019200
111245,B01HGV7M7Y,A1F7KIIWHPQ5TG,5.0,1537142400


In [18]:
# Work on an independent copy so the loaded frame stays untouched.
df = data.copy(deep=True)

In [40]:
# Number of distinct items in the ratings table.
df['item'].nunique()

13283

In [19]:
# Distinct user and item ids, in order of first appearance in the table.
unique_users, unique_items = (data[column].unique() for column in ('user', 'item'))

In [20]:
# Encode every raw user / item id as its position in the corresponding
# unique-id array. The containers stay defaultdict(int), so looking up an
# unseen id still yields 0 rather than raising.
user_dict = defaultdict(int, ((user, i) for i, user in enumerate(unique_users)))
item_dict = defaultdict(int, ((item, i) for i, item in enumerate(unique_items)))

In [23]:
# One frame per user (same order as unique_users), each sorted by timestamp
# and re-indexed from 0.
historic_users = [
    df[df['user'] == user].sort_values('timestamp').reset_index(drop=True)
    for user in unique_users
]

# Add each row's position to its timestamp so equal timestamps become distinct.
for user_history in historic_users:
    user_history['timestamp'] = user_history['timestamp'] + user_history.index

In [39]:
# Inspect the history frame stored at position 1 of historic_users.
historic_users[1]

Unnamed: 0,item,user,rating,timestamp
0,B00IVPU7AO,A1JPB4BJHNCSWR,5.0,1408492800
1,B00BGUG9DK,A1JPB4BJHNCSWR,5.0,1408492801
2,B0002EOFFK,A1JPB4BJHNCSWR,1.0,1408492802
3,B003XM9774,A1JPB4BJHNCSWR,5.0,1409184003
4,B004G605Q8,A1JPB4BJHNCSWR,4.0,1409184004
5,0972683275,A1JPB4BJHNCSWR,5.0,1409184005
6,B00DR0RBV6,A1JPB4BJHNCSWR,3.0,1421020806
7,B017A62ZUO,A1JPB4BJHNCSWR,5.0,1464825607
8,B00SI67YRU,A1JPB4BJHNCSWR,4.0,1481241608
9,B016JREG84,A1JPB4BJHNCSWR,5.0,1481241609


In [37]:
# Map each user id to the encoded items of its history frame, in frame order.
user_movies = defaultdict(list)
user_movies.update(
    (history['user'][0], [item_dict[raw_item] for raw_item in history['item']])
    for history in historic_users
)

In [47]:
# Scratch check: a single all-zero row with one slot per item.
x = np.zeros(shape=(1, 13283))
x.shape

(1, 13283)

In [45]:
# Largest encoded item index plus one, i.e. the vector width needed.
max(item_dict.values()) + 1

13283

In [83]:
# Value stored under the most recently inserted key, plus one.
[*item_dict.values()][-1] + 1

13282

In [50]:
# Pick one user id and count the entries in its encoded item list.
user_id = 'A1JPB4BJHNCSWR'
user_movies_count = len(user_movies[user_id])
user_movies_count

33

In [53]:
# Choose which position of this user's item list to hold out as the target.
# np.random.randint excludes its upper bound, so the count itself is passed:
# every position, including the last one, can be drawn, and a user with a
# single item no longer triggers "low >= high".
random_index = np.random.randint(0, user_movies_count)
random_index

10

In [55]:
# One-hot label row: only the held-out item's slot is set to 1.
target = np.zeros(shape=(1, 13283))
target[0, user_movies[user_id][random_index]] = 1

In [58]:
# Print the encoded item sitting at the held-out position for this user.
print(user_movies[user_id][random_index])

896


In [61]:
# Confirm the held-out item's slot in the label row is set.
print(target[0, 896])

1.0


In [70]:
# Multi-hot input row, filled in by the next cell.
context = np.zeros(shape=(1, 13283))

In [71]:
# Mark every item of the user except the held-out one.
context[
    0,
    user_movies[user_id][:random_index] + user_movies[user_id][random_index + 1:],
] = 1

In [74]:
# Encoded items that come before the held-out position.
print(user_movies[user_id][:random_index]) 

[8005, 5770, 472, 2536, 2860, 0, 6506, 12009, 10047, 11888]


In [75]:
# Encoded items that come after the held-out position.
print(user_movies[user_id][random_index+1:])

[10396, 756, 7943, 2803, 3462, 13100, 11335, 6145, 7136, 4858, 5641, 2801, 2109, 1841, 705, 5747, 4745, 960, 5938, 8405, 1166, 622]


In [77]:
# Spot-check one context item's slot in the input row.
context[0, 756]

1.0

In [None]:
# Draft of batch construction: collect `batch_size` (input, label) pairs and
# stack the inputs and the labels into two arrays.
# NOTE(review): `self` and `batch_size` are not defined at cell level, and the
# Embeddings class in this notebook defines `create_input`, not
# `generate_input` - this snippet presumably belongs inside a method of that
# class; confirm before running.
# NOTE(review): `np.random.choice(self.train_users) - 1` looks like it expects
# 1-based integer user ids, whereas Embeddings.train_users is a slice of the
# list of per-user frames - TODO confirm what should be sampled here.
batch = [self.generate_input(user_id=np.random.choice(self.train_users) - 1) for _ in range(batch_size)]
X_train = np.array([b[0] for b in batch])
y_train = np.array([b[1] for b in batch])

In [None]:
class Embeddings:
    """Prepare per-user item histories and build a context -> item classifier.

    From a ratings table with ``user``, ``item`` and ``timestamp`` columns the
    constructor derives integer encodings for users and items, a
    timestamp-ordered history frame per user, a split of those frames into
    train / test groups, and a mapping from user id to encoded item list.

    Attributes set by ``__init__``:
        data: the ratings frame passed in (not modified).
        unique_users / unique_items: ids in order of first appearance.
        unique_items_count: width of the one-hot / multi-hot vectors.
        users_dict / items_dict: id -> integer position (``defaultdict(int)``).
        user_history_arr: one sorted history frame per entry of unique_users.
        train_users / test_users: leading / trailing slices of user_history_arr.
        user_ratings_dict: user id -> list of encoded items in history order.
    """

    def __init__(self, data, train_test_ratio):
        """Build all lookup structures.

        Args:
            data: DataFrame with at least ``user``, ``item``, ``timestamp``.
            train_test_ratio: fraction of the per-user history frames placed
                in ``train_users``; the remainder goes to ``test_users``.
        """
        self.data = data
        # list of unique users
        self.unique_users = data['user'].unique()
        # list of unique items
        self.unique_items = data['item'].unique()
        # count of unique items; taken from the same array that feeds
        # items_dict so the vector width always covers every encoded index
        self.unique_items_count = len(self.unique_items)
        # mapping of user_id to integer
        self.users_dict = self.create_item_index_mapping(self.unique_users)
        # mapping of item_id to integer
        self.items_dict = self.create_item_index_mapping(self.unique_items)
        # list of dataframes of users' sorted rating history
        self.user_history_arr = self.create_history(self.data, self.unique_users)
        # number of training users
        total_training_users = int(train_test_ratio * len(self.user_history_arr))
        # train_test_split
        self.train_users = self.user_history_arr[:total_training_users]
        self.test_users = self.user_history_arr[total_training_users:]
        # dictionary of users having ratings sorted by timestamp
        self.user_ratings_dict = self.create_user_rating_list(self.user_history_arr)

    @staticmethod
    def create_item_index_mapping(arr):
        """Return a ``defaultdict(int)`` mapping each value of ``arr`` to its position.

        NOTE: because the result is a ``defaultdict(int)``, indexing it with an
        id that is not in ``arr`` returns 0 (and inserts the key). Use ``in`` or
        ``.get`` when the id may be unknown.
        """
        t_dict = defaultdict(int)
        for i, val in enumerate(arr):
            t_dict[val] = i
        return t_dict

    def create_user_rating_list(self, historic_users):
        """Map each user id to the encoded items of its history frame.

        Args:
            historic_users: iterable of per-user frames as produced by
                ``create_history``.

        Returns:
            ``defaultdict(list)`` keyed by user id; values are lists of
            ``items_dict`` indices in the frame's row order. Empty frames are
            skipped because they carry no user id.
        """
        user_ratings = defaultdict(list)
        for user_df in historic_users:
            if user_df.empty:
                continue
            user = user_df['user'].iloc[0]
            user_ratings[user] = [self.items_dict[x] for x in user_df['item']]
        return user_ratings

    @staticmethod
    def create_history(df, unique_users):
        """Return one timestamp-sorted history frame per user.

        Args:
            df: ratings frame with ``user`` and ``timestamp`` columns.
            unique_users: user ids; the result follows this order. An id with
                no rows in ``df`` yields an empty frame.

        Returns:
            list of new DataFrames (``df`` itself is not modified), each sorted
            by ``timestamp``, re-indexed from 0, with the row position added to
            ``timestamp`` so that equal timestamps become distinct.
        """
        # Group once instead of scanning the whole frame for every user.
        by_user = df.groupby('user', sort=False)
        historic_users = []
        for user in unique_users:
            try:
                rows = by_user.get_group(user)
            except KeyError:
                rows = df.iloc[0:0]
            # A stable sort keeps rows with equal timestamps in table order,
            # which makes the tie-break below deterministic.
            rows = rows.sort_values('timestamp', kind='mergesort').reset_index(drop=True)
            # TO ACCOUNT FOR EQUAL TIMESTAMP
            rows['timestamp'] = rows['timestamp'] + np.arange(len(rows))
            historic_users.append(rows)
        return historic_users

    def create_input(self, user_id):
        """Build one (context, target) training pair for ``user_id``.

        One position of the user's item list is drawn at random; the item
        there becomes the one-hot ``target`` and all remaining items form the
        multi-hot ``context``.

        Returns:
            ``(context, target)``, both float arrays of shape
            ``(1, unique_items_count)``.

        Raises:
            ValueError: if the user has no recorded items.
        """
        # Membership test first: indexing the defaultdict would insert the key.
        if user_id not in self.user_ratings_dict or not self.user_ratings_dict[user_id]:
            raise ValueError('no rated items recorded for user %r' % (user_id,))
        rated = self.user_ratings_dict[user_id]

        # randint excludes its upper bound, so len(rated) lets every position,
        # including the last, be selected.
        random_index = np.random.randint(0, len(rated))

        target = np.zeros((1, self.unique_items_count))
        target[0, rated[random_index]] = 1

        context = np.zeros((1, self.unique_items_count))
        context_items = rated[:random_index] + rated[random_index + 1:]
        # Explicit integer dtype keeps the empty case a valid index array.
        context[0, np.asarray(context_items, dtype=np.intp)] = 1

        return context, target

    def model(self, hidden_layer_size=512):
        """Return a compiled Keras classifier over the item vocabulary.

        Args:
            hidden_layer_size: units of the first dense layer; the second
                dense layer uses half of it.

        Returns:
            A compiled ``Sequential`` model whose input and softmax output
            both have shape ``(1, unique_items_count)`` per sample.
        """
        # Imported here so the Keras names are in scope for this method.
        from tensorflow.keras import Sequential
        from tensorflow.keras.layers import Dense, Dropout

        m = Sequential()
        m.add(Dense(hidden_layer_size, input_shape=(1, self.unique_items_count)))
        m.add(Dropout(0.1))
        m.add(Dense(hidden_layer_size//2))
        m.add(Dropout(0.2))
        m.add(Dense(self.unique_items_count, activation='softmax'))
        # The loss is always reported during training; 'loss' is not a metric
        # name, so only 'accuracy' is requested.
        m.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return m

In [80]:
# Inspect the history frame stored at position 1 of historic_users.
historic_users[1]

Unnamed: 0,item,user,rating,timestamp
0,B00IVPU7AO,A1JPB4BJHNCSWR,5.0,1408492800
1,B00BGUG9DK,A1JPB4BJHNCSWR,5.0,1408492801
2,B0002EOFFK,A1JPB4BJHNCSWR,1.0,1408492802
3,B003XM9774,A1JPB4BJHNCSWR,5.0,1409184003
4,B004G605Q8,A1JPB4BJHNCSWR,4.0,1409184004
5,0972683275,A1JPB4BJHNCSWR,5.0,1409184005
6,B00DR0RBV6,A1JPB4BJHNCSWR,3.0,1421020806
7,B017A62ZUO,A1JPB4BJHNCSWR,5.0,1464825607
8,B00SI67YRU,A1JPB4BJHNCSWR,4.0,1481241608
9,B016JREG84,A1JPB4BJHNCSWR,5.0,1481241609


In [79]:
# Show the full user id -> integer index mapping.
user_dict

defaultdict(int,
            {'A1MNILX1J1NWV7': 0,
             'A1JPB4BJHNCSWR': 1,
             'A2A3B0JX2JRS6S': 2,
             'A14O9P3DV5EHKO': 3,
             'A2BLGQ2SCSKCCY': 4,
             'A21OS6PWQG5YOO': 5,
             'AYKXABHFBFGJX': 6,
             'A39KBWLS6TURMR': 7,
             'A6J8D9V5S9MBE': 8,
             'AUGBCGUG05AH7': 9,
             'A1HQTQQXV5P2UG': 10,
             'A3B75JRYE07CH': 11,
             'A3IQR99TNNWNKF': 12,
             'A2V90RN1G4M11V': 13,
             'A36QF2CX91ONJC': 14,
             'A2RFAKR9FU6VGT': 15,
             'A3ISJB3FVYHL1U': 16,
             'A2KLSFIQUUNTJ8': 17,
             'A3923UGIPX0QQM': 18,
             'ACZP9RHD3W7GO': 19,
             'A2ICKE6A3MJMM5': 20,
             'A6VFSC0BW75Y0': 21,
             'ALEG9QCIUWK81': 22,
             'A1D4O9JOOP7S3C': 23,
             'AV6BQVVC1XE4E': 24,
             'A3MDA891A1EPT8': 25,
             'ADKYANLVMPKSC': 26,
             'A2E80ZZ6Y8ZJT1': 27,
             'A356YCP3