In [277]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [278]:
from google.cloud import firestore

# We should create a dictionary keyed by user, for example dict[userId] = [itemInteractions, postedItems, searchHistory], where
# itemInteractions, postedItems and searchHistory are arrays of strings.

# Get all documents from the USER_PREFERENCES collection
db = firestore.Client(project='swapify-e426d')

collection_name = 'USER_PREFERENCES'
docs = db.collection(collection_name).stream()

# Flatten every user's preference document into unique [user_id, item] pairs.
# Interacted items, posted items and search queries are all treated as "items".
user_item_data = []
for doc in docs:
    user_id = doc.id
    data = doc.to_dict() or {}
    # `or []` guards against a field that is present but stored as null:
    # dict.get only falls back to its default when the key is missing, and
    # concatenating None with a list would raise TypeError.
    items = set(
        (data.get('itemInteractions') or [])
        + (data.get('postedItems') or [])
        + (data.get('searchHistory') or [])
    )
    for item in items:
        user_item_data.append([user_id, item])

In [279]:
# Print the collected [user_id, item] pairs as a quick sanity check
print(user_item_data)

[['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Retro kits'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Tricou Liverpool Torres'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Funko Pops and Figures'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Vand Tricou cu FC Liverpool, Fernando Torres 9, Marimea M, Livrare prin curier sau meet up in Bucuresti'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Buna! Vreau sa fac trade cu alte Funko pentru ce vedeti in imagine. PM '], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'tricouri fotbal\n'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Shoes'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Vand pantofi Nike, Marimea 39, Astept mesaje in privat. Nu trimit pe curier']]


In [280]:
# Convert the [user_id, item] pairs to a dataframe
df = pd.DataFrame(user_item_data, columns=['user_id', 'item'])

user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Replace the raw user ids and item strings with integer codes
df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['item'] = item_encoder.fit_transform(df['item'])

# Frequency table: one row per user, one column per item, each cell holding the
# number of (user, item) rows (0 where the pair never occurs). crosstab counts
# rows directly, so it does not rely on a leftover value column being available
# for pivot_table to aggregate.
interaction_matrix = pd.crosstab(df['user_id'], df['item'])
print(interaction_matrix)

item     0  1  2  3  4  5  6  7
user_id                        
0        1  1  1  1  1  1  1  1


In [281]:
from scipy.sparse import csr_matrix

# Store the user-item table in compressed sparse row form and show its
# non-zero entries as (row, column) value triples
dense_counts = interaction_matrix.values
sparse_interaction_matrix = csr_matrix(dense_counts)
print(sparse_interaction_matrix)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1


In [282]:
from collections import Counter

collection_name = 'USER_PREFERENCES'
docs = db.collection(collection_name).stream()

# Count how often each item appears for each user. Unlike the de-duplicated
# pairs in user_item_data, repeats are kept here so the count can serve as an
# implicit rating.
user_item_counter = Counter()

for doc in docs:
    user_id = doc.id
    data = doc.to_dict() or {}
    # `or []` covers fields that exist but hold null; dict.get's default only
    # applies when the key is absent.
    items = (
        (data.get('itemInteractions') or [])
        + (data.get('postedItems') or [])
        + (data.get('searchHistory') or [])
    )
    user_item_counter.update((user_id, item) for item in items)

user_item_data_with_count = [[user_id, item, count] for (user_id, item), count in user_item_counter.items()]
df_with_count = pd.DataFrame(user_item_data_with_count, columns=['user_id', 'item', 'count'])

# Use dedicated encoders for this frame. Re-fitting the shared user_encoder /
# item_encoder here would overwrite the mapping that get_item_string relies on
# to decode the ids stored in `df`, which breaks if the collection changed
# between the two Firestore reads.
count_user_encoder = LabelEncoder()
count_item_encoder = LabelEncoder()
df_with_count['user_id'] = count_user_encoder.fit_transform(df_with_count['user_id'])
df_with_count['item'] = count_item_encoder.fit_transform(df_with_count['item'])

# Each (user_id, item) pair occurs exactly once in df_with_count (Counter keys
# are unique), so pivot_table's default aggregation just passes the count through.
interaction_matrix_with_count = df_with_count.pivot_table(index='user_id', columns='item', values='count', fill_value=0)
print(interaction_matrix_with_count)

sparse_interaction_matrix_with_count = csr_matrix(interaction_matrix_with_count.values)

item       0    1    2    3    4    5    6    7
user_id                                        
0        1.0  2.0  4.0  1.0  1.0  3.0  1.0  1.0


In [283]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split


In [284]:
from scipy.sparse import coo_matrix

# Fixed seed so the train/test split and the SVD initialisation (and therefore
# the printed predictions and RMSE) are reproducible across runs
RANDOM_STATE = 42

# Convert the dataset into the (user, item, rating) long format that Surprise uses
coo = coo_matrix(sparse_interaction_matrix_with_count)
df_for_surprise = pd.DataFrame({'user_id': coo.row, 'item': coo.col, 'rating': coo.data})
df_for_surprise = df_for_surprise[df_for_surprise['rating'] > 0]

# Load the dataset into Surprise; the ratings are interaction counts, so the
# scale runs from 1 up to the largest observed count
reader = Reader(rating_scale=(1, df_for_surprise['rating'].max()))
data = Dataset.load_from_df(df_for_surprise[['user_id', 'item', 'rating']], reader)

# Split the dataset into train and test
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)

# Train the model
algo = SVD(random_state=RANDOM_STATE)
algo.fit(trainset)

# Predict the rating of every held-out (user, item) pair in the test set
predictions = algo.test(testset)
print(predictions)

[Prediction(uid=0, iid=2, r_ui=4.0, est=1.5076692496020259, details={'was_impossible': False}), Prediction(uid=0, iid=6, r_ui=1.0, est=1.5076692496020259, details={'was_impossible': False})]


In [285]:
from surprise import accuracy

# Root-mean-squared error of the test-set predictions; prints the score and
# returns it as the cell's value
accuracy.rmse(predictions)

RMSE: 1.7985


1.798532823772034

In [286]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [287]:
# Distinct item texts across all users form the corpus for the TF-IDF vocabulary
all_item_interactions = list({pair[1] for pair in user_item_data})

tfidf = TfidfVectorizer()
# fit() hands back the fitted vectorizer, kept under a second name
tfidf_model = tfidf.fit(all_item_interactions)

In [288]:
def get_item_string(item_id):
    """Return the original item value for an integer code produced by item_encoder.

    item_id: a single encoded label, such as a value from the encoded
    'item' column of `df`.
    """
    # inverse_transform works on sequences, so wrap the code and unpack the
    # single decoded entry
    (item_text,) = item_encoder.inverse_transform([item_id])
    return item_text


In [289]:
# Build one profile vector per user: the mean TF-IDF vector of that user's items
user_profiles = {}

for user_id in set(df['user_id']):
    encoded_items = df.loc[df['user_id'] == user_id, 'item']
    # Decode the integer codes back to text before vectorising
    item_texts = list(map(get_item_string, encoded_items))
    item_vectors = tfidf_model.transform(item_texts)
    # Column-wise mean over the user's items, flattened to a 1-D array
    user_profiles[user_id] = np.mean(item_vectors, axis=0).A1

print(user_profiles)

{0: array([0.03665369, 0.0347655 , 0.06496356, 0.03665369, 0.03409701,
       0.0347655 , 0.0347655 , 0.05771216, 0.05929461, 0.0347655 ,
       0.03409701, 0.03409701, 0.06496356, 0.08838835, 0.08358073,
       0.0347655 , 0.07630853, 0.08838835, 0.10074474, 0.03409701,
       0.05929461, 0.03409701, 0.03665369, 0.03665369, 0.03665369,
       0.03665369, 0.03665369, 0.0347655 , 0.0347655 , 0.06496356,
       0.03409701, 0.03665369, 0.08838835, 0.0347655 , 0.03409701,
       0.125     , 0.10074474, 0.0347655 , 0.10074474, 0.08838835,
       0.03665369, 0.03409701, 0.05929461, 0.0347655 , 0.0347655 ])}


In [290]:
# Example listing to score against the user profiles
new_item_name = "Tricou Fotbal Ronaldo"
new_item_category = "Retro kits"
new_item_description = "Tricou Fotbal Ronaldo Real Madrid sezonul 2018-2019, marimea L, culoare albastru, trimit si in tara"

# Name, category and description are merged into one space-separated text and
# projected into the fitted TF-IDF space
new_item_text = " ".join([new_item_name, new_item_category, new_item_description])
new_item_vector = tfidf.transform([new_item_text])


In [291]:
from sklearn.metrics.pairwise import cosine_similarity

In [292]:
# Cosine similarity between each user's profile vector and the new item's
# TF-IDF vector, keyed by user id
user_interest_predictions = {
    user_id: cosine_similarity([profile], new_item_vector)[0][0]
    for user_id, profile in user_profiles.items()
}

print(user_interest_predictions)
    

{0: 0.5137456526032076}
