SVD approach

In [787]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [788]:
from google.cloud import firestore

# Read every document of the USER_PREFERENCES collection and flatten it into
# [user_id, preference] pairs. A document's three string lists (itemInteractions,
# postedItems, searchHistory) are merged and de-duplicated, so each preference
# appears at most once per user here.
db = firestore.Client(project='swapify-e426d')

collection_name = 'USER_PREFERENCES'
docs = db.collection(collection_name).stream()

user_and_preference_data = []
for doc in docs:
    fields = doc.to_dict()
    # A missing field counts as an empty list
    merged = (
        fields.get('itemInteractions', [])
        + fields.get('postedItems', [])
        + fields.get('searchHistory', [])
    )
    # The document id is the user id
    user_and_preference_data.extend([doc.id, preference] for preference in set(merged))

In [789]:
# Inspect the collected [user_id, preference] pairs (prints the entire list)
print(user_and_preference_data)

[['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Shoes'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Vand Tricou cu FC Liverpool, Fernando Torres 9, Marimea M, Livrare prin curier sau meet up in Bucuresti'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'tricouri fotbal\n'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Vand pantofi Nike, Marimea 39, Astept mesaje in privat. Nu trimit pe curier'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Funko Pops and Figures'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Tricou Liverpool Torres'], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Buna! Vreau sa fac trade cu alte Funko pentru ce vedeti in imagine. PM '], ['gxRY52w1CMhuzG8fwkPNuiO1FER2', 'Retro kits']]


In [790]:
# One encoder per column; both are fitted here and reused by later cells
user_encoder = LabelEncoder()
preference_encoder = LabelEncoder()

# Load the [user_id, preference] pairs and replace both string columns with integer codes
df = pd.DataFrame(user_and_preference_data, columns=['user_id', 'preference'])
df = df.assign(
    user_id=user_encoder.fit_transform(df['user_id']),
    preference=preference_encoder.fit_transform(df['preference']),
)

# Pivot into a user x preference table holding the number of pairs per cell (0 where there is none)
interaction_matrix = df.pivot_table(index='user_id', columns='preference', aggfunc=len, fill_value=0)
print(interaction_matrix)

preference  0  1  2  3  4  5  6  7
user_id                           
0           1  1  1  1  1  1  1  1


In [791]:
from scipy.sparse import csr_matrix

# Sparse (CSR) copy of the interaction table; printing it lists every stored
# (row, column) position together with its interaction count
sparse_interaction_matrix = csr_matrix(interaction_matrix.to_numpy())
print(sparse_interaction_matrix)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1


In [792]:
from collections import Counter

# Second pass over the collection, this time keeping duplicates: the number of times a
# preference shows up for a user is used as a rating-like score.
user_and_preferences_counter = Counter()

docs = db.collection(collection_name).stream()
for doc in docs:
    fields = doc.to_dict()
    # A missing field counts as an empty list
    occurrences = (
        fields.get('itemInteractions', [])
        + fields.get('postedItems', [])
        + fields.get('searchHistory', [])
    )
    user_and_preferences_counter.update((doc.id, preference) for preference in occurrences)

# One [user_id, preference, count] row per distinct pair
user_and_preference_data_with_count = [
    [*pair, count] for pair, count in user_and_preferences_counter.items()
]
df_with_count = pd.DataFrame(
    user_and_preference_data_with_count,
    columns=['user_id', 'preference', 'count'],
)

# Note: this re-fits the shared encoders on the data read in this cell
df_with_count = df_with_count.assign(
    user_id=user_encoder.fit_transform(df_with_count['user_id']),
    preference=preference_encoder.fit_transform(df_with_count['preference']),
)

# pivot_table aggregates with the mean by default; every (user, preference) pair occurs
# exactly once in df_with_count, so each cell is simply that pair's count (as a float)
interaction_matrix_with_count = df_with_count.pivot_table(
    index='user_id',
    columns='preference',
    values='count',
    fill_value=0,
)
sparse_interaction_matrix_with_count = csr_matrix(interaction_matrix_with_count.to_numpy())

print(interaction_matrix_with_count)
print(sparse_interaction_matrix_with_count)

preference    0    1    2    3    4    5    6    7
user_id                                           
0           1.0  2.0  6.0  1.0  3.0  5.0  1.0  1.0
  (0, 0)	1.0
  (0, 1)	2.0
  (0, 2)	6.0
  (0, 3)	1.0
  (0, 4)	3.0
  (0, 5)	5.0
  (0, 6)	1.0
  (0, 7)	1.0


In [793]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split


In [794]:
from scipy.sparse import coo_matrix

# Flatten the sparse count matrix into (user_id, preference, rating) rows,
# which is the column layout loaded into Surprise below
coo = coo_matrix(sparse_interaction_matrix_with_count)
df_for_surprise = pd.DataFrame({'user_id': coo.row, 'preference': coo.col, 'rating': coo.data})
df_for_surprise = df_for_surprise[df_for_surprise['rating'] > 0]

# Ratings are interaction counts, so the scale runs from 1 to the largest observed count
reader = Reader(rating_scale=(1, df_for_surprise['rating'].max()))
data = Dataset.load_from_df(df_for_surprise[['user_id', 'preference', 'rating']], reader)

# Fixed seed: both the shuffle behind the split and SVD's factor initialisation are random,
# so without it the predictions and the RMSE change on every run
RANDOM_STATE = 42

# Hold out 20% of the ratings for evaluation
trainset, testset = train_test_split(data, test_size=0.2, random_state=RANDOM_STATE)

# Train the model
algo = SVD(random_state=RANDOM_STATE)
algo.fit(trainset)

# Predict a rating for every held-out (user, preference) pair
predictions = algo.test(testset)
print(predictions)

[Prediction(uid=0, iid=0, r_ui=1.0, est=2.3414925540526372, details={'was_impossible': False}), Prediction(uid=0, iid=5, r_ui=5.0, est=2.3414925540526372, details={'was_impossible': False})]


In [795]:
from surprise import accuracy

# RMSE of the predictions made on the held-out test set
accuracy.rmse(predictions)

RMSE: 2.1056


2.105619162234263

TF-IDF and cosine similarity approach

In [796]:
# Preprocess the data for the model

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, MWETokenizer
import re
from gensim.models.phrases import Phrases, Phraser

# Fetch the NLTK resources used by the stop-word list, the tokenizer and the lemmatizer
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)

lemmatizer = WordNetLemmatizer()

# Marketplace boilerplate (shipping, contact, greeting and selling words) that should be
# ignored in addition to the regular Romanian stop words
custom_stop_words = {
    'trimit', 'curier', 'livrare', 'privat', 'mesaj', 'contact', 'detalii',
    'telefon', 'meet', 'up', 'pm', 'predare', 'personala', 'f2f', 'mesaje', 'tara', 'bucuresti',
    'sigilat', 'marimea', 'astept', 'buna', 'vreau', 'sa', 'vand', 'schimb', 'schimburi', 'schimbare',
    'fac', 'vedeti',
}
stop_words = set(stopwords.words('romanian')) | custom_stop_words

print(stop_words)

# Train the phrase (collocation) detector on the lower-cased, tokenized preference texts
tokenized_preferences = [word_tokenize(text.lower()) for text in preference_encoder.classes_]
phrases = Phrases(tokenized_preferences, min_count=5, threshold=0.5, scoring='npmi')
phraser = Phraser(phrases)
    
def preprocess_text(text):
    """Normalise a free-text preference into a space-separated string of lemmas.

    Steps: strip URLs, tokenize, lower-case, merge known phrases, drop stop words and
    every token that is not purely alphabetic (numbers and punctuation are removed),
    then lemmatize what is left.

    Args:
        text: Raw preference text (search query, item title/description, category).

    Returns:
        The cleaned tokens joined by single spaces; an empty string if nothing survives.
    """
    text = re.sub(r'http\S+|www\S+', '', text)

    # The phrase model was trained on lower-cased tokens, so it must be applied to
    # lower-cased tokens as well; otherwise capitalised words never match a phrase.
    tokens = [token.lower() for token in word_tokenize(text)]
    tokens = phraser[tokens]

    processed_tokens = []
    for token in tokens:
        # A merged phrase arrives as its words joined by '_' (gensim's default delimiter).
        # Validate it word by word; a plain isalpha() on the whole token would reject the
        # underscore and silently discard every detected phrase.
        parts = token.split('_')
        if not all(part.isalpha() for part in parts):
            continue
        kept = [lemmatizer.lemmatize(part) for part in parts if part not in stop_words]
        if kept:
            processed_tokens.append('_'.join(kept))
    return ' '.join(processed_tokens)

# Clean every distinct preference text known to the encoder, in encoder-class order
preprocessed_preferences = list(map(preprocess_text, preference_encoder.classes_))
print(preprocessed_preferences)

{'pina', 'treia', 'treilea', 'deja', 'cineva', 'acea', 'oricine', 'unul', 'căci', 'vi', 'aceea', 'altfel', 'vreo', 'insa', 'sint', 'atunci', 'schimb', 'pe', 'vedeti', 'pot', 'mult', 'personala', 'mea', 'i', 'mesaj', 'deasupra', 'acei', 'avea', 'am', 'atita', 'alti', 'face', 'multe', 'vom', 'predare', 'îţi', 'aş', 'până', 'această', 'sai', 'u', 'ti', 'buna', 'aceştia', 'lîngă', 'lor', 'acesta', 'as', 'schimburi', 'abia', 'doi', 'spre', 'cu', 'un', 'ţi', 'in', 'mod', 'asta', 'ţie', 'oricînd', 'acele', 'multa', 'ei', 'nouă', 'acelea', 'toata', 'foarte', 'citiva', 'trimit', 'totusi', 'dă', 't', 'sale', 'ceilalti', 'nou', 'vor', 'acestia', 'desi', 'puţina', 'atitia', 'ălea', 'mulţi', 'ta', 'dintr-', 'altceva', 'mi', 'una', 'unii', 'ci', 'suntem', 'ce', 'cărei', 'de', 'o', 'doar', 'îl', 'tăi', 'tău', 'iar', 'daca', 'caruia', 'voi', 'atare', 'voştri', 'ia', 'ii', 'celor', 'meet', 'unde', 'în', 'up', 'le', 'fara', 'se', 'noştri', 'cita', 'fac', 'ori', 'zice', 'da', 'eşti', 'prea', 'este', 'alt

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stoic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stoic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stoic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [797]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [798]:
# Every distinct preference text in the dataset (queries, items, interactions)
all_preference_interactions = list({preference for _, preference in user_and_preference_data})
print(all_preference_interactions)

# Clean each text the same way new items are cleaned later on
preprocessed_all_preference_interactions = list(map(preprocess_text, all_preference_interactions))
print(preprocessed_all_preference_interactions)

# Fit the TF-IDF vocabulary and weights on the cleaned texts; fit() hands back the
# vectorizer itself, so `tfidf` and `tfidf_model` name the same fitted object
tfidf = TfidfVectorizer()
tfidf_model = tfidf.fit(preprocessed_all_preference_interactions)

['Shoes', 'Vand Tricou cu FC Liverpool, Fernando Torres 9, Marimea M, Livrare prin curier sau meet up in Bucuresti', 'tricouri fotbal\n', 'Vand pantofi Nike, Marimea 39, Astept mesaje in privat. Nu trimit pe curier', 'Funko Pops and Figures', 'Tricou Liverpool Torres', 'Buna! Vreau sa fac trade cu alte Funko pentru ce vedeti in imagine. PM ', 'Retro kits']
['shoe', 'tricou fc liverpool fernando torres', 'tricouri fotbal', 'pantofi nike', 'funko pop and figure', 'tricou liverpool torres', 'trade funko imagine', 'retro kit']


In [799]:
def get_preference_string(preference_id):
    """Return the original preference text behind an encoded preference id."""
    decoded = preference_encoder.inverse_transform([preference_id])
    return decoded[0]

In [800]:
# Build one TF-IDF profile per user: the mean of the TF-IDF vectors of the user's
# preferences, weighted by how often each preference occurred.
user_profiles = {}

# Size of the fitted vocabulary; the same for every user, so computed once
vocabulary_size = len(tfidf_model.vocabulary_)

# Take the user ids from df_with_count itself: the rows are filtered from that frame, and
# its encoded ids come from a separate read and encoder fit than those stored in `df`.
for user_id in sorted(df_with_count['user_id'].unique().tolist()):
    # Preferences and counts for this user
    user_preferences = df_with_count[df_with_count['user_id'] == user_id]
    user_preferences_tfidf = np.zeros(vocabulary_size)

    # Add each preference's TF-IDF vector, weighted by its count. Iterating the two columns
    # directly keeps the preference id an integer; iterrows() converts a whole row to one
    # dtype, which would turn the id into a float as soon as `count` holds floats.
    for preference_id, count in zip(user_preferences['preference'], user_preferences['count']):
        preference = get_preference_string(preference_id)
        tfidf_vector = tfidf_model.transform([preprocess_text(preference)]).toarray()
        user_preferences_tfidf += tfidf_vector[0] * count

    # Normalize by the total number of interactions (skipped when there are none)
    total_count = user_preferences['count'].sum()
    if total_count > 0:
        user_preferences_tfidf /= total_count

    user_profiles[user_id] = user_preferences_tfidf

print(user_profiles)

{0: array([0.05197085, 0.12335907, 0.12335907, 0.05197085, 0.03535534,
       0.06904632, 0.03041566, 0.21213203, 0.18998707, 0.03535534,
       0.03535534, 0.05197085, 0.21213203, 0.05      , 0.18998707,
       0.03041566, 0.18998707, 0.03535534])}


In [801]:
# Candidate listing 1: name, category and description of a new item
new_item_name = "Tricou Fotbal Ronaldo"
new_item_category = "Retro kits"
new_item_description = "Tricou Fotbal Ronaldo Real Madrid sezonul 2018-2019, marimea L, culoare albastru, trimit si in tara"

# Merge the three fields into one text and clean it like the texts the model was fitted on
new_item_text = " ".join([new_item_name, new_item_category, new_item_description])
preprocessed_new_item_text = preprocess_text(new_item_text)
print(preprocessed_new_item_text)

# Project the cleaned text into the fitted TF-IDF space
new_item_vector = tfidf.transform([preprocessed_new_item_text])

tricou fotbal ronaldo retro kit tricou fotbal ronaldo real madrid sezonul l culoare albastru


In [802]:
# Candidate listing 2: name, category and description of a second new item
new_item_name2 = "Funko Pop! Star Wars: The Mandalorian - The Child"
new_item_category2 = "Funko Pops and Figures"
new_item_description2 = "Funko Pop! Star Wars: The Mandalorian - The Child, nou, sigilat, trimit si in tara"

# Merge the three fields into one text and clean it like the texts the model was fitted on
new_item_text2 = " ".join([new_item_name2, new_item_category2, new_item_description2])
preprocessed_new_item_text2 = preprocess_text(new_item_text2)
print(preprocessed_new_item_text2)

# Project the cleaned text into the fitted TF-IDF space
new_item_vector2 = tfidf.transform([preprocessed_new_item_text2])

funko pop star war the mandalorian the child funko pop and figure funko pop star war the mandalorian the child


In [803]:
from sklearn.metrics.pairwise import cosine_similarity

In [804]:
# Cosine similarity between each user's profile and the first new item's TF-IDF vector
user_interest_predictions = {
    user_id: cosine_similarity([profile], new_item_vector)[0][0]
    for user_id, profile in user_profiles.items()
}

print(user_interest_predictions)

{0: 0.547085414066346}


In [805]:
# Cosine similarity between each user's profile and the second new item's TF-IDF vector
user_interest_predictions2 = {
    user_id: cosine_similarity([profile], new_item_vector2)[0][0]
    for user_id, profile in user_profiles.items()
}

print(user_interest_predictions2)

{0: 0.20790510890286867}
