SVD approach

In [18]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [19]:
from google.cloud import firestore

# We should create a dictionary, and to have for exampel dict[userId] = [itemInteractions, postedItems, searchHistory], where
# itemInteractions, postedItems, searchHistory are arrays of strings

# Get all documents from USER_PREFERENCES collection
db = firestore.Client(project='swapify-e426d')

collection_name = 'USER_PREFERENCES'
docs = db.collection(collection_name).stream()

# Process Firestore data
user_and_preference_data = []
for doc in docs:
    user_id = doc.id
    data = doc.to_dict()
    preferences = set(data.get('itemInteractions', []) + data.get('postedItems', []) + data.get('searchHistory', []))
    for preference in preferences:
        user_and_preference_data.append([user_id, preference])

In [20]:
# Print data
print(user_and_preference_data)

[['Pece4uaUoFeRxlKegA19vRpaUhO2', 'shoes'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Funko Pops Stranger Things'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Tricou Alb Simplu'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Funko Pops and Figures'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Tricou Borussia Dortmund'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Bucharest'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Others'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Retro kits'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Pantofi Nike Air Max'], ['Pece4uaUoFeRxlKegA19vRpaUhO2', 'Shoes'], ['RtukkaH4z0VxgMOd8ePBpR425ro2', 'se vinde, 80 de lei, doar f2f in Galati'], ['RtukkaH4z0VxgMOd8ePBpR425ro2', 'Vreau sa ofer acest tricou cuiva care are nevoie de el. Am multe astfel de produse, astept mesaje pentru doritori. Banii stransi vor fi donati unei cauze, multumesc frumos. Astept mesaj in privat pentru detalii!'], ['RtukkaH4z0VxgMOd8ePBpR425ro2', 'Vand tricou retro cu borussia dortmund, original, cu eticheta, marimea M. Trimit in tara sau f2f in Brasov.'], ['Rt

In [21]:
# Convert to dataframe
df = pd.DataFrame(user_and_preference_data, columns=['user_id', 'preference'])

user_encoder = LabelEncoder()
preference_encoder = LabelEncoder()

df['user_id'] = user_encoder.fit_transform(df['user_id'])
df['preference'] = preference_encoder.fit_transform(df['preference'])

# Make interaction matrix from dataframe (user_id, preference)
interaction_matrix = df.pivot_table(index='user_id', columns='preference', aggfunc=len, fill_value=0)
print(interaction_matrix)

preference  0   1   2   3   4   5   6   7   8   9   10  11  12  13  14  15  16
user_id                                                                       
0            0   1   1   1   1   1   1   1   1   1   0   0   0   0   0   1   0
1            0   0   0   0   0   0   0   0   0   0   1   1   1   1   1   0   0
2            1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1


In [22]:
from scipy.sparse import csr_matrix

# Make sparse matrix from interaction matrix (pairs of user_id, preference and their number of interactions)
sparse_interaction_matrix = csr_matrix(interaction_matrix.values)
print(sparse_interaction_matrix)

  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1
  (0, 9)	1
  (0, 15)	1
  (1, 10)	1
  (1, 11)	1
  (1, 12)	1
  (1, 13)	1
  (1, 14)	1
  (2, 0)	1
  (2, 16)	1


In [23]:
from collections import Counter

# Get the interactions and their count, so we can make like a rating system based on the number of interactions
docs = db.collection(collection_name).stream()

user_and_preferences_counter = Counter()

for doc in docs:
    user_id = doc.id
    data = doc.to_dict()
    preferences = data.get('itemInteractions', []) + data.get('postedItems', []) + data.get('searchHistory', [])
    for preference in preferences:
        user_and_preferences_counter[(user_id, preference)] += 1

user_and_preference_data_with_count = [[user_id, item, count] for (user_id, item), count in user_and_preferences_counter.items()]
df_with_count = pd.DataFrame(user_and_preference_data_with_count, columns=['user_id', 'preference', 'count'])

df_with_count['user_id'] = user_encoder.fit_transform(df_with_count['user_id'])
df_with_count['preference'] = preference_encoder.fit_transform(df_with_count['preference'])

interaction_matrix_with_count = df_with_count.pivot_table(index='user_id', columns='preference', values='count', fill_value=0)
print(interaction_matrix_with_count)

sparse_interaction_matrix_with_count = csr_matrix(interaction_matrix_with_count.values)
print(sparse_interaction_matrix_with_count)

preference   0    1    2    3    4    5    6    7    8    9    10   11   12  \
user_id                                                                       
0           0.0  1.0  4.0  4.0  8.0  3.0  2.0  3.0  8.0  2.0  0.0  0.0  0.0   
1           0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  1.0  1.0   
2           1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0   

preference   13   14   15   16  
user_id                         
0           0.0  0.0  1.0  0.0  
1           1.0  1.0  0.0  0.0  
2           0.0  0.0  0.0  3.0  
  (0, 1)	1.0
  (0, 2)	4.0
  (0, 3)	4.0
  (0, 4)	8.0
  (0, 5)	3.0
  (0, 6)	2.0
  (0, 7)	3.0
  (0, 8)	8.0
  (0, 9)	2.0
  (0, 15)	1.0
  (1, 10)	1.0
  (1, 11)	1.0
  (1, 12)	1.0
  (1, 13)	1.0
  (1, 14)	1.0
  (2, 0)	1.0
  (2, 16)	3.0


In [24]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split


In [25]:
from scipy.sparse import coo_matrix

# Convert the dataset into the specific format that Surprise uses
coo = coo_matrix(sparse_interaction_matrix_with_count)
df_for_surprise = pd.DataFrame({'user_id': coo.row, 'preference': coo.col, 'rating': coo.data})
df_for_surprise = df_for_surprise[df_for_surprise['rating'] > 0]

# Load the dataset into Surprise
reader = Reader(rating_scale=(1, df_for_surprise['rating'].max()))
data = Dataset.load_from_df(df_for_surprise[['user_id', 'preference', 'rating']], reader)

# Split the dataset into train and test
trainset, testset = train_test_split(data, test_size=0.2)

# Train the model
algo = SVD()
algo.fit(trainset)

# Predict the rating for a specific user and item
predictions = algo.test(testset)
print(predictions)

[Prediction(uid=0, iid=5, r_ui=3.0, est=2.5585533848167357, details={'was_impossible': False}), Prediction(uid=1, iid=10, r_ui=1.0, est=1.882102428870836, details={'was_impossible': False}), Prediction(uid=0, iid=8, r_ui=8.0, est=2.5585533848167357, details={'was_impossible': False}), Prediction(uid=0, iid=2, r_ui=4.0, est=2.5585533848167357, details={'was_impossible': False})]


In [26]:
from surprise import accuracy

accuracy.rmse(predictions)

RMSE: 2.8575


2.857450324125919

TF-IDF and cosine similarity approach

In [29]:
# Preprocess the data for the model

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, MWETokenizer
import re
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('romanian'))
custom_stop_words = {'trimit', 'curier', 'livrare', 'privat', 'mesaj', 'contact', 'detalii', 
                     'telefon', 'meet', 'up', 'pm', 'predare', 'personala', 'f2f', 'mesaje', 'tara', 'bucuresti',
                     'sigilat', 'marimea', 'astept', 'buna', 'vreau', 'sa', 'vand', 'schimb', 'schimburi', 'schimbare',
                     'fac', 'vedeti'}
stop_words = stop_words.union(custom_stop_words)

print(stop_words)

tokenized_preferences = [word_tokenize(preference.lower()) for preference in preference_encoder.classes_]

all_tokens = [token for sublist in tokenized_preferences for token in sublist]

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(all_tokens)
finder.apply_freq_filter(5)
scored = finder.score_ngrams(bigram_measures.likelihood_ratio)

phrases = set([' '.join(bigram) for bigram, score in scored if score >= 0.5])

def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+', '', text)
    tokens = word_tokenize(text.lower())
    tokens = [' '.join(tokens[i:i+2]) if ' '.join(tokens[i:i+2]) in phrases else tokens[i] for i in range(len(tokens)-1)]
    processed_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word.isalpha()]
    return ' '.join(processed_tokens)

preprocessed_preferences = [preprocess_text(preference) for preference in preference_encoder.classes_]
print(preprocessed_preferences)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stoic\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stoic\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stoic\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


{'mulţi', 'nostri', 'atatia', 'curier', 'ţi', 'acela', 'deasupra', 'cine', 'ăla', 'oricând', 'către', 'sai', 'mie', 'îţi', 'alti', 'la', 'cea', 'tău', 'ceva', 'pe', 'prin', 'aceeasi', 'toata', 'în', 'pînă', 'lui', 'uneori', 'il', 'meet', 'dintre', 'care', 'oricât', 'astept', 'up', 'ele', 'alea', 'avut', 'cât', 'toti', 'acestea', 'ia', 'ul', 'sa', 'atat', 'mă', 'fac', 'v', 'pot', 'apoi', 'anume', 'ca', 'contact', 'caruia', 'pentru', 'fim', 'al', 'in', 'acest', 'până', 'altceva', 'suntem', 'vor', 'fiţi', 'patru', 'cite', 'vreun', 'ea', 'unde', 'fara', 'tine', 'eu', 'aceste', 'aveţi', 'deşi', 'fără', 'cita', 'mîine', 'cei', 'ălea', 'dintr-', 'or', 'inainte', 'mâine', 'aceia', 'multa', 'pai', 'livrare', 'catre', 'este', 'iti', 'detalii', 'de', 'te', 'doar', 'insa', 'voi', 'cînd', 'mesaje', 'fel', 'cit', 'mele', 'meu', 'pic', 'deja', 'sau', 'nou', 'si', 'voastră', 'acestei', 'deci', 'ni', 'nimeni', 'citi', 'ii', 'noastră', 'vand', 'asa', 'le', 'au', 'dă', 'tăi', 'mereu', 'imi', 'alt', 'se',

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# We take all the queries/items/interactions from the dataset
all_preference_interactions = list(set([preference for _, preference in user_and_preference_data]))
print(all_preference_interactions)
preprocessed_all_preference_interactions = [preprocess_text(preference) for preference in all_preference_interactions]
print(preprocessed_all_preference_interactions)

# We create a TF-IDF model
tfidf = TfidfVectorizer()
tfidf_model = tfidf.fit(preprocessed_all_preference_interactions)

['shoes', 'Funko Pops Stranger Things', 'test test', 'Tricou Alb Simplu', 'Funko Pops and Figures', 'Tricou Borussia Dortmund', 'Bucharest', 'Others', 'se vinde, 80 de lei, doar f2f in Galati', 'Retro kits', 'Vreau sa ofer acest tricou cuiva care are nevoie de el. Am multe astfel de produse, astept mesaje pentru doritori. Banii stransi vor fi donati unei cauze, multumesc frumos. Astept mesaj in privat pentru detalii!', 'Accept trade pentru figurinele din imagine', 'Vand tricou retro cu borussia dortmund, original, cu eticheta, marimea M. Trimit in tara sau f2f in Brasov.', 'Pantofi Nike Air Max', 'Vreau sa ofer acest tricou pentru suma modica de 20 RON. Am foarte multe produse in stilul acesta, ieftine dar nepurtate. Vreau sa donez toti banii unei cauze. Astept mesaj pentru detalii!!!', 'Vand figurine Funko Pop, colectia Stranger Things. Fiecare figurina are pretul fix de 100 RON. Trimit in tara/F2F Galati', 'Shoes']
['', 'funko pop stranger', 'test', 'tricou alb', 'funko pop and', 'tr

In [32]:
def get_preference_string(preference_id):
    return preference_encoder.inverse_transform([preference_id])[0]

In [33]:
user_profiles = {}

for user_id in set(df['user_id']):
    # Get preferences and counts for the user
    user_preferences = df_with_count[df_with_count['user_id'] == user_id]
    user_preferences_tfidf = np.zeros((tfidf_model.transform(['']).shape[1],))
    
    # Loop through each preference and add its weighted TF-IDF vector
    for _, row in user_preferences.iterrows():
        preference = get_preference_string(row['preference'])
        count = row['count']
        tfidf_vector = tfidf_model.transform([preprocess_text(preference)]).toarray()
        user_preferences_tfidf += tfidf_vector[0] * count
    
    # Normalize the TF-IDF vector by the total count of interactions
    total_count = user_preferences['count'].sum()
    if total_count > 0:
        user_preferences_tfidf /= total_count
    
    user_profiles[user_id] = user_preferences_tfidf

print(user_profiles)

{0: array([0.        , 0.04811252, 0.18577687, 0.07445869, 0.        ,
       0.04440782, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.12001643, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.04811252, 0.        , 0.        ,
       0.04811252, 0.12001643, 0.        , 0.        , 0.05555556,
       0.        , 0.        , 0.06878857, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.15532337, 0.        ]), 1: array([0.        , 0.        , 0.        , 0.        , 0.10850944,
       0.07155947, 0.08195251, 0.10850944, 0.06760574, 0.06177835,
       0.06177835, 0.06249061, 0.06177835, 0.08195251, 0.08195251,
       0.06760574, 0.06760574, 0.        , 0.06760574, 0.06177835,
       0.05294905, 0.06249061, 0.14142136, 0.06249061, 0.06177835,
       0.06249061, 0.06177835, 0.        , 0.10850944

In [34]:
new_item_name = "Tricou Fotbal Ronaldo"
new_item_category = "Retro kits"
new_item_description = "Tricou Fotbal Ronaldo Real Madrid sezonul 2018-2019, marimea L, culoare albastru, trimit si in tara"

new_item_text = new_item_name + " " + new_item_category + " " + new_item_description
preprocessed_new_item_text = preprocess_text(new_item_text)

print(preprocessed_new_item_text)
new_item_vector = tfidf.transform([preprocessed_new_item_text])

tricou fotbal ronaldo retro kit tricou fotbal ronaldo real madrid sezonul l culoare albastru


In [35]:
new_item_name2 = "Funko Pop! Star Wars: The Mandalorian - The Child"
new_item_category2 = "Funko Pops and Figures"
new_item_description2 = "Funko Pop! Star Wars: The Mandalorian - The Child, nou, sigilat, trimit si in tara"

new_item_text2 = new_item_name2 + " " + new_item_category2 + " " + new_item_description2
preprocessed_new_item_text2 = preprocess_text(new_item_text2)

print(preprocessed_new_item_text2)
new_item_vector2 = tfidf.transform([preprocessed_new_item_text2])

funko pop star war the mandalorian the child funko pop and figure funko pop star war the mandalorian the child


In [36]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
user_interest_predictions = {}

for user_id, profile in user_profiles.items():
    similarity = cosine_similarity([profile], new_item_vector)
    user_interest_predictions[user_id] = similarity[0][0]

print(user_interest_predictions)

{0: 0.4833104464200692, 1: 0.30961076166630996, 2: 0.0}


In [38]:
user_interest_predictions2 = {}

for user_id, profile in user_profiles.items():
    similarity = cosine_similarity([profile], new_item_vector2)
    user_interest_predictions2[user_id] = similarity[0][0]
    
print(user_interest_predictions2)

{0: 0.5554324192872526, 1: 0.14573244072709848, 2: 0.0}
