In [41]:
# Progress message displayed before the dependency-installation cell runs.
print("Installing requirements....")

Installing requirements....


In [42]:
%pip install faker

Note: you may need to restart the kernel to use updated packages.


In [43]:
import pandas as pd
from faker import Faker
import random
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer
import polars as pl
import numpy as np
import pyarrow.parquet as pq
import uuid


In [44]:
def init_profil():
    """Return the table of reference visitor profiles.

    Each profile has a label and two lists of POI ids: the ones that kind of
    visitor likes and the ones it dislikes.

    Returns
    -------
    pandas.DataFrame
        One row per profile with the columns ``"Nom"`` (label), ``"pos"``
        (liked ids) and ``"neg"`` (disliked ids), in declaration order.
    """
    # (label, liked POI ids, disliked POI ids)
    catalogue = (
        (
            "Fan de la nature",
            [161902431, 6256757776, 5771053254, 8731063849, 9609413254, 8615466894, 9356148774, 304351397, 304374138],
            [304850843, 9964615559, 6264550437, 2514192427, 11186120804, 9663434784],
        ),
        (
            "Historique (médieval)",
            [9964615559, 2080454089, 8309291920, 9767828163, 10815282268, 11006240930, 6635074831, 205161375, 312031207, 9964615559],
            [267878387, 6264550437, 6256757776, 304882303, 310235557],
        ),
        (
            "Monument typique (Grand lieu)",
            [6580307576, 249292417, 267878387],
            [8731063849, 8731063849, 6635074831, 9767828163],
        ),
        (
            "Historique (19,20ème)",
            [267878387, 6264550437, 2514258897, 2084125343, 407714718, 34050681, 271394429, 310235557, 368996291, 393969161],
            [312415707, 291231959, 9356148774, 11186120804, 6256757776, 34050692, 304374138],
        ),
        (
            "Artistique",
            [2514192427, 9663434784, 251316201, 11186120804, 291231959],
            [9356148774, 407714718, 11006240930, 9964615559, 310235557, 393969161],
        ),
        (
            "Antiquité",
            [4729709152, 874341418, 9864187809, 34050692],
            [267878387, 9609413254, 271394429, 6635074831, 6580307576],
        ),
        (
            "Fan de musée",
            [34050681, 393969161, 34050692, 160079640, 251316201, 252382829, 291231959, 311512162, 34050692],
            [9609413254, 6580307576, 2084125343, 407714718],
        ),
        (
            "Histoire (général)",
            [251316201, 252382829, 258816379, 291232006, 304882303, 310235557],
            [6256757776, 5771053254, 2514192427, 6580307576],
        ),
        (
            "Religieux",
            [251466390, 304850843, 307675986, 304850843],
            [251316201],
        ),
    )

    records = [
        {"Nom": label, "pos": liked, "neg": disliked}
        for label, liked, disliked in catalogue
    ]
    return pd.DataFrame(records)


In [45]:
def init_user(df_profil, nb_profil=None):
    """Build one synthetic user per profile.

    Every user gets a Faker-generated last name and first name and is bound
    to exactly one profile, taken in the row order of ``df_profil``. Only the
    names are random; the profile assignment is deterministic.

    Parameters
    ----------
    df_profil : pandas.DataFrame
        Profile table with the columns ``"Nom"``, ``"pos"`` and ``"neg"``.
    nb_profil : int, optional
        Number of users to create, using the first ``nb_profil`` rows of
        ``df_profil``. Defaults to every row, so the function no longer
        depends on the table having exactly nine profiles.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Id"`` (0..n-1), ``"Nom"``, ``"Prénom"``, ``"Profil"``,
        ``"Pos"`` and ``"Neg"``. ``"Pos"``/``"Neg"`` hold the liked/disliked
        lists of the profile on the same row of ``df_profil``.

    Raises
    ------
    ValueError
        If ``nb_profil`` is negative or larger than the number of profiles.
    """
    available = len(df_profil)
    if nb_profil is None:
        nb_profil = available
    elif not 0 <= nb_profil <= available:
        raise ValueError(
            f"nb_profil must be between 0 and {available}, got {nb_profil}"
        )

    fake = Faker()

    # Positional slicing: works whatever index df_profil carries, unlike the
    # label-based df_profil['Nom'][i] lookup.
    selected = df_profil.iloc[:nb_profil]

    # All last names are drawn before the first names so a seeded Faker
    # yields the same sequence as before.
    noms = [fake.last_name() for _ in range(nb_profil)]
    prenoms = [fake.first_name() for _ in range(nb_profil)]

    # Users and profiles are aligned row by row, so the liked/disliked lists
    # can be copied across directly instead of re-filtering by profile name.
    return pd.DataFrame({
        "Id": list(range(nb_profil)),
        "Nom": noms,
        "Prénom": prenoms,
        "Profil": selected["Nom"].tolist(),
        "Pos": selected["pos"].tolist(),
        "Neg": selected["neg"].tolist(),
    })



In [46]:
def add_quokka(df_user, pos, neg):
    """Return ``df_user`` with the target user "Quokka" appended as a new row.

    The row is built with ``pd.concat`` because ``DataFrame.append`` is
    deprecated and no longer exists in pandas 2.x.

    Parameters
    ----------
    df_user : pandas.DataFrame
        Existing users (columns ``Id``, ``Nom``, ``Prénom``, ``Profil``,
        ``Pos``, ``Neg``). It is not modified.
    pos : list
        POI ids liked by Quokka, stored as-is in the ``"Pos"`` cell.
    neg : list
        POI ids disliked by Quokka, stored as-is in the ``"Neg"`` cell.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index whose last row is Quokka
        (``Id`` 10, empty ``Profil``).
    """
    # A one-element list of records keeps `pos`/`neg` as single list-valued
    # cells instead of expanding them into several rows.
    quokka_row = pd.DataFrame([{
        "Id": 10,
        "Nom": "Quokka",
        "Prénom": "Quokka",
        "Profil": "",  # Quokka has no predefined profile
        "Pos": pos,
        "Neg": neg,
    }])
    return pd.concat([df_user, quokka_row], ignore_index=True)


In [47]:

def init_poi_data(path="../../data/transformed/poi_clean_category_geo.parquet"):
    """Load the POI table from a parquet file as a pandas DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the parquet file. Defaults to the project's cleaned POI
        file, so existing zero-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        The file's content without the ``"type"`` and ``"geometry"`` columns.
    """
    df_geo = pl.read_parquet(path)
    # The file is read with polars, then the result is handed over as pandas.
    return df_geo.drop(["type", "geometry"]).to_pandas()


In [48]:

def transform_to_user_item_interactions(df, df_poi):
    """Flatten per-user liked/disliked POI lists into one row per interaction.

    Parameters
    ----------
    df : pandas.DataFrame
        Users with the columns ``"Id"``, ``"Nom"``, ``"Profil"``, ``"Pos"``
        (liked POI ids) and ``"Neg"`` (disliked POI ids).
    df_poi : pandas.DataFrame
        POI table with the columns ``"id"``, ``"name"``, ``"sub_category"``
        and ``"category"``. Any index is accepted.

    Returns
    -------
    pandas.DataFrame
        Columns ``Id, User, Profil, POI-id, POI-name, POI-SubCat, POI-Cat,
        Liked (0 or 1)``. For each user the liked POIs (flag 1) come first,
        followed by the disliked ones (flag 0).

    Raises
    ------
    IndexError
        If a POI id listed for a user is absent from ``df_poi``.
    """
    columns = ["Id", "User", "Profil", "POI-id", "POI-name", "POI-SubCat", "POI-Cat", "Liked (0 or 1)"]
    interactions = []

    for _, row in df.iterrows():
        user_id = row["Id"]
        user = row["Nom"]
        profile = row["Profil"]

        # Liked POIs are emitted before disliked ones, with flags 1 and 0.
        for poi_ids, liked in ((row["Pos"], 1), (row["Neg"], 0)):
            for poi_id in poi_ids:
                matches = df_poi.loc[df_poi["id"] == poi_id]
                if matches.empty:
                    raise IndexError(
                        f"POI id {poi_id!r} (user {user!r}) not found in df_poi"
                    )
                # Take the first match by position within the filtered frame;
                # feeding index labels to .iloc is only right for a default
                # RangeIndex.
                poi = matches.iloc[0]
                interactions.append((
                    user_id,
                    user,
                    profile,
                    poi["id"],
                    poi["name"],
                    poi["sub_category"],
                    poi["category"],
                    liked,
                ))

    return pd.DataFrame(interactions, columns=columns)



In [95]:
def find_similarity(df_user_transformed, df_user):
    """Return the detail rows of the user ranked most similar to user 10.

    Parameters
    ----------
    df_user_transformed : pandas.DataFrame
        Interaction table with at least the columns ``"Id"`` and ``"User"``;
        every column from position 7 onwards is used as the numeric vector.
        Presumably the output of ``transform_to_user_item_interactions`` --
        verify against the caller.
    df_user : pandas.DataFrame
        User table; it is joined on its ``"Nom"`` column.

    Returns
    -------
    pandas.DataFrame
        Rows of the merged similarity/user table belonging to the user with
        the highest count of similarity value 1.

    Notes
    -----
    Prints the per-user similarity counts and the selected user name.
    """
    # scikit-learn is imported here rather than at the top of the notebook.
    from sklearn.metrics.pairwise import cosine_similarity

    # Take the SECOND row (.iloc[1]) among the rows of the target user (Id 10).
    # NOTE(review): this needs at least two rows for Id 10 and ignores the
    # others -- confirm that using only this row is intended.
    target_user_row = df_user_transformed[df_user_transformed["Id"] == 10].iloc[1]

    # Select the rows corresponding to other users
    other_users_rows = df_user_transformed[df_user_transformed["Id"] != 10]

    # Extract the interaction vectors for the target user and other users.
    # NOTE(review): if the table has exactly the eight columns built by
    # transform_to_user_item_interactions, each vector has one component
    # (the liked flag), so the cosine similarity cannot express more than
    # that flag -- TODO confirm this is the intended metric.
    target_user_vector = target_user_row.iloc[7:].values.reshape(1, -1)  # 7 corresponds to the index of the "Liked (0 or 1)" column
    other_users_vectors = other_users_rows.iloc[:, 7:].values

    # Compute cosine similarity between the target row and every other row
    similarities = cosine_similarity(target_user_vector, other_users_vectors)

    # One similarity value per row of the other users, labelled by user name
    similarity_df = pd.DataFrame({
        "User": other_users_rows["User"].values,
        "Similarity": similarities[0]
    })

    # Sort the DataFrame by similarity in descending order
    similarity_df = similarity_df.sort_values(by="Similarity", ascending=False)

    # Display the result (kept disabled)
    #print(similarity_df)

    # Merge similarity_df with df_user to get the details of similar users.
    # NOTE(review): the join key is the last name ("User" == "Nom"); two users
    # sharing a name would presumably be mixed together -- verify.
    similar_users_details = pd.merge(similarity_df, df_user, left_on="User", right_on="Nom", how="inner")

    # Count, per user, how many rows carry each distinct similarity value
    user_similarity_counts = (
        similar_users_details.groupby(["User", "Similarity"]).size().unstack(fill_value=0)
    )

    # Sort the DataFrame by the count of 1 in descending order.
    # NOTE(review): this relies on a column labelled 1 existing, i.e. on at
    # least one row having similarity exactly 1 -- confirm.
    user_similarity_counts_sorted = user_similarity_counts.sort_values(by=1, ascending=False)
    print(user_similarity_counts_sorted)
    # Get the user with the most 1 values; despite the variable name this is
    # the "User" group label (a name), not the numeric Id.
    most_similar_user_id = user_similarity_counts_sorted.index[0]
    print(most_similar_user_id)
    # Get the details of the most similar user
    most_similar_user_details = similar_users_details[similar_users_details["User"] == most_similar_user_id]

    return most_similar_user_details


In [96]:
def main_collaborative(pos, neg):
    """Run the whole collaborative pipeline for the target user.

    Builds the profiles and the synthetic users, appends the target user with
    the given liked (``pos``) and disliked (``neg``) POI ids, loads the POI
    table, expands everything into interactions and returns whatever
    ``find_similarity`` reports as the most similar user. A progress message
    is printed before each stage.
    """
    print("Init profil....")
    profiles = init_profil()

    print("Init user....")
    users = add_quokka(init_user(profiles), pos, neg)

    print("Init POI....")
    poi_table = init_poi_data()
    interactions = transform_to_user_item_interactions(users, poi_table)

    print("Find similariry....")
    return find_similarity(interactions, users)

In [97]:
# POI ids liked (pos) and disliked (neg) by the target user; main_collaborative
# forwards them to add_quokka, and they are later looked up in the POI table.
pos = [26860223, 31425173]
neg = [31575884]
# Run the full pipeline and keep the rows describing the most similar user.
user_similarity = main_collaborative(pos, neg)

Init profil....
Init user....
Init POI....
Find similariry....
Similarity  0.0  1.0
User                
Anderson      7   10
Hines         5   10
Gomez         6    9
Mccoy         4    9
York          4    6
King          6    5
Rhodes        1    4
Whitaker      5    4
West          4    3
Anderson
Détails de l'utilisateur le plus similaire (avec le plus de 1 dans Similarity):
User                                                   Anderson
Similarity                                                  1.0
Id                                                            3
Nom                                                    Anderson
Prénom                                                   Justin
Profil                                    Historique (19,20ème)
Pos           [267878387, 6264550437, 2514258897, 2084125343...
Neg           [312415707, 291231959, 9356148774, 11186120804...
Name: 25, dtype: object


  df_user = df_user.append({
