In [50]:
import os
import re
import string

import pandas as pd
import torch
from gensim.models.phrases import Phrases, Phraser
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer, util

# Function to reduce the memory usage of a DataFrame.
def reduce_memory(df):
    """Downcast 64-bit numeric columns of ``df`` to 32-bit types, in place.

    ``float64`` columns become ``float32`` (this trades precision for memory).
    ``int64`` columns become ``int32`` only when every value fits in the int32
    range; a column holding larger values is left as ``int64`` so its values
    are not corrupted by the narrowing cast. Columns of any other dtype are
    left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient chaining.
    """
    int32_min, int32_max = -2**31, 2**31 - 1
    for col in df.columns:
        column = df[col]
        if column.dtype == 'float64':
            df[col] = column.astype('float32')
        elif column.dtype == 'int64':
            # An empty column has no min/max to inspect and is trivially safe to narrow.
            fits_int32 = column.empty or (
                column.min() >= int32_min and column.max() <= int32_max
            )
            if fits_int32:
                df[col] = column.astype('int32')
    return df

# Generator function to walk a DataFrame in consecutive row chunks.
def data_generator(df, chunksize=10000):
    """Yield successive positional row slices of ``df``.

    Each yielded slice holds at most ``chunksize`` rows; the final slice
    carries whatever rows remain. Nothing is yielded for an empty frame.
    """
    n_rows = df.shape[0]
    for start in range(0, n_rows, chunksize):
        stop = start + chunksize
        yield df.iloc[start:stop]

# Load the dataset, then shrink its numeric columns via reduce_memory.
df = pd.read_csv("data/Dataset.csv")
df = reduce_memory(df)

# Keep only the title and description columns, dropping rows that repeat the same pair.
df_s2v = df.loc[:, ['Game', 'About the game']].drop_duplicates()

def _get_preprocess_resources():
    """Return the (stop-word set, stemmer) pair, building it on first use only."""
    cached = getattr(_get_preprocess_resources, "_cached", None)
    if cached is None:
        cached = (frozenset(stopwords.words('english')), PorterStemmer())
        _get_preprocess_resources._cached = cached
    return cached


def preprocess_description(description):
    """Turn a raw game description into a list of stemmed tokens.

    Steps: drop every character that is not an ASCII letter, digit or
    whitespace, lowercase the text, tokenize it with ``word_tokenize``,
    discard English stop words, and stem what is left with ``PorterStemmer``.

    Parameters
    ----------
    description : str or missing value
        Raw description. Non-string values are converted with ``str()``.

    Returns
    -------
    list of str
        The stemmed tokens; an empty list when ``description`` is missing
        (``None``/NaN), so the return type is always a list.

    Notes
    -----
    NOTE(review): the regex deletes punctuation rather than replacing it with
    a space, so "award-winning" becomes the single token "awardwinning" before
    stemming. It also strips all non-ASCII letters. Confirm this is intended
    before changing it, since it alters every downstream token.
    """
    if pd.isna(description):
        return []

    # The regex already removes all ASCII punctuation, so no separate
    # punctuation-stripping pass is needed; lowercasing once here also makes a
    # per-token lower() redundant.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', str(description)).lower()

    # Reuse one stop-word set and one stemmer across calls instead of
    # rebuilding them for every row.
    stop_words, stemmer = _get_preprocess_resources()
    return [
        stemmer.stem(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]

# Run preprocess_description over every description in the column.
df_s2v['About the game'] = df_s2v['About the game'].apply(preprocess_description)

# Optional bigram detection (currently disabled):
# phrases = Phrases(df_s2v['About the game'])
# bigram = Phraser(phrases)
# df_s2v['About the game'] = df_s2v['About the game'].apply(lambda x: bigram[x])

# Join each description's tokens back into one space-separated string.
df_s2v['About the game'] = df_s2v['About the game'].apply(' '.join)

df_s2v.head()

Unnamed: 0,Game,About the game
0,The Elder Scrolls V Skyrim,epic fantasi reborn next chapter highli antici...
677,Fallout 4,bethesda game studio awardwin creator fallout ...
844,Fallout New Vegas,welcom vega new vega kind town dig grave prior...
1131,Left 4 Dead 2,set zombi apocalyps left 4 dead 2 l4d2 highli ...
1932,HuniePop,huniepop uniqu sim experi pc mac linux gamepla...


In [65]:
# Load the sentence-embedding model from the local copy when it exists.
# Otherwise fetch it by name and save it locally, so a fresh checkout runs
# without manually uncommenting download code.
MODEL_NAME = "sentence-transformers/distiluse-base-multilingual-cased-v2"
MODEL_DIR = './models/distiluse-base-multilingual-cased-v2'

if os.path.isdir(MODEL_DIR):
    loaded_model = SentenceTransformer(MODEL_DIR)
else:
    loaded_model = SentenceTransformer(MODEL_NAME)
    loaded_model.save(MODEL_DIR)

In [None]:
# Collect every preprocessed description and embed them all with the loaded model.
all_descriptions = df_s2v['About the game'].to_list()
all_embeddings = loaded_model.encode(all_descriptions)

In [54]:
# Only the last expression of a cell is echoed, so print the length
# explicitly instead of leaving it as a bare expression whose value is dropped.
print(len(all_descriptions[2]))
all_descriptions[2]

'welcom vega new vega kind town dig grave prior shot head left deadand that thing realli get ugli town dreamer desperado torn apart war faction vy complet control desert oasi place right kind person right kind weaponri realli make name make enemi two along way battl way across heatblast mojav wasteland coloss hoover dam neon drench vega strip youll introduc color cast charact powerhungri faction special weapon mutat creatur much choos side upcom war declar winner take crown king new vega followup 2008 videogam year fallout 3 enjoy stay key featur feel heat new vega even nuclear fallout could slow hustl sin citi explor vast expans desert wasteland small town dot mojav wasteland bright light new vega strip see great southwest could imagin fallout feud faction color charact host hostil war brew rival faction consequ chang live inhabit new vega choic make bring contact countless charact creatur alli foe determin final explos outcom epic power struggl new system enjoy new addit fallout new 

In [55]:
# Length of the first embedding, i.e. the number of components per vector.
len(all_embeddings[0])
# all_embeddings[0]

512

In [58]:
# Function returning the games most similar to a query embedding.
def get_recommendations(input_embedding, df_s2v, all_embeddings, top_n=15, exclude_self=True):
    """Rank games by cosine similarity between their embeddings and a query.

    Parameters
    ----------
    input_embedding : array-like
        Embedding vector of the query description.
    df_s2v : pandas.DataFrame
        Frame with a 'Game' column. Row ``i`` (by position) must correspond to
        ``all_embeddings[i]``.
    all_embeddings : array-like
        One embedding per row of ``df_s2v``, each as long as ``input_embedding``.
    top_n : int, default 15
        Maximum number of recommendations to return.
    exclude_self : bool, default True
        When True, the single best match is dropped on the assumption that it
        is the query game itself (the original behaviour). Pass False when the
        query embedding does not belong to a game in ``all_embeddings``,
        otherwise the best real match would be discarded.

    Returns
    -------
    list of (str, float)
        ``(game title, similarity)`` pairs, most similar first. Empty when
        ``top_n`` is not positive or there are no candidates.
    """
    if top_n <= 0:
        return []

    # as_tensor converts each input directly; wrapping the query in a Python
    # list first (torch.tensor([array])) forces an element-by-element copy.
    query = torch.as_tensor(input_embedding, dtype=torch.float32).reshape(1, -1)
    corpus = torch.as_tensor(all_embeddings, dtype=torch.float32)
    if corpus.numel() == 0:
        return []

    # Cosine similarity between the query and every game (query broadcasts over rows).
    similarities = torch.nn.functional.cosine_similarity(query, corpus, dim=1)

    # Positions of the most similar games, best first.
    ranked = similarities.argsort(descending=True)
    first = 1 if exclude_self else 0
    # Plain Python ints are a safe positional indexer for both pandas and torch.
    similar_indices = ranked[first:first + top_n].tolist()
    if not similar_indices:
        return []

    # Titles of the selected games and their similarity values.
    similar_games = df_s2v.iloc[similar_indices]['Game'].tolist()
    similarity_values = similarities[similar_indices].tolist()

    return list(zip(similar_games, similarity_values))

In [59]:
# Example usage
input_title = "Call of Duty Black Ops"

# Look the title up with a single .loc call and fail with an explicit message
# when it is absent, instead of an opaque IndexError from `.values[0]`.
matching_descriptions = df_s2v.loc[df_s2v['Game'] == input_title, 'About the game']
if matching_descriptions.empty:
    raise ValueError(f"Game not found in df_s2v: {input_title!r}")

# If several rows share the title, the first one is used.
input_description = matching_descriptions.iloc[0]
input_embedding = loaded_model.encode([input_description])[0]
recommendations = get_recommendations(input_embedding, df_s2v, all_embeddings)

print(f"Input Title: {input_title}")
print("Recommendations:")
for game, similarity in recommendations:
    print(f"{game} - Similarity: {similarity:.4f}")

Input Title: Call of Duty Black Ops
Recommendations:
Call of Duty Modern Warfare 3 - Similarity: 0.7037
Call of Duty - Similarity: 0.7022
Call of Duty 4 Modern Warfare - Similarity: 0.6817
Call of Duty World at War - Similarity: 0.6733
Return to Castle Wolfenstein - Similarity: 0.6448
Section 8 Prejudice - Similarity: 0.6409
Dishonored - Similarity: 0.6346
Star Wolves 3 Civil War - Similarity: 0.6327
Avadon The Black Fortress - Similarity: 0.6297
Enclave - Similarity: 0.6281
Project Snowblind - Similarity: 0.6275
Supreme Commander 2 - Similarity: 0.6215
Serious Sam 3 BFE - Similarity: 0.6181
Warhammer End Times - Vermintide - Similarity: 0.6170
SanctuaryRPG Black Edition - Similarity: 0.6146
