In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Load the datasets
games_updated_df = pd.read_csv('Downloads/Games_Updated.csv')
steam_df = pd.read_csv('Downloads/steam.csv')
vgsales_df = pd.read_csv('Downloads/vgsales-2.csv')

# Extract unique game names
games_updated_names = set(games_updated_df['Game_Name'].str.lower())
steam_names = set(steam_df['Game_Name'].str.lower())
vgsales_names = set(vgsales_df['Game_Name'].str.lower())

# Function to remove years from names
def remove_years(name):
    return re.sub(r'\b(19|20)?\d{2}\b', '', name).strip()

# Apply the year removal and collect unique names
all_game_names = list(set(remove_years(name) for name in games_updated_names | steam_names | vgsales_names))

# Vectorize names using TF-IDF
vectorizer = TfidfVectorizer().fit_transform(all_game_names)
cosine_similarities = cosine_similarity(vectorizer)

# Set threshold for similarity
SIMILARITY_THRESHOLD = 0.8  # Cosine similarity threshold

# Identify similar names based on cosine similarity
similar_names = {}
for idx, name in enumerate(all_game_names):
    similar_indices = cosine_similarities[idx] >= SIMILARITY_THRESHOLD
    similar_matches = [all_game_names[i] for i in range(len(all_game_names)) if similar_indices[i] and i != idx]
    if similar_matches:
        similar_names[name] = similar_matches

# Display results
for base_name, matches in list(similar_names.items())[:10]:  # Show first 10 matches for readability
    print(f"Base Name: {base_name}")
    print("Similar Names:")
    for match in matches:
        print(f" - {match}")
    print()


Base Name: realms of arkania 1 - blade of destiny classic
Similar Names:
 - realms of arkania: blade of destiny

Base Name: raw danger
Similar Names:
 - raw danger!

Base Name: the rift
Similar Names:
 - rift

Base Name: evil glitch
Similar Names:
 - glitch

Base Name: fate/tiger colosseum upper
Similar Names:
 - fate/tiger colosseum

Base Name: metal slug anthology
Similar Names:
 - metal slug 7
 - metal slug 4 & 5
 - metal slug
 - metal slug x
 - metal slug 2
 - metal slug 3

Base Name: blade arcus from shining ex
Similar Names:
 - blade arcus from shining: battle arena

Base Name: victoria ii
Similar Names:
 - victoria

Base Name: insane
Similar Names:
 - insane 2
 - insane road

Base Name: time
Similar Names:
 - time in time

