In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import warnings

warnings.filterwarnings("ignore")


In [9]:
def load_and_merge_data(filepath1, filepath2):
    """
    Load and merge steam.csv and steam_description_data.csv on appid.
    """
    # Load datasets
    steam_data = pd.read_csv(filepath1)
    description_data = pd.read_csv(filepath2)

    # Merge datasets on appid
    merged_data = pd.merge(steam_data, description_data, left_on='appid', right_on='steam_appid')

    # Drop duplicates and handle missing values
    merged_data.drop_duplicates(subset=['appid'], inplace=True)
    merged_data.dropna(subset=['detailed_description', 'genres'], inplace=True)
    
    return merged_data

# Example Usage
data = load_and_merge_data('steam.csv', 'steam_description_data.csv')
print(data.head())


   appid                       name release_date  english         developer  \
0     10             Counter-Strike   2000-11-01        1             Valve   
1     20      Team Fortress Classic   1999-04-01        1             Valve   
2     30              Day of Defeat   2003-05-01        1             Valve   
3     40         Deathmatch Classic   2001-06-01        1             Valve   
4     50  Half-Life: Opposing Force   1999-11-01        1  Gearbox Software   

  publisher          platforms  required_age  \
0     Valve  windows;mac;linux             0   
1     Valve  windows;mac;linux             0   
2     Valve  windows;mac;linux             0   
3     Valve  windows;mac;linux             0   
4     Valve  windows;mac;linux             0   

                                          categories  genres  ...  \
0  Multi-player;Online Multi-Player;Local Multi-P...  Action  ...   
1  Multi-player;Online Multi-Player;Local Multi-P...  Action  ...   
2              Multi-player;V

In [12]:
import re

def preprocess_text(text):
    """
    Clean and normalize text data.
    """
    # Convert to lowercase
    text = text.lower()
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove non-alphanumeric characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

# Apply preprocessing
data['processed_description'] = data['detailed_description'].apply(preprocess_text)


In [16]:
def create_tfidf_features(text_data):
    """
    Create TF-IDF vectors from game descriptions.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    tfidf_matrix = vectorizer.fit_transform(text_data)
    return tfidf_matrix

# Generate TF-IDF matrix
tfidf_matrix = create_tfidf_features(data['processed_description'])


In [18]:
def create_hybrid_features(tfidf_matrix, metadata):
    """
    Combine text vectors with metadata features.
    """
    # One-hot encode genres
    genres_onehot = pd.get_dummies(metadata['genres'], prefix='genre')
    
    # Combine TF-IDF and one-hot encoding
    hybrid_features = np.hstack((tfidf_matrix.toarray(), genres_onehot.values))
    return hybrid_features

# Generate hybrid features
hybrid_features = create_hybrid_features(tfidf_matrix, data)


In [20]:
def compute_similarity(feature_matrix):
    """
    Compute similarity using cosine similarity.
    """
    similarity_matrix = cosine_similarity(feature_matrix)
    return similarity_matrix

# Compute similarity matrix
similarity_matrix = compute_similarity(tfidf_matrix)


In [26]:
def get_recommendations(game_id, data, similarity_matrix, n=5):
    """
    Generate top-N similar game recommendations.
    """
    # Find index of the game by appid
    game_index = data.index[data['appid'] == game_id][0]
    similarity_scores = similarity_matrix[game_index]
    similar_games = sorted(list(enumerate(similarity_scores)), key=lambda x: x[1], reverse=True)
    recommended_indices = [i[0] for i in similar_games[1:n+1]]  # Exclude the game itself
    return data.iloc[recommended_indices][['appid', 'name', 'genres']]

# Example recommendations
recommendations = get_recommendations(game_id=730, data=data, similarity_matrix=similarity_matrix, n=5)
print(recommendations)


        appid                          name  \
115      3900  Sid Meier's Civilization® IV   
9         220                   Half-Life 2   
12063  557040                       99Vidas   
1916   248570                      Toribash   
831     45700               Devil May Cry 4   

                                   genres  
115                              Strategy  
9                                  Action  
12063                        Action;Indie  
1916   Action;Free to Play;Indie;Strategy  
831                      Action;Adventure  


In [28]:
def evaluate_performance():
    """
    Evaluate computation time, memory usage, and quality of recommendations.
    """
    # Placeholder: You can log performance metrics here
    print("TF-IDF Vectorization: Done")
    print("Similarity Computation: Done")
    print("Recommendations Generated: Done")


In [30]:
def show_example_recommendations():
    """
    Show recommendations for a few sample games.
    """
    for appid in [730, 440, 570]:  # Example appids
        print(f"Recommendations for Game ID {appid}:")
        print(get_recommendations(appid, data, similarity_matrix, n=5))
        print("\n")


In [35]:
show_example_recommendations()

Recommendations for Game ID 730:
        appid                          name  \
115      3900  Sid Meier's Civilization® IV   
9         220                   Half-Life 2   
12063  557040                       99Vidas   
1916   248570                      Toribash   
831     45700               Devil May Cry 4   

                                   genres  
115                              Strategy  
9                                  Action  
12063                        Action;Indie  
1916   Action;Free to Play;Indie;Strategy  
831                      Action;Adventure  


Recommendations for Game ID 440:
         appid                                               name  \
26561  1025070   淑女同萌！-New Division- / Hello Lady! -New Division-   
6031    380840                                          Teeworlds   
1           20                              Team Fortress Classic   
139       4780                   Medieval II: Total War™ Kingdoms   
667      35450  Red Orchestra 2: Heroes 