### Network Graph Visualization

In [2]:
import zipfile
import pandas as pd
import math
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [1]:

import networkx as nx
from networkx.drawing.layout import bipartite_layout
! pip install pyvis
from pyvis.network import Network

import random
from collections import Counter
from itertools import combinations



In [4]:
games_metadata_zip_path = 'data/games_metadata.json.zip'
games_zip_path = 'data/games.csv.zip'
recommendations_zip_path = 'data/recommendations.csv.zip'
users_zip_path = 'data/users.csv.zip'

games_metadata_json_filename = 'games_metadata.json'
games_csv_filename = 'games.csv'  
recommendations_csv_filename = 'recommendations.csv'  
users_csv_filename = 'users.csv'  

In [5]:
# Load JSON inside ZIP using lines=True
with zipfile.ZipFile(games_metadata_zip_path) as z:
    with z.open(games_metadata_json_filename) as f:
        games_metadata = pd.read_json(f, lines=True)

# Preview the dataframe
games_metadata.head()

Unnamed: 0,app_id,description,tags
0,13500,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
1,22364,,[Action]
2,113020,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
3,226560,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
4,249050,Dungeon of the Endless is a Rogue-Like Dungeon...,"[Roguelike, Strategy, Tower Defense, Pixel Gra..."


In [6]:
games_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50872 entries, 0 to 50871
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   app_id       50872 non-null  int64 
 1   description  50872 non-null  object
 2   tags         50872 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [7]:
# Open the users zip file
with zipfile.ZipFile(users_zip_path) as z:
    with z.open(users_csv_filename) as f:
        users= pd.read_csv(f)

users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14306064 entries, 0 to 14306063
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   user_id   int64
 1   products  int64
 2   reviews   int64
dtypes: int64(3)
memory usage: 327.4 MB


In [9]:
# Open the games zip file
with zipfile.ZipFile(games_zip_path) as z:
    with z.open(games_csv_filename) as f:
        games = pd.read_csv(f)

games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50872 entries, 0 to 50871
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app_id          50872 non-null  int64  
 1   title           50872 non-null  object 
 2   date_release    50872 non-null  object 
 3   win             50872 non-null  bool   
 4   mac             50872 non-null  bool   
 5   linux           50872 non-null  bool   
 6   rating          50872 non-null  object 
 7   positive_ratio  50872 non-null  int64  
 8   user_reviews    50872 non-null  int64  
 9   price_final     50872 non-null  float64
 10  price_original  50872 non-null  float64
 11  discount        50872 non-null  float64
 12  steam_deck      50872 non-null  bool   
dtypes: bool(4), float64(3), int64(3), object(3)
memory usage: 3.7+ MB


In [10]:
# Open the recommendations zip file
with zipfile.ZipFile(recommendations_zip_path) as z:
    with z.open(recommendations_csv_filename) as f:
        recommendations = pd.read_csv(f)

recommendations.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41154794 entries, 0 to 41154793
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7   review_id       int64  
dtypes: bool(1), float64(1), int64(5), object(1)
memory usage: 2.2+ GB


In [12]:
# Filter recommendations for "is_recommended = True"
filtered_recommendations = recommendations[recommendations['is_recommended']]

# Count recommendations per game
game_recommendation_counts = filtered_recommendations['app_id'].value_counts()
print('value is counted')

# Filter games with at least 150,000 recommendations
popular_games_index = game_recommendation_counts[game_recommendation_counts >= 150000].index
print('popular games are found')

# Filter recommendations to include only popular games
filtered_recommendations = filtered_recommendations[filtered_recommendations['app_id'].isin(popular_games_index)]
print('games are filtered')
print(filtered_recommendations.info())

# Group by user and create game pairs
user_groups = filtered_recommendations.groupby('user_id')['app_id'].apply(list)
game_pairs = [pair for apps in user_groups for pair in combinations(apps, 2)]

# Count occurrences of each game pair for edge thickness
edge_counts = Counter(game_pairs)

value is counted
popular games are found
games are filtered
<class 'pandas.core.frame.DataFrame'>
Index: 2738448 entries, 2 to 41152032
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   app_id          int64  
 1   helpful         int64  
 2   funny           int64  
 3   date            object 
 4   is_recommended  bool   
 5   hours           float64
 6   user_id         int64  
 7   review_id       int64  
dtypes: bool(1), float64(1), int64(5), object(1)
memory usage: 169.8+ MB
None


In [13]:
# Create a graph using NetworkX
G = nx.Graph()
G.add_edges_from(game_pairs)

# Add node attributes from the games.csv and recommendation count
for _, row in games.iterrows():
    if row['app_id'] in G:
        recommendation_count = game_recommendation_counts.get(row['app_id'], 0)
        G.nodes[row['app_id']]['title'] = row['title']
        G.nodes[row['app_id']]['size'] = recommendation_count  # Node size equals recommendation count

# Create PyVis Network
net = Network(notebook=True)  # Set notebook=True for Jupyter notebooks

for node_id in G.nodes:
    title = G.nodes[node_id].get('title', 'Unknown')
    size = int(G.nodes[node_id].get('size', 1))  # Convert to Python int

    # Add the node to PyVis
    net.add_node(
        int(node_id),  # Ensure node ID is also a native Python int
        label=title,
        size= size / 5000,  
        title=f"{title}\nRecommendations: {size}"
    )
    
 
# Add edges with game names and recommendation counts
for edge in G.edges:
    edge_count = edge_counts.get((edge[0], edge[1]), 1)  # Default to 1 if not found
    game_1_title = G.nodes[edge[0]].get('title', 'Unknown Game')
    game_2_title = G.nodes[edge[1]].get('title', 'Unknown Game')

    # Add edge with enhanced tooltip
    net.add_edge(
        int(edge[0]), 
        int(edge[1]),
        value=edge_count,  # Use edge count for thickness
        title=f"Recommended Together: {edge_count}\nGames: {game_1_title} ↔ {game_2_title}"
    )

# Configure the physics settings 
net.repulsion(
    node_distance=300,  # Distance between nodes
    central_gravity=0.05,  # Gravity towards the center
    spring_length=200,  # Spring length for edges
    spring_strength=0.05,  # Strength of the spring
    damping=0.9,  # Damping factor
)

# Show the graph
net.show("game_network.html")

game_network.html


In this PyVis network plot, each node represents a popular game (recommendation count > 150.000). Node sizes are depending on recommendation count of that game. The edges represent if the same user who recommended the game also recommended the other game. Thicker edges mean more recommendation by same user for both games. For example, people who recommended Witcher 3 also recommended Cyberpunk 2077 (9103 times).