In [21]:
# ----------------------------------------------------------------------------
# Title: Assignment 1.2
# Author: Surenther Selvaraj
# Date: 13 September 2025
# Modified By: Surenther Selvaraj
# Description: Exploring a Pandas Data Frame
# ----------------------------------------------------------------------------

In [22]:
#Load Data from Kaggle
import pandas as pd
import numpy as np
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the CSV file to read from within the Kaggle dataset
file_name = "Video_Games_Sales_as_at_22_Dec_2016.csv"

# Kaggle handle identifying the dataset that contains the file
dataset_path = "rush4ratio/video-game-sales-with-ratings"


In [23]:
# Read the configured Kaggle file straight into a pandas DataFrame
df = kagglehub.dataset_load(KaggleDatasetAdapter.PANDAS, dataset_path, file_name)

# Preview the first ten rows to confirm the load worked
preview_rows = df.head(10)
print(preview_rows)

                        Name Platform  Year_of_Release         Genre  \
0                 Wii Sports      Wii           2006.0        Sports   
1          Super Mario Bros.      NES           1985.0      Platform   
2             Mario Kart Wii      Wii           2008.0        Racing   
3          Wii Sports Resort      Wii           2009.0        Sports   
4   Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing   
5                     Tetris       GB           1989.0        Puzzle   
6      New Super Mario Bros.       DS           2006.0      Platform   
7                   Wii Play      Wii           2006.0          Misc   
8  New Super Mario Bros. Wii      Wii           2009.0      Platform   
9                  Duck Hunt      NES           1984.0       Shooter   

  Publisher  NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  \
0  Nintendo     41.36     28.96      3.77         8.45         82.53   
1  Nintendo     29.08      3.58      6.81         0.77         

In [24]:
# Size of the DataFrame as a (rows, columns) tuple
dimensions = (len(df.index), len(df.columns))
print(f"The dimensions of the DataFrame are: {dimensions}")

The dimensions of the DataFrame are: (16719, 16)


The first number, 16719, represents the number of rows. In this dataset, each row corresponds to a single video game and its associated sales data and ratings.

The second number, 16, represents the number of columns. Each column represents a specific attribute or feature of the video game, such as its name, platform, release year, genre, publisher, and various sales figures.

In [25]:
# Rank games by critic score, highest first, and keep the top five.
# Rows without a 'Critic_Score' are excluded up front so missing values
# cannot take part in the ranking.
has_critic_score = df['Critic_Score'].notna()
ranked_by_critic = df[has_critic_score].sort_values('Critic_Score', ascending=False)
top_five_games = ranked_by_critic.iloc[:5]

# Show only the title and its score
print("Top five games by critic score:")
print(top_five_games[['Name', 'Critic_Score']])

Top five games by critic score:
                          Name  Critic_Score
227   Tony Hawk's Pro Skater 2          98.0
57         Grand Theft Auto IV          98.0
51         Grand Theft Auto IV          98.0
5350               SoulCalibur          98.0
165         Grand Theft Auto V          97.0


In [26]:
# Tally how many games fall under each genre
genre_column = df.loc[:, 'Genre']
genre_counts = genre_column.value_counts()

# Show the tally
print("Number of video games in each genre:")
print(genre_counts)

Number of video games in each genre:
Genre
Action          3370
Sports          2348
Misc            1750
Role-Playing    1500
Shooter         1323
Adventure       1303
Racing          1249
Platform         888
Simulation       874
Fighting         849
Strategy         683
Puzzle           580
Name: count, dtype: int64


In [27]:
# Keep only SNES titles, then order them from earliest to latest release year
is_snes = df['Platform'].eq('SNES')
snes_games = df.loc[is_snes].sort_values('Year_of_Release')

# Show the five earliest entries
print("First five games on the SNES platform, sorted by release year:")
print(snes_games.head(5))

First five games on the SNES platform, sorted by release year:
                   Name Platform  Year_of_Release       Genre Publisher  \
18    Super Mario World     SNES           1990.0    Platform  Nintendo   
1195        Final Fight     SNES           1990.0      Action    Capcom   
511              F-Zero     SNES           1990.0      Racing  Nintendo   
1791         Pilotwings     SNES           1990.0  Simulation  Nintendo   
1111      Super Scope 6     SNES           1991.0     Shooter  Nintendo   

      NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
18       12.78      3.75      3.54         0.55         20.61           NaN   
1195      0.67      0.17      0.69         0.03          1.56           NaN   
511       1.37      0.51      0.89         0.07          2.85           NaN   
1791      0.46      0.17      0.48         0.02          1.14           NaN   
1111      1.06      0.38      0.15         0.05          1.65           NaN   

      Criti

In [28]:
# Add up global sales per publisher
publisher_sales = df['Global_Sales'].groupby(df['Publisher']).sum()

# Rank publishers from highest to lowest total and keep the first five
ranked_publishers = publisher_sales.sort_values(ascending=False)
top_five_publishers = ranked_publishers.iloc[:5]

# Show the ranking
print("Top five publishers by total global sales:")
print(top_five_publishers)

Top five publishers by total global sales:
Publisher
Nintendo                       1788.81
Electronic Arts                1116.96
Activision                      731.16
Sony Computer Entertainment     606.48
Ubisoft                         471.61
Name: Global_Sales, dtype: float64


In [29]:
# Calculate the percentage of global sales that came from North America.
#
# Division by zero needs explicit handling: in pandas, x / 0 evaluates to
# +/-inf (only 0 / 0 evaluates to NaN), so .fillna(0) on its own would let inf
# values through. Masking zero denominators to NaN before dividing makes every
# row with Global_Sales == 0 come out as NaN, which fillna(0) then sets to 0.
global_sales_nonzero = df['Global_Sales'].mask(df['Global_Sales'] == 0)
df['NA_Sales_Percentage'] = (df['NA_Sales'] / global_sales_nonzero).fillna(0) * 100

# Display the first five rows of the updated DataFrame
print("First five rows with the new 'NA_Sales_Percentage' column:")
print(df.head())


First five rows with the new 'NA_Sales_Percentage' column:
                       Name Platform  Year_of_Release         Genre Publisher  \
0                Wii Sports      Wii           2006.0        Sports  Nintendo   
1         Super Mario Bros.      NES           1985.0      Platform  Nintendo   
2            Mario Kart Wii      Wii           2008.0        Racing  Nintendo   
3         Wii Sports Resort      Wii           2009.0        Sports  Nintendo   
4  Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing  Nintendo   

   NA_Sales  EU_Sales  JP_Sales  Other_Sales  Global_Sales  Critic_Score  \
0     41.36     28.96      3.77         8.45         82.53          76.0   
1     29.08      3.58      6.81         0.77         40.24           NaN   
2     15.68     12.76      3.79         3.29         35.52          82.0   
3     15.61     10.93      3.28         2.95         32.77          80.0   
4     11.27      8.89     10.22         1.00         31.37           NaN  

In [30]:
# Count the missing (NaN) entries column by column
missing_mask = df.isna()
nan_counts = missing_mask.sum()

# Show the per-column totals
print("Number of missing values (NaN) in each column:")
print(nan_counts)

Number of missing values (NaN) in each column:
Name                      2
Platform                  0
Year_of_Release         269
Genre                     2
Publisher                54
NA_Sales                  0
EU_Sales                  0
JP_Sales                  0
Other_Sales               0
Global_Sales              0
Critic_Score           8582
Critic_Count           8582
User_Score             6704
User_Count             9129
Developer              6623
Rating                 6769
NA_Sales_Percentage       0
dtype: int64


In [31]:
# Convert 'User_Score' to numeric values. errors='coerce' turns 'tbd' and any
# other non-numeric placeholder into NaN instead of raising, so a single
# unexpected string in the column cannot abort the cell.
user_score_numeric = pd.to_numeric(df['User_Score'], errors='coerce')

# Median of the valid scores (Series.median skips NaN by default)
median_user_score = user_score_numeric.median()

# Fill every missing score with the median and write the cleaned column back
df['User_Score'] = user_score_numeric.fillna(median_user_score)

# Display the updated user score column and the calculated median for verification
print(f"Calculated median user score: {median_user_score}")
print("\nFirst 10 entries of the updated 'User_Score' column:")
print(df['User_Score'].head(10))
print("\nCheck for remaining NaN values (should be 0):")
print(df['User_Score'].isnull().sum())

Calculated median user score: 7.5

First 10 entries of the updated 'User_Score' column:
0    8.0
1    7.5
2    8.3
3    8.0
4    7.5
5    7.5
6    8.5
7    6.6
8    8.4
9    7.5
Name: User_Score, dtype: float64

Check for remaining NaN values (should be 0):
0
