In [28]:
import pandas as pd

# Load the dataset
file_path = 'vgsales.csv'
df = pd.read_csv(file_path)

# Check the first 5 rows
df.head()


Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


In [31]:
# Check dataset information
df.info()

# Summary statistics
df.describe()

# Check for null values
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16598 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16598 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB


Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

In [30]:
# Null Value Treatment:
# Fill numeric columns with median, categorical with mode
for column in df.columns:
    if df[column].dtype == 'object':
        df[column] = df[column].fillna(df[column].mode()[0])
    else:
        df[column] = df[column].fillna(df[column].median())

# Check again
df.isnull().sum()


Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

In [79]:
import numpy as np
from scipy import stats

# Check for outliers in numeric columns
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()

# Z-score method - Flag values beyond 3 standard deviations
z_scores = np.abs(stats.zscore(df[numeric_columns]))

# Find rows with outliers
outliers = (z_scores > 3).any(axis=1)
print(f"Number of outliers detected: {outliers.sum()}")

# Option 1 - Remove outliers
df = df[~outliers]

# Option 2 - Cap outliers to the 5th and 95th percentile (Winsorization)
# for column in numeric_columns:
#     lower = df[column].quantile(0.05)
#     upper = df[column].quantile(0.95)
#     df[column] = np.clip(df[column], lower, upper)

# Re-check distribution
df.describe()


Number of outliers detected: 0


Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,6186.0,6186.0,6186.0,6186.0,6186.0,6186.0,6186.0
mean,11519.33301,2006.571452,0.066458,0.021482,0.0,0.006166,0.094969
std,2782.676155,4.443464,0.056625,0.021247,0.0,0.006683,0.066904
min,5963.0,1994.0,0.0,0.0,0.0,0.0,0.01
25%,9247.5,2003.0,0.02,0.0,0.0,0.0,0.04
50%,11490.5,2007.0,0.05,0.02,0.0,0.01,0.08
75%,13820.75,2010.0,0.1,0.03,0.0,0.01,0.14
max,16600.0,2016.0,0.23,0.08,0.0,0.02,0.29


In [85]:
# Check categorical columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()

# Apply Label Encoding or One-Hot Encoding if needed
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# Check new structure
df.head()


Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Name_007: Quantum of Solace,Name_1 vs. 100,Name_10 Minute Solution,...,Publisher_Xseed Games,Publisher_Yacht Club Games,Publisher_Yuke's,Publisher_Zoo Digital Publishing,Publisher_Zoo Games,Publisher_Zushi Games,Publisher_bitComposer Games,Publisher_iWin,Publisher_inXile Entertainment,Publisher_responDESIGN
5961,5963,2006.0,0.21,0.06,0.0,0.02,0.29,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5968,5970,2003.0,0.21,0.08,0.0,0.01,0.29,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5975,5977,2002.0,0.22,0.06,0.0,0.01,0.29,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5986,5988,2005.0,0.23,0.06,0.0,0.01,0.29,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6009,6011,2005.0,0.22,0.06,0.0,0.01,0.29,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [94]:
# Check final shape and data types
df.info()

# Save the cleaned dataset if needed
# df.to_csv('cleaned_vgsales.csv', index=False)


<class 'pandas.core.frame.DataFrame'>
Index: 6186 entries, 5961 to 16597
Columns: 5131 entries, Rank to Publisher_responDESIGN
dtypes: bool(5124), float64(6), int64(1)
memory usage: 30.6 MB
