# Import Dependencies

In [16]:
import pandas as pd
import os
from datetime import datetime
import numpy as np

# Read Video Games Data from CSV

In [17]:
# Set CSV file path
path = os.path.join('Output', 'Extracted_video_games_data.csv')

In [18]:
# Read games data from CSV file into Dataframe
games_df = pd.read_csv(path)
games_df

Unnamed: 0,Pos,Game,Game.1,Console,Publisher,Developer,VGChartz Score,Critic Score,User Score,Total Shipped,Total Sales,NA Sales,PAL Sales,Japan Sales,Other Sales,Release Date,Last Update,Genre
0,1,,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00m,,,,,,22nd Mar 05,04th Mar 20,Action
1,2,,Warriors,Series,KOEI,Omega Force,,,,45.26m,,,,,,30th Jun 97,24th Mar 20,Action
2,3,,Devil May Cry,Series,Capcom,Capcom,,,,22.00m,,,,,,16th Oct 01,03rd Feb 20,Action
3,4,,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00m,,,,,,,24th Mar 20,Action
4,5,,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,17th Sep 13,,Action
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58892,436,,World End Syndrome,PS4,Arc System Works,Arc System Works,,,,,,,,,,26th Apr 18,03rd Apr 19,Visual Novel
58893,437,,XBlaze Lost: Memories,PC,Aksys Games,Arc System Works,,,,,,,,,,11th Aug 16,28th Jan 19,Visual Novel
58894,438,,"Yoru, Tomosu",NS,Nippon Ichi Software,Nippon Ichi Software,,,,,,,,,,30th Jul 20,09th May 20,Visual Novel
58895,439,,"Yoru, Tomosu",PS4,Nippon Ichi Software,Nippon Ichi Software,,,,,,,,,,30th Jul 20,09th May 20,Visual Novel


# Delete games without sales data

In [19]:
# Find number of games without sales data
len(games_df.loc[games_df['NA Sales'].isna() &
                 games_df['PAL Sales'].isna() &
                 games_df['Japan Sales'].isna() &
                 games_df['Other Sales'].isna() &
                 games_df['Total Sales'].isna() &
                 games_df['Total Shipped'].isna()])

36795

In [20]:
# Determine how many games have some sales data
len(games_df) - len(games_df.loc[games_df['NA Sales'].isna() &
                    games_df['PAL Sales'].isna() &
                    games_df['Japan Sales'].isna() &
                    games_df['Other Sales'].isna() &
                    games_df['Total Sales'].isna() &
                    games_df['Total Shipped'].isna()])

22102

In [21]:
# Eliminate games without sales data
cleaned_df = games_df.loc[games_df['NA Sales'].notna() |
                          games_df['PAL Sales'].notna() |
                          games_df['Japan Sales'].notna() |
                          games_df['Other Sales'].notna() |
                          games_df['Total Sales'].notna() |
                          games_df['Total Shipped'].notna()]
len(cleaned_df)

22102

In [22]:
cleaned_df

Unnamed: 0,Pos,Game,Game.1,Console,Publisher,Developer,VGChartz Score,Critic Score,User Score,Total Shipped,Total Sales,NA Sales,PAL Sales,Japan Sales,Other Sales,Release Date,Last Update,Genre
0,1,,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00m,,,,,,22nd Mar 05,04th Mar 20,Action
1,2,,Warriors,Series,KOEI,Omega Force,,,,45.26m,,,,,,30th Jun 97,24th Mar 20,Action
2,3,,Devil May Cry,Series,Capcom,Capcom,,,,22.00m,,,,,,16th Oct 01,03rd Feb 20,Action
3,4,,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00m,,,,,,,24th Mar 20,Action
4,5,,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,17th Sep 13,,Action
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,232,,"Nora, Princess, and Stray Cat",NS,Harukaze,Harukaze,,,,,0.00m,,,0.00m,,25th Oct 18,07th Nov 18,Visual Novel
58689,233,,Memories Off: Innocent File,NS,5pb,5pb. Games,,,,,0.00m,,,0.00m,,25th Oct 18,07th Nov 18,Visual Novel
58690,234,,Enkan no Memoria: Kakera Tomoshi,PSV,Dramatic Create,A'sRing,,,,,0.00m,,,0.00m,,29th Mar 18,07th Apr 18,Visual Novel
58691,235,,Disorder 6,X360,5pb,5pb. Games,,,,,0.00m,,,0.00m,,22nd Aug 13,02nd May 19,Visual Novel


# Delete unwanted columns

In [23]:
cleaned_df.columns

Index(['Pos', 'Game', 'Game.1', 'Console', 'Publisher', 'Developer',
       'VGChartz Score', 'Critic Score', 'User Score', 'Total Shipped',
       'Total Sales', 'NA Sales', 'PAL Sales', 'Japan Sales', 'Other Sales',
       'Release Date', 'Last Update', 'Genre'],
      dtype='object')

In [24]:
# Check if data exists in Game column
len(cleaned_df.loc[cleaned_df['Game'].notna()])

0

In [25]:
# Removing game and last update columns
cleaned_df = cleaned_df[['Pos',
                         'Game.1',
                         'Console',
                         'Publisher',
                         'Developer',
                         'VGChartz Score',
                         'Critic Score',
                         'User Score',
                         'Total Shipped',
                         'Total Sales',
                         'NA Sales',
                         'PAL Sales',
                         'Japan Sales',
                         'Other Sales',
                         'Release Date',
                         'Genre']]

In [26]:
cleaned_df.columns

Index(['Pos', 'Game.1', 'Console', 'Publisher', 'Developer', 'VGChartz Score',
       'Critic Score', 'User Score', 'Total Shipped', 'Total Sales',
       'NA Sales', 'PAL Sales', 'Japan Sales', 'Other Sales', 'Release Date',
       'Genre'],
      dtype='object')

# Rename columns

In [27]:
# Rename columns
renamed_df = cleaned_df.rename(columns={
    'Pos' : 'rank', 
    'Game.1' : 'game_name',
    'Console' : 'console',
    'Publisher' : 'publisher',
    'Developer' : 'developer',
    'VGChartz Score' : 'vgchartz_score',
    'Critic Score' : 'critic_score',
    'User Score' : 'user_score',
    'Total Shipped' : 'total_shipped',
    'Total Sales' : 'global_sales',
    'NA Sales' : 'na_sales',
    'PAL Sales' : 'pal_sales',
    'Japan Sales' : 'japan_sales',
    'Other Sales' : 'other_sales',
    'Release Date' : 'release_date',
    'Genre' : 'genre'
})
renamed_df

Unnamed: 0,rank,game_name,console,publisher,developer,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,japan_sales,other_sales,release_date,genre
0,1,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00m,,,,,,22nd Mar 05,Action
1,2,Warriors,Series,KOEI,Omega Force,,,,45.26m,,,,,,30th Jun 97,Action
2,3,Devil May Cry,Series,Capcom,Capcom,,,,22.00m,,,,,,16th Oct 01,Action
3,4,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00m,,,,,,,Action
4,5,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,17th Sep 13,Action
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,232,"Nora, Princess, and Stray Cat",NS,Harukaze,Harukaze,,,,,0.00m,,,0.00m,,25th Oct 18,Visual Novel
58689,233,Memories Off: Innocent File,NS,5pb,5pb. Games,,,,,0.00m,,,0.00m,,25th Oct 18,Visual Novel
58690,234,Enkan no Memoria: Kakera Tomoshi,PSV,Dramatic Create,A'sRing,,,,,0.00m,,,0.00m,,29th Mar 18,Visual Novel
58691,235,Disorder 6,X360,5pb,5pb. Games,,,,,0.00m,,,0.00m,,22nd Aug 13,Visual Novel


# Add Release Year Column

In [28]:
# Add release year column

renamed_df['release_year'] = np.NaN

current_year = int(datetime.now().strftime('%y'))

for index, row in renamed_df.loc[renamed_df['release_date'].notna()].iterrows():
    release_dt = row[14]
    year = release_dt.split(' ')[2]
    if int(year) <= current_year:
        release_year = '20' + year
    else:
        release_year = '19' + year
    renamed_df.loc[index, 'release_year'] = release_year

renamed_df

Unnamed: 0,rank,game_name,console,publisher,developer,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,japan_sales,other_sales,release_date,genre,release_year
0,1,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00m,,,,,,22nd Mar 05,Action,2005
1,2,Warriors,Series,KOEI,Omega Force,,,,45.26m,,,,,,30th Jun 97,Action,1997
2,3,Devil May Cry,Series,Capcom,Capcom,,,,22.00m,,,,,,16th Oct 01,Action,2001
3,4,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00m,,,,,,,Action,
4,5,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,17th Sep 13,Action,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,232,"Nora, Princess, and Stray Cat",NS,Harukaze,Harukaze,,,,,0.00m,,,0.00m,,25th Oct 18,Visual Novel,2018
58689,233,Memories Off: Innocent File,NS,5pb,5pb. Games,,,,,0.00m,,,0.00m,,25th Oct 18,Visual Novel,2018
58690,234,Enkan no Memoria: Kakera Tomoshi,PSV,Dramatic Create,A'sRing,,,,,0.00m,,,0.00m,,29th Mar 18,Visual Novel,2018
58691,235,Disorder 6,X360,5pb,5pb. Games,,,,,0.00m,,,0.00m,,22nd Aug 13,Visual Novel,2013


In [29]:
# Check the release year data type
renamed_df.dtypes

rank                int64
game_name          object
console            object
publisher          object
developer          object
vgchartz_score    float64
critic_score      float64
user_score        float64
total_shipped      object
global_sales       object
na_sales           object
pal_sales          object
japan_sales        object
other_sales        object
release_date       object
genre              object
release_year       object
dtype: object

In [30]:
# Convert release year to float data type
renamed_df['release_year'] = renamed_df['release_year'].astype('float64')

In [31]:
# Convert release year to Int data type
renamed_df['release_year'] = renamed_df['release_year'].astype('Int32')

In [32]:
# Confirm release year Int
renamed_df.dtypes

rank                int64
game_name          object
console            object
publisher          object
developer          object
vgchartz_score    float64
critic_score      float64
user_score        float64
total_shipped      object
global_sales       object
na_sales           object
pal_sales          object
japan_sales        object
other_sales        object
release_date       object
genre              object
release_year        Int32
dtype: object

In [33]:
# Check the release year column values
renamed_df

Unnamed: 0,rank,game_name,console,publisher,developer,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,japan_sales,other_sales,release_date,genre,release_year
0,1,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00m,,,,,,22nd Mar 05,Action,2005
1,2,Warriors,Series,KOEI,Omega Force,,,,45.26m,,,,,,30th Jun 97,Action,1997
2,3,Devil May Cry,Series,Capcom,Capcom,,,,22.00m,,,,,,16th Oct 01,Action,2001
3,4,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00m,,,,,,,Action,
4,5,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,17th Sep 13,Action,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,232,"Nora, Princess, and Stray Cat",NS,Harukaze,Harukaze,,,,,0.00m,,,0.00m,,25th Oct 18,Visual Novel,2018
58689,233,Memories Off: Innocent File,NS,5pb,5pb. Games,,,,,0.00m,,,0.00m,,25th Oct 18,Visual Novel,2018
58690,234,Enkan no Memoria: Kakera Tomoshi,PSV,Dramatic Create,A'sRing,,,,,0.00m,,,0.00m,,29th Mar 18,Visual Novel,2018
58691,235,Disorder 6,X360,5pb,5pb. Games,,,,,0.00m,,,0.00m,,22nd Aug 13,Visual Novel,2013


# Convert Release date values to Date format (yyyy-mm-dd)

In [34]:
# Convert release date values to date format
for index, row in renamed_df.loc[renamed_df['release_date'].notna()].iterrows():
    release_dt = row[14]
    release_date = release_dt[:2]+release_dt[4:]
    formatted_release_date = datetime.strptime(release_date, '%d %b %y').strftime('%Y-%m-%d')
    renamed_df.loc[index, 'release_date'] = formatted_release_date

renamed_df

Unnamed: 0,rank,game_name,console,publisher,developer,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,japan_sales,other_sales,release_date,genre,release_year
0,1,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00m,,,,,,2005-03-22,Action,2005
1,2,Warriors,Series,KOEI,Omega Force,,,,45.26m,,,,,,1997-06-30,Action,1997
2,3,Devil May Cry,Series,Capcom,Capcom,,,,22.00m,,,,,,2001-10-16,Action,2001
3,4,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00m,,,,,,,Action,
4,5,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,2013-09-17,Action,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,232,"Nora, Princess, and Stray Cat",NS,Harukaze,Harukaze,,,,,0.00m,,,0.00m,,2018-10-25,Visual Novel,2018
58689,233,Memories Off: Innocent File,NS,5pb,5pb. Games,,,,,0.00m,,,0.00m,,2018-10-25,Visual Novel,2018
58690,234,Enkan no Memoria: Kakera Tomoshi,PSV,Dramatic Create,A'sRing,,,,,0.00m,,,0.00m,,2018-03-29,Visual Novel,2018
58691,235,Disorder 6,X360,5pb,5pb. Games,,,,,0.00m,,,0.00m,,2013-08-22,Visual Novel,2013


In [35]:
# Convert the release date data type to datetime64
renamed_df['release_date'] = renamed_df['release_date'].astype('datetime64[ns]')

In [36]:
# Confirm the release date data type to datetime64
renamed_df.dtypes

rank                       int64
game_name                 object
console                   object
publisher                 object
developer                 object
vgchartz_score           float64
critic_score             float64
user_score               float64
total_shipped             object
global_sales              object
na_sales                  object
pal_sales                 object
japan_sales               object
other_sales               object
release_date      datetime64[ns]
genre                     object
release_year               Int32
dtype: object

In [38]:
# Check the release date column values
renamed_df

Unnamed: 0,rank,game_name,console,publisher,developer,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,japan_sales,other_sales,release_date,genre,release_year
0,1,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00m,,,,,,2005-03-22,Action,2005
1,2,Warriors,Series,KOEI,Omega Force,,,,45.26m,,,,,,1997-06-30,Action,1997
2,3,Devil May Cry,Series,Capcom,Capcom,,,,22.00m,,,,,,2001-10-16,Action,2001
3,4,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00m,,,,,,NaT,Action,
4,5,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,2013-09-17,Action,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,232,"Nora, Princess, and Stray Cat",NS,Harukaze,Harukaze,,,,,0.00m,,,0.00m,,2018-10-25,Visual Novel,2018
58689,233,Memories Off: Innocent File,NS,5pb,5pb. Games,,,,,0.00m,,,0.00m,,2018-10-25,Visual Novel,2018
58690,234,Enkan no Memoria: Kakera Tomoshi,PSV,Dramatic Create,A'sRing,,,,,0.00m,,,0.00m,,2018-03-29,Visual Novel,2018
58691,235,Disorder 6,X360,5pb,5pb. Games,,,,,0.00m,,,0.00m,,2013-08-22,Visual Novel,2013


# Convert Sales values to Float format

In [39]:
# strip out character m from total_shipped column
for index, row in renamed_df.loc[renamed_df['total_shipped'].notna()].iterrows():
    total_shipped = row[8]
    renamed_df.loc[index, 'total_shipped'] = total_shipped[:-1]

renamed_df

Unnamed: 0,rank,game_name,console,publisher,developer,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,japan_sales,other_sales,release_date,genre,release_year
0,1,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00,,,,,,2005-03-22,Action,2005
1,2,Warriors,Series,KOEI,Omega Force,,,,45.26,,,,,,1997-06-30,Action,1997
2,3,Devil May Cry,Series,Capcom,Capcom,,,,22.00,,,,,,2001-10-16,Action,2001
3,4,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00,,,,,,NaT,Action,
4,5,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32m,6.37m,9.85m,0.99m,3.12m,2013-09-17,Action,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,232,"Nora, Princess, and Stray Cat",NS,Harukaze,Harukaze,,,,,0.00m,,,0.00m,,2018-10-25,Visual Novel,2018
58689,233,Memories Off: Innocent File,NS,5pb,5pb. Games,,,,,0.00m,,,0.00m,,2018-10-25,Visual Novel,2018
58690,234,Enkan no Memoria: Kakera Tomoshi,PSV,Dramatic Create,A'sRing,,,,,0.00m,,,0.00m,,2018-03-29,Visual Novel,2018
58691,235,Disorder 6,X360,5pb,5pb. Games,,,,,0.00m,,,0.00m,,2013-08-22,Visual Novel,2013


In [40]:
# strip out character m from global_sales column
for index, row in renamed_df.loc[renamed_df['global_sales'].notna()].iterrows():
    global_sales = row[9]
    renamed_df.loc[index, 'global_sales'] = global_sales[:-1]

renamed_df

Unnamed: 0,rank,game_name,console,publisher,developer,vgchartz_score,critic_score,user_score,total_shipped,global_sales,na_sales,pal_sales,japan_sales,other_sales,release_date,genre,release_year
0,1,God of War,Series,Sony Computer Entertainment,SIE Santa Monica Studio,,,,51.00,,,,,,2005-03-22,Action,2005
1,2,Warriors,Series,KOEI,Omega Force,,,,45.26,,,,,,1997-06-30,Action,1997
2,3,Devil May Cry,Series,Capcom,Capcom,,,,22.00,,,,,,2001-10-16,Action,2001
3,4,Dynasty Warriors,Series,Unknown,Omega Force,,,,21.00,,,,,,NaT,Action,
4,5,Grand Theft Auto V,PS3,Rockstar Games,Rockstar North,,9.4,,,20.32,6.37m,9.85m,0.99m,3.12m,2013-09-17,Action,2013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58688,232,"Nora, Princess, and Stray Cat",NS,Harukaze,Harukaze,,,,,0.00,,,0.00m,,2018-10-25,Visual Novel,2018
58689,233,Memories Off: Innocent File,NS,5pb,5pb. Games,,,,,0.00,,,0.00m,,2018-10-25,Visual Novel,2018
58690,234,Enkan no Memoria: Kakera Tomoshi,PSV,Dramatic Create,A'sRing,,,,,0.00,,,0.00m,,2018-03-29,Visual Novel,2018
58691,235,Disorder 6,X360,5pb,5pb. Games,,,,,0.00,,,0.00m,,2013-08-22,Visual Novel,2013
