In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

Reading and Cleaning data

In [28]:
df = pd.read_csv("./Data/steam.csv")
df

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27070,1065230,Room of Pandora,2019-04-24,1,SHEN JIAWEI,SHEN JIAWEI,windows,0,Single-player;Steam Achievements,Adventure;Casual;Indie,Adventure;Indie;Casual,7,3,0,0,0,0-20000,2.09
27071,1065570,Cyber Gun,2019-04-23,1,Semyon Maximov,BekkerDev Studio,windows,0,Single-player,Action;Adventure;Indie,Action;Indie;Adventure,0,8,1,0,0,0-20000,1.69
27072,1065650,Super Star Blast,2019-04-24,1,EntwicklerX,EntwicklerX,windows,0,Single-player;Multi-player;Co-op;Shared/Split ...,Action;Casual;Indie,Action;Indie;Casual,24,0,1,0,0,0-20000,3.99
27073,1066700,New Yankee 7: Deer Hunters,2019-04-17,1,Yustas Game Studio,Alawar Entertainment,windows;mac,0,Single-player;Steam Cloud,Adventure;Casual;Indie,Indie;Casual;Adventure,0,2,0,0,0,0-20000,5.19


Checking if there are any null value in the coulumn But since there isn't any null value there is no need to clean it yet

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27075 entries, 0 to 27074
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   appid             27075 non-null  int64  
 1   name              27075 non-null  object 
 2   release_date      27075 non-null  object 
 3   english           27075 non-null  int64  
 4   developer         27075 non-null  object 
 5   publisher         27075 non-null  object 
 6   platforms         27075 non-null  object 
 7   required_age      27075 non-null  int64  
 8   categories        27075 non-null  object 
 9   genres            27075 non-null  object 
 10  steamspy_tags     27075 non-null  object 
 11  achievements      27075 non-null  int64  
 12  positive_ratings  27075 non-null  int64  
 13  negative_ratings  27075 non-null  int64  
 14  average_playtime  27075 non-null  int64  
 15  median_playtime   27075 non-null  int64  
 16  owners            27075 non-null  object

Changing the datatype of the release date from string to Datetime

In [30]:
df['release_date'] = pd.to_datetime(df['release_date'])

In [31]:
df.nunique()

appid               27075
name                27033
release_date         2619
english                 2
developer           17113
publisher           14354
platforms               7
required_age            6
categories           3333
genres               1552
steamspy_tags        6423
achievements          410
positive_ratings     2800
negative_ratings     1492
average_playtime     1345
median_playtime      1312
owners                 13
price                 282
dtype: int64

Plotting pie chart to see most popular genre

In [63]:
genres_set = set()
for i in df.genres.str.split(';'):
    genres_set.update(i)
genres_set
d = dict()
genre_sets = df.genres.str.split(';').apply(set)
for genre in genres_set:
    d[genre] = genre_sets.apply(lambda row: genre in row)
df = df.assign(**d)
genres_count = df[genres_set].sum()
genres_count = genres_count.sort_values(ascending=False)

percentage = genres_count / len(df)

data = {'genres': genres_count.index, 'percentage': percentage}
df_pie = pd.DataFrame(data)

# Create the pie chart using Plotly
fig = px.pie(df_pie, values='percentage', names='genres')
fig.show()


Passing a set as an indexer is deprecated and will raise in a future version. Use a list instead.



In [75]:
age_dict = dict(df.required_age.value_counts())
labels = ['rated','unrated']
sizes = [age_dict[18] + age_dict[16] + age_dict[12] + age_dict[7] + age_dict[3], age_dict.get(0)]
explode = (0.1, 0, 0.1, 0.05, 0.2)  # explode 1st slice

fig = go.Figure(data=[go.Pie(labels=labels, values=sizes, textinfo='label+percent', hole=0.3, 
                             pull=explode, marker=dict(colors=['gold', 'lightskyblue']))])

fig.update_layout(title='Age Rating of Games')
fig.show()
age_dict = dict(df.required_age.value_counts())
labels = ['18+', '16+', '12+', '7+', '3+']
sizes = [age_dict.get(18, 0), age_dict.get(16, 0), age_dict.get(12, 0), age_dict.get(7, 0), age_dict.get(3, 0)]
explode = (0.1, 0, 0.1, 0.05, 0.2) 

fig2 = go.Figure(data=[go.Pie(labels=labels, values=sizes, textinfo='label+percent', hole=0.3, 
                             pull=explode, marker=dict(colors=['gold', 'lightskyblue']))])

fig2.update_layout(title='Age Rating of Games with Rating')
fig2.show()

Plotting top 50 publisher with the most game developed

In [66]:
top=50
fig = px.bar(df['developer'].value_counts().iloc[:top], orientation='h')

# Set the title
fig.update_layout(title=f'Top {top} developers with most games')

# Show the bar chart
fig.show()

In [78]:
yearly = df.groupby(df.release_date.dt.year.rename('release_year')).agg('count').appid.rename('count')

sep_year = 2018

yearly_part = yearly[yearly.index <= sep_year]

fig = go.Figure(data=[go.Bar(x=yearly_part.index, y=yearly_part, marker_color='lightskyblue')])

fig.update_layout(
    title=f'Total games released in {sep_year}-2019 years',
    xaxis=dict(
        title='Year',
        tickmode='linear',
        tick0=min(yearly_part.index),
        dtick=1
    ),
    yaxis=dict(
        title='Total games'
    )
)

fig.show()