In [77]:
import plotly.express as px
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing

In [9]:
df = pd.read_csv('video_games_sales.csv')
df

Unnamed: 0,rank,name,platform,year,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [None]:
## Below will show sales based on platforms. This can show publishers which consoles are the most popular. Based on the 
## histogram for platform and global sales, PlayStation (PS) platforms are significantly more popular than Xbox (XB, X360, 
## XOne) with the exception of X360.

In [83]:
fig = px.histogram(x=df["platform"], y=df["global_sales"])
fig.show()

In [None]:
## This shows sales based on region. It's interesting because each region seems to have different preferences. In the US, Xbox
## is fairly popular whereas in Japan, it's almost nonexistant. Japan also has a fondness for the Nintendo DS whereas the DS is
## not nearly as popular in the other regions. 

In [13]:
fig = px.histogram(x=df["platform"], y=df["na_sales"])
fig.show()

In [14]:
fig = px.histogram(x=df["platform"], y=df["eu_sales"])
fig.show()

In [15]:
fig = px.histogram(x=df["platform"], y=df["jp_sales"])
fig.show()

In [58]:
fig = px.histogram(x=df["genre"], y=df["global_sales"])
fig.show()

In [None]:
## This histogram uses year for x and global_sales for y. Using "plaform" as color allows us to see gaming trends based on 
## platform. For example, PlayStation seems to be the preferred console for most generations. However, comparing PS3 vs 360 
## shows that 360 starts off stronger but as the years progress, PS3 sales start to catch up after a certain point. What could 
## make this graph easier to navigate would be choosing different colors for some platforms that are similar in tone. 

In [60]:
fig = px.histogram(x=df["year"], y=df["global_sales"], color=df["platform"])
fig.show()

In [None]:
## A general pie graph to show a breakdown of this database's games sales. PS2 seems to have the highest impact during its time
## and possibly due to the competition at the time, there was basically only one choice. PS3 and Xbox 360 being back and forth
## probably made it the overall totals for each other smaller because both consoles were popular and direct competitors.

In [34]:
fig = px.pie(df, values='global_sales', names='platform')
fig.show()

In [None]:
## Sorting all these objects as treemaps allows us to see which genres are the most popular on each console. This can show 
## publishers which types of games do well on which platform. For example, on the Xbox 360, there is a big emphasis on shooters
## compared to other consoles. A publisher who does mostly shooters may want to focus marketing towards the Xbox brand because 
## it might be overshadowed on a different platform where shooters are not as popular.


In [53]:
fig = px.treemap(df, path=[px.Constant("all"), 'platform', 'genre'], values='global_sales')
fig.show()

In [None]:
## The following scatter plots were chosen to see how each region effects the overall global sales. Using corrcoef, it is 
## seen that na_sales has the highest correlation meaning na_sales has the most effect on global sales. It can be seen based
## on how closely all the plots form a line. 94% is a significantly strong correlation.

In [90]:
fig = px.scatter(x=df["na_sales"], y=df["global_sales"])
fig.show()

In [65]:
np.corrcoef(df['na_sales'], df['global_sales'])

array([[1.        , 0.94104736],
       [0.94104736, 1.        ]])

In [91]:
fig = px.scatter(x=df["eu_sales"], y=df["global_sales"])
fig.show()

In [92]:
np.corrcoef(df['eu_sales'], df['global_sales'])

array([[1.        , 0.90283581],
       [0.90283581, 1.        ]])

In [93]:
fig = px.scatter(x=df["jp_sales"], y=df["global_sales"])
fig.show()

In [67]:
np.corrcoef(df['jp_sales'], df['global_sales'])

array([[1.        , 0.61181552],
       [0.61181552, 1.        ]])

In [68]:
## I decided to use the groupby function on both "platform" and "year". Doing so shows which platform has the most sales and can
## show publishers which consoles are the more preferred ones per console generation. Groupby for year allows us to see which
## years games are the most popular. This can show publishers games during a certain era were more popular for one reason or
## another. This could lead to publsihers looking to potentially remaster/remake certain games or emulate the formula for 
## a specific era of gaming.

In [36]:
df.groupby(by="platform").sum()

Unnamed: 0_level_0,rank,year,na_sales,eu_sales,jp_sales,other_sales,global_sales
platform,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2600,585665,229928.0,90.6,5.47,0.0,0.91,97.08
3DO,43118,5984.0,0.0,0.0,0.1,0.0,0.1
3DS,4662644,1006531.0,78.87,58.52,97.35,12.63,247.46
DC,456114,103997.0,5.43,1.69,8.56,0.27,15.97
DS,20845831,4283493.0,390.71,194.65,175.57,60.53,822.49
GB,332419,193608.0,114.32,47.82,85.12,8.2,255.45
GBA,7136749,1624604.0,187.54,75.25,47.33,7.73,318.5
GC,4817401,1085843.0,133.46,38.71,21.58,5.18,199.36
GEN,190019,53812.0,19.27,5.52,2.67,0.89,28.36
GG,13527,1992.0,0.0,0.0,0.04,0.0,0.04


In [37]:
df.groupby(by="year").sum()

Unnamed: 0_level_0,rank,na_sales,eu_sales,jp_sales,other_sales,global_sales
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1980.0,29826,10.59,0.67,0.0,0.12,11.38
1981.0,190488,33.4,1.96,0.0,0.32,35.77
1982.0,149186,26.92,1.65,0.0,0.31,28.86
1983.0,56759,7.76,0.8,8.1,0.14,16.79
1984.0,22911,33.28,2.1,14.27,0.7,50.36
1985.0,55505,33.73,4.74,14.56,0.92,53.94
1986.0,35986,12.5,2.84,19.81,1.93,37.07
1987.0,54701,8.46,1.41,11.63,0.2,21.74
1988.0,37181,23.87,6.59,15.76,0.99,47.22
1989.0,40156,45.15,8.44,18.36,1.5,73.45


In [69]:
df.groupby(by="genre").sum()

Unnamed: 0_level_0,rank,year,na_sales,eu_sales,jp_sales,other_sales,global_sales
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Action,26441383,6531731.0,877.83,525.0,159.95,187.38,1751.18
Adventure,14831165,2562375.0,105.8,64.13,52.07,16.81,239.04
Fighting,6484242,1675871.0,223.59,101.32,87.35,36.68,448.91
Misc,14889052,3432412.0,410.24,215.98,107.76,75.32,809.96
Platform,6137545,1755347.0,447.05,201.63,130.77,51.59,831.37
Puzzle,5603136,1144994.0,123.78,50.78,57.31,12.55,244.95
Racing,9943933,2457934.0,359.42,238.39,56.69,77.27,732.04
Role-Playing,12032228,2952379.0,327.28,188.06,352.31,59.61,927.37
Shooter,9653872,2571588.0,582.6,313.27,38.28,102.69,1037.37
Simulation,7478816,1707589.0,183.31,113.38,63.7,31.52,392.2


In [70]:
## Logistic Regression

In [72]:
X = df['platform']
y = df['global_sales']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=101)
LogR_model = LogisticRegression(max_iter=10000)
LogR_model.fit(X_train, y_train)
LogR_predictions = LogR_model.predict(X_test)

ValueError: could not convert string to float: 'DS'