In [67]:
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import preprocessing

In [68]:
df = pd.read_csv('Cleaned Data 2.csv')
df

Unnamed: 0,Name,Year_of_Release,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Developer,Rating
0,.hack//Infection Part 1,2002,Role-Playing,Atari,0.49,0.38,0.26,0.13,1.27,75,35,8.5,60,CyberConnect2,T
1,.hack//Mutation Part 2,2002,Role-Playing,Atari,0.23,0.18,0.20,0.06,0.68,76,24,8.9,81,CyberConnect2,T
2,.hack//Outbreak Part 3,2002,Role-Playing,Atari,0.14,0.11,0.17,0.04,0.46,70,23,8.7,19,CyberConnect2,T
3,[Prototype],2009,Action,Activision,0.84,0.35,0.00,0.12,1.31,78,83,7.8,356,Radical Entertainment,M
4,[Prototype],2009,Action,Activision,0.65,0.40,0.00,0.19,1.24,79,53,7.7,308,Radical Entertainment,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6889,Zubo,2008,Misc,Electronic Arts,0.08,0.02,0.00,0.01,0.11,75,19,7.6,75,EA Bright Light,E10+
6890,Zumba Fitness,2010,Sports,505 Games,1.74,0.45,0.00,0.18,2.37,42,10,5.5,16,"Pipeworks Software, Inc.",E
6891,Zumba Fitness: World Party,2013,Misc,Majesco Entertainment,0.17,0.05,0.00,0.02,0.24,73,5,6.2,40,Zoe Mode,E
6892,Zumba Fitness Core,2012,Misc,505 Games,0.00,0.05,0.00,0.00,0.05,77,6,6.7,6,Zoe Mode,E10+


In [69]:
## This shows sales based on genre. It's interesting because each region seems to have different preferences. Action as a genre
## is the overall popular genre based on the data. In both NA and EU, Action is the number one most sold. However in Japan, 
## RPGs are the most popular. Having this information can help publisher know what game would be worth marketing and where. Can
## also help teams come up with realistic sales goals based on regions.

In [70]:
fig = px.histogram(x=df["Genre"], y=df["Global_Sales"])
fig.show()

In [71]:
fig = px.histogram(x=df["Genre"], y=df["NA_Sales"])
fig.show()

In [72]:
fig = px.histogram(x=df["Genre"], y=df["EU_Sales"])
fig.show()

In [73]:
fig = px.histogram(x=df["Genre"], y=df["JP_Sales"])
fig.show()

In [74]:
## Below Histograms shows how well each game rating sells. No surprise but games rated E for Everyone has the highest sales
## likely due to them being appealing to all ages. Interestingly, M for Mature games are second. This is due to the sales in
## NA and EU regions. In JP region, M rated (or equivalent) games are not as popular likely due to the amount of censorship
## required for M rated games in JP.

In [75]:
fig = px.histogram(x=df["Rating"], y=df["Global_Sales"])
fig.show()

In [76]:
fig = px.histogram(x=df["Rating"], y=df["NA_Sales"])
fig.show()

In [77]:
fig = px.histogram(x=df["Rating"], y=df["EU_Sales"])
fig.show()

In [78]:
fig = px.histogram(x=df["Rating"], y=df["JP_Sales"])
fig.show()

In [79]:
fig = px.box(df, x="Genre", y="Critic_Score")
fig.show()

In [80]:
## This histogram uses "Year" for x and "Global_Sales" for y. Setting "Genre" as color also shows which genre begins to get 
## popular and when. After 2000, action games get a giant increase in popularity and starts to fade as the sales approach 
## 2015. Shooters begin getting popular after 2005 and kind of stays that way. 2006 shows a sports dominated year when it 
## comes to sales. Seeing these trends can let publishers know when and if it's smart to create a game in a specific genre

In [81]:
fig = px.histogram(x=df["Year_of_Release"], y=df["Global_Sales"], color=df["Genre"])
fig.show()

In [82]:
## A general pie graph to show a breakdown of this database's games sales. PS2 seems to have the highest impact during its time
## and possibly due to the competition at the time, there was basically only one choice. PS3 and Xbox 360 being back and forth
## probably made it the overall totals for each other smaller because both consoles were popular and direct competitors.

In [83]:
fig = px.pie(df, values='Global_Sales', names='Genre')
fig.show()

In [84]:
## Sorting the info into a treemap allows us to see which genres were most popular in which year. Action games dominated mostly,
## sports remains second except for a couple years where they take first (2006, 2009). Shooters saw a rise in popularity after
## 2004


In [85]:
fig = px.treemap(df, path=[px.Constant("all"), 'Year_of_Release', 'Genre'], values='Global_Sales')
fig.show()

In [86]:
## These scatterplots draw a correlation between critic scores and sales. All regions seem to have a positive correlation but a
## very weak one (between 15-24%). 

In [87]:
fig = px.scatter(x=df["Critic_Score"], y=df["Global_Sales"])
fig.show()

In [88]:
np.corrcoef(df['Critic_Score'], df['Global_Sales'])

array([[1.        , 0.23708297],
       [0.23708297, 1.        ]])

In [89]:
fig = px.scatter(x=df["Critic_Score"], y=df["NA_Sales"])
fig.show()

In [90]:
np.corrcoef(df['Critic_Score'], df['NA_Sales'])

array([[1.        , 0.23292211],
       [0.23292211, 1.        ]])

In [91]:
fig = px.scatter(x=df["Critic_Score"], y=df["EU_Sales"])
fig.show()

In [92]:
np.corrcoef(df['Critic_Score'], df['EU_Sales'])

array([[1.        , 0.21284289],
       [0.21284289, 1.        ]])

In [93]:
fig = px.scatter(x=df["Critic_Score"], y=df["JP_Sales"])
fig.show()

In [94]:
np.corrcoef(df['Critic_Score'], df['JP_Sales'])

array([[1.        , 0.14717766],
       [0.14717766, 1.        ]])

In [95]:
## Scatter plots below show how each markets effect Global Sales. NA and EU both have a 90+% correlation thus making a majority
## of global sales. JP only has about 60% correlation meaning those sales aren't as effective. Marketing teams can use this
## information to try to make a hard marketing push in both NA and EU because that seems to be where the majority of the customer
## base is.

In [96]:
fig = px.scatter(x=df["NA_Sales"], y=df["Global_Sales"])
fig.show()

In [97]:
np.corrcoef(df['NA_Sales'], df['Global_Sales'])

array([[1.        , 0.95578446],
       [0.95578446, 1.        ]])

In [98]:
fig = px.scatter(x=df["EU_Sales"], y=df["Global_Sales"])
fig.show()

In [99]:
np.corrcoef(df['EU_Sales'], df['Global_Sales'])

array([[1.       , 0.9392437],
       [0.9392437, 1.       ]])

In [100]:
fig = px.scatter(x=df["JP_Sales"], y=df["Global_Sales"])
fig.show()

In [101]:
np.corrcoef(df['JP_Sales'], df['Global_Sales'])

array([[1.        , 0.61333617],
       [0.61333617, 1.        ]])

In [102]:
## I decided to use the groupby function on both "platform" and "year". Doing so shows which platform has the most sales and can
## show publishers which consoles are the more preferred ones per console generation. Groupby for year allows us to see which
## years games are the most popular. This can show publishers games during a certain era were more popular for one reason or
## another. This could lead to publsihers looking to potentially remaster/remake certain games or emulate the formula for 
## a specific era of gaming.

In [103]:
df.groupby(by="Genre").sum()

Unnamed: 0_level_0,Year_of_Release,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Action,3301880,591.94,388.81,76.63,147.74,1205.65,111524,49546,11671.5,333279
Adventure,530091,38.9,25.64,8.74,8.13,81.52,17552,6489,1898.3,30378
Fighting,760705,136.39,60.51,27.91,25.14,250.02,26414,10946,2767.5,25199
Misc,775069,222.17,121.14,33.22,40.46,417.11,26022,9560,2640.4,12874
Platform,808555,193.6,108.56,43.09,32.53,377.8,28210,10794,2973.0,46180
Puzzle,236825,33.5,24.04,14.98,6.33,78.9,8342,3372,855.6,4066
Racing,1175729,226.41,165.29,28.55,58.45,478.61,40798,15103,4166.9,42962
Role-Playing,1435589,219.82,119.32,122.78,40.41,502.14,52069,23588,5449.0,236392
Shooter,1742644,448.87,261.38,18.57,87.96,816.92,61601,32166,6153.5,339758
Simulation,606022,92.24,67.88,26.9,17.24,204.25,21135,6991,2171.9,27780


In [104]:
df.groupby(by="Year_of_Release").sum()

Unnamed: 0_level_0,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count
Year_of_Release,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1985,0.0,0.03,0.0,0.01,0.03,59,9,5.8,19
1988,0.0,0.02,0.0,0.01,0.03,64,75,2.2,4572
1992,0.02,0.0,0.0,0.0,0.03,85,44,8.2,1796
1994,0.39,0.26,0.53,0.08,1.27,69,4,6.3,4
1996,8.05,6.98,4.06,1.26,20.35,719,91,67.2,4990
1997,15.46,8.75,9.75,2.09,36.02,1206,175,119.6,5384
1998,18.81,12.59,11.6,2.27,45.24,2120,319,221.3,4967
1999,23.32,15.69,9.67,2.45,51.17,2442,437,254.1,4262
2000,39.34,25.2,11.27,5.49,81.24,7362,1501,768.7,6284
2001,139.32,72.85,23.57,18.26,253.88,18618,4673,1944.5,15658


In [105]:
df.groupby(by="Critic_Score").sum()

Unnamed: 0_level_0,Year_of_Release,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Count,User_Score,User_Count
Critic_Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
13,2013,0.02,0.01,0.00,0.01,0.04,4,1.0,218
17,2009,0.06,0.03,0.00,0.01,0.10,11,1.7,37
19,12051,0.45,0.12,0.00,0.05,0.62,38,17.6,565
20,6029,1.25,0.45,0.00,0.17,1.87,23,6.7,123
21,2011,0.08,0.03,0.00,0.01,0.12,5,5.0,25
...,...,...,...,...,...,...,...,...,...
94,72189,68.54,36.24,9.08,12.65,126.44,1813,300.7,54489
95,32111,55.89,26.00,4.49,18.96,105.39,965,134.3,32828
96,36099,39.02,20.94,7.11,5.51,72.55,1005,159.3,42657
97,22076,50.68,36.25,4.09,11.11,102.11,598,90.4,18408


In [106]:
X = ## Logistic Regression

SyntaxError: invalid syntax (4174440220.py, line 1)

In [107]:
X = df['Global_Sales']
y = df['Critic_Score']

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3, random_state=101)
LogR_model = LogisticRegression()
LogR_model.fit(X_train, y_train)
LogR_predictions = LogR_model.predict(X_test)

ValueError: Expected 2D array, got 1D array instead:
array=[0.78 0.4  0.23 ... 0.61 0.11 0.75].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.