In [28]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [29]:
# Import our input dataset
file_path = Path('./Resources/vgsales-12-4-2019.csv')
vgsales_df = pd.read_csv(file_path)
vgsales_df.head()

Unnamed: 0,Rank,Name,basename,Genre,ESRB_Rating,Platform,Publisher,Developer,VGChartz_Score,Critic_Score,...,NA_Sales,PAL_Sales,JP_Sales,Other_Sales,Year,Last_Update,url,status,Vgchartzscore,img_url
0,1,Wii Sports,wii-sports,Sports,E,Wii,Nintendo,Nintendo EAD,,7.7,...,,,,,2006.0,,http://www.vgchartz.com/game/2667/wii-sports/?...,1,,/games/boxart/full_2258645AmericaFrontccc.jpg
1,2,Super Mario Bros.,super-mario-bros,Platform,,NES,Nintendo,Nintendo EAD,,10.0,...,,,,,1985.0,,http://www.vgchartz.com/game/6455/super-mario-...,1,,/games/boxart/8972270ccc.jpg
2,3,Mario Kart Wii,mario-kart-wii,Racing,E,Wii,Nintendo,Nintendo EAD,,8.2,...,,,,,2008.0,11th Apr 18,http://www.vgchartz.com/game/6968/mario-kart-w...,1,8.7,/games/boxart/full_8932480AmericaFrontccc.jpg
3,4,PlayerUnknown's Battlegrounds,playerunknowns-battlegrounds,Shooter,,PC,PUBG Corporation,PUBG Corporation,,,...,,,,,2017.0,13th Nov 18,http://www.vgchartz.com/game/215988/playerunkn...,1,,/games/boxart/full_8052843AmericaFrontccc.jpg
4,5,Wii Sports Resort,wii-sports-resort,Sports,E,Wii,Nintendo,Nintendo EAD,,8.0,...,,,,,2009.0,,http://www.vgchartz.com/game/24656/wii-sports-...,1,8.8,/games/boxart/full_7295041AmericaFrontccc.jpg


In [30]:
vgsales_df.columns.tolist()

['Rank',
 'Name',
 'basename',
 'Genre',
 'ESRB_Rating',
 'Platform',
 'Publisher',
 'Developer',
 'VGChartz_Score',
 'Critic_Score',
 'User_Score',
 'Total_Shipped',
 'Global_Sales',
 'NA_Sales',
 'PAL_Sales',
 'JP_Sales',
 'Other_Sales',
 'Year',
 'Last_Update',
 'url',
 'status',
 'Vgchartzscore',
 'img_url']

In [31]:
vgsales_df.dtypes

Rank                int64
Name               object
basename           object
Genre              object
ESRB_Rating        object
Platform           object
Publisher          object
Developer          object
VGChartz_Score    float64
Critic_Score      float64
User_Score        float64
Total_Shipped     float64
Global_Sales      float64
NA_Sales          float64
PAL_Sales         float64
JP_Sales          float64
Other_Sales       float64
Year              float64
Last_Update        object
url                object
status              int64
Vgchartzscore     float64
img_url            object
dtype: object

In [32]:
vgsales_df.count()

Rank              55792
Name              55792
basename          55792
Genre             55792
ESRB_Rating       23623
Platform          55792
Publisher         55792
Developer         55775
VGChartz_Score        0
Critic_Score       6536
User_Score          335
Total_Shipped      1827
Global_Sales      19415
NA_Sales          12964
PAL_Sales         13189
JP_Sales           7043
Other_Sales       15522
Year              54813
Last_Update        9186
url               55792
status            55792
Vgchartzscore       799
img_url           55792
dtype: int64

In [33]:
# Drop columns:
vg_df = vgsales_df.drop(['Rank', 'Last_Update', 'ESRB_Rating', 'Name',
                                   'status', 'VGChartz_Score','Vgchartzscore',
                                    'User_Score', 'Total_Shipped',
                                  'NA_Sales', 'PAL_Sales', 'JP_Sales', 'Other_Sales', 'url', 'img_url', 'basename'], axis = 1)
vg_df.head(5)

Unnamed: 0,Genre,Platform,Publisher,Developer,Critic_Score,Global_Sales,Year
0,Sports,Wii,Nintendo,Nintendo EAD,7.7,,2006.0
1,Platform,NES,Nintendo,Nintendo EAD,10.0,,1985.0
2,Racing,Wii,Nintendo,Nintendo EAD,8.2,,2008.0
3,Shooter,PC,PUBG Corporation,PUBG Corporation,,,2017.0
4,Sports,Wii,Nintendo,Nintendo EAD,8.0,,2009.0


In [34]:
vg_df.count()

Genre           55792
Platform        55792
Publisher       55792
Developer       55775
Critic_Score     6536
Global_Sales    19415
Year            54813
dtype: int64

In [35]:
# Find null value
for column in vg_df.columns:
    print(f'Column {column} has {vg_df[column].isnull().sum()} null values')

Column Genre has 0 null values
Column Platform has 0 null values
Column Publisher has 0 null values
Column Developer has 17 null values
Column Critic_Score has 49256 null values
Column Global_Sales has 36377 null values
Column Year has 979 null values


In [36]:
vg_df = vg_df.dropna()

In [37]:
# Find null value
for column in vg_df.columns:
    print(f'Column {column} has {vg_df[column].isnull().sum()} null values')

Column Genre has 0 null values
Column Platform has 0 null values
Column Publisher has 0 null values
Column Developer has 0 null values
Column Critic_Score has 0 null values
Column Global_Sales has 0 null values
Column Year has 0 null values


In [38]:
vg_df.shape

(4273, 7)

In [39]:
vg_df['Successful'] = np.where(vg_df['Critic_Score'] >= 7.0, int('0'), int('1'))
vg_df.head()

Unnamed: 0,Genre,Platform,Publisher,Developer,Critic_Score,Global_Sales,Year,Successful
19,Action,PS3,Rockstar Games,Rockstar North,9.4,20.32,2013.0,0
20,Action,PS4,Rockstar Games,Rockstar North,9.7,19.39,2014.0,0
30,Action,PS2,Rockstar Games,Rockstar North,9.6,16.15,2002.0,0
40,Shooter,X360,Activision,Infinity Ward,8.7,14.82,2011.0,0
41,Shooter,X360,Activision,Treyarch,8.8,14.74,2010.0,0


In [40]:
vg_df.dtypes

Genre            object
Platform         object
Publisher        object
Developer        object
Critic_Score    float64
Global_Sales    float64
Year            float64
Successful        int32
dtype: object

### Encode text lables into numerical values

In [41]:
vg_binary_encoded = pd.get_dummies(vg_df, columns=['Genre', 'Platform', 'Publisher', 'Developer'])
vg_binary_encoded.head()

Unnamed: 0,Critic_Score,Global_Sales,Year,Successful,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Board Game,Genre_Education,Genre_Fighting,...,Developer_id Software,Developer_id Software / Raven Software,Developer_imageepoch Inc.,Developer_n-Space,Developer_neo Software,Developer_skip Ltd.,Developer_syn Sophia,Developer_tri-Ace,Developer_tri-Crescendo / Monolith Soft,Developer_zSlide
19,9.4,20.32,2013.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,9.7,19.39,2014.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,9.6,16.15,2002.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40,8.7,14.82,2011.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41,8.8,14.74,2010.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# Scale and standardize data: mean is ), std dev is 1
from sklearn.preprocessing import StandardScaler
data_scaler = StandardScaler()

In [43]:
# Train the scaler and transform the data
vg_data_scaled = data_scaler.fit_transform(vg_binary_encoded)

In [44]:
# Preview the scaled data
vg_data_scaled[:5]

array([[ 1.57668101, 14.14151079,  1.21775094, ..., -0.05080303,
        -0.02163965, -0.01529975],
       [ 1.7863523 , 13.46928577,  1.42759284, ..., -0.05080303,
        -0.02163965, -0.01529975],
       [ 1.71646187, 11.12734055, -1.09051003, ..., -0.05080303,
        -0.02163965, -0.01529975],
       [ 1.08744799, 10.16598649,  0.79806712, ..., -0.05080303,
        -0.02163965, -0.01529975],
       [ 1.15733842, 10.10816068,  0.58822522, ..., -0.05080303,
        -0.02163965, -0.01529975]])

In [45]:
# Verify that the mean of each column is 0 and its standard deviation is 1:
print(np.mean(vg_data_scaled[:,0]))
print(np.std(vg_data_scaled[:,0]))

4.522996118107822e-16
1.0


In [46]:
#### Mean of 1st column -3.552436144726694e-17 approximates 0
#### Standard deviation is 1.0
#### Standardization is successful

In [47]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [48]:
# Define the features set.
X = vg_binary_encoded.copy()
X = X.drop("Successful", axis=1)
X.head()

Unnamed: 0,Critic_Score,Global_Sales,Year,Genre_Action,Genre_Action-Adventure,Genre_Adventure,Genre_Board Game,Genre_Education,Genre_Fighting,Genre_MMO,...,Developer_id Software,Developer_id Software / Raven Software,Developer_imageepoch Inc.,Developer_n-Space,Developer_neo Software,Developer_skip Ltd.,Developer_syn Sophia,Developer_tri-Ace,Developer_tri-Crescendo / Monolith Soft,Developer_zSlide
19,9.4,20.32,2013.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
20,9.7,19.39,2014.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
30,9.6,16.15,2002.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
40,8.7,14.82,2011.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
41,8.8,14.74,2010.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Define the target set.
y = vg_binary_encoded["Successful"].ravel()
y[:5]

array([0, 0, 0, 0, 0])

In [50]:
# Split the Data Into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [51]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [52]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78) 

In [53]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [54]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([1, 0, 0, ..., 0, 1, 0])

In [137]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
acc_score

1.0

In [55]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,685,0
Actual 1,2,382


In [56]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,685,0
Actual 1,2,382


Accuracy Score : 0.9981290926099158
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       685
           1       1.00      0.99      1.00       384

    accuracy                           1.00      1069
   macro avg       1.00      1.00      1.00      1069
weighted avg       1.00      1.00      1.00      1069



In [57]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([6.37533365e-01, 5.94903602e-02, 2.61384342e-02, ...,
       1.76731573e-04, 1.51519276e-05, 2.90930747e-05])

In [58]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.6375333650901405, 'Critic_Score'),
 (0.05949036017511359, 'Global_Sales'),
 (0.026138434184672942, 'Year'),
 (0.006300027754266618, 'Genre_Action'),
 (0.004431077133123038, 'Platform_PC'),
 (0.004169132432435017, 'Publisher_EA Sports'),
 (0.003660093044932404, 'Platform_GBA'),
 (0.0034127213885418507, 'Platform_PSP'),
 (0.003371477585373622, 'Platform_X360'),
 (0.003308445796074598, 'Platform_Wii'),
 (0.0032983110850718245, 'Genre_Platform'),
 (0.003151371507586761, 'Publisher_Ubisoft'),
 (0.0030427951689630827, 'Genre_Shooter'),
 (0.002845165829230767, 'Publisher_Sega'),
 (0.002641674085499974, 'Genre_Role-Playing'),
 (0.0026381066006092497, 'Genre_Sports'),
 (0.002597234380016112, 'Platform_DS'),
 (0.002593032430425101, 'Platform_PS3'),
 (0.0025640512911133287, 'Publisher_Activision'),
 (0.0025623369954838973, 'Developer_EA Tiburon'),
 (0.0024682309313941537, 'Genre_Racing'),
 (0.0023774647440872737, 'Publisher_Sony Computer Entertainment'),
 (0.0023642616528903137, 'Publisher_El