In [6]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import tensorflow as tf

#  Import and read the charity_data.csv.
import pandas as pd 
vid_game_df = pd.read_csv("Resources/merged_data_4.csv")
vid_game_df.head()

Unnamed: 0,Title,Release Date,Team,Rating,Number of Reviews,Plays,Playing,rank,platform,genre,publisher,na_sales,eu_sales,jp_sales,other_sales,global_sales
0,Elden Ring,"Feb 25, 2022","['Bandai Namco Entertainment', 'FromSoftware']",4.5,3.9K,17K,3.8K,,,,,,,,,
1,Hades,"Dec 10, 2019",['Supergiant Games'],4.3,2.9K,21K,3.2K,,,,,,,,,
2,The Legend of Zelda: Breath of the Wild,"Mar 03, 2017","['Nintendo', 'Nintendo EPD Production Group No...",4.4,4.3K,30K,2.5K,,,,,,,,,
3,Undertale,"Sep 15, 2015","['tobyfox', '8-4']",4.2,3.5K,28K,679,,,,,,,,,
4,Hollow Knight,"Feb 24, 2017",['Team Cherry'],4.4,3K,21K,2.4K,,,,,,,,,


In [7]:
# Determine the number of unique values in each column.
unique_vals= vid_game_df.nunique()
print(unique_vals)

Title                12146
Release Date           972
Team                   757
Rating                  35
Number of Reviews      600
Plays                  255
Playing                394
rank                 11493
platform                31
genre                   12
publisher              567
na_sales               387
eu_sales               286
jp_sales               242
other_sales            146
global_sales           588
dtype: int64


<h2 style="font-size: 24px;">Using a Logistic Regression to Predict Video Game Success</h2>

In [32]:
# Define Success
vid_game_df["Success"] = (vid_game_df["Rating"] > 4.0).astype(int)

# Print first 5 rows
print(vid_game_df.head())

                                     Title  Release Date  \
0                               Elden Ring  Feb 25, 2022   
1                                    Hades  Dec 10, 2019   
2  The Legend of Zelda: Breath of the Wild  Mar 03, 2017   
3                                Undertale  Sep 15, 2015   
4                            Hollow Knight  Feb 24, 2017   

                                                Team  Rating  \
0     ['Bandai Namco Entertainment', 'FromSoftware']     4.5   
1                               ['Supergiant Games']     4.3   
2  ['Nintendo', 'Nintendo EPD Production Group No...     4.4   
3                                 ['tobyfox', '8-4']     4.2   
4                                    ['Team Cherry']     4.4   

  Number of Reviews Plays Playing  rank platform genre publisher  na_sales  \
0              3.9K   17K    3.8K   NaN      NaN   NaN       NaN       NaN   
1              2.9K   21K    3.2K   NaN      NaN   NaN       NaN       NaN   
2              4.3K 

In [9]:
# How much missing data do we have?
nan_counts= vid_game_df.isna().sum()

print(nan_counts)

Title                    0
Release Date         11047
Team                 11048
Rating               11060
Number of Reviews    11047
Plays                11047
Playing              11047
rank                   641
platform               641
genre                  641
publisher              692
na_sales               641
eu_sales               641
jp_sales               641
other_sales            641
global_sales           641
Success                  0
dtype: int64


In [10]:
# Is this statistically significant?
row_count= vid_game_df.shape[0]
print(row_count)

12146


In [12]:
# Row count percentage of nan values relative to absolute number of rows
row_percent= (nan_counts / row_count) * 100
print(row_percent)


Title                 0.000000
Release Date         90.951754
Team                 90.959987
Rating               91.058785
Number of Reviews    90.951754
Plays                90.951754
Playing              90.951754
rank                  5.277458
platform              5.277458
genre                 5.277458
publisher             5.697349
na_sales              5.277458
eu_sales              5.277458
jp_sales              5.277458
other_sales           5.277458
global_sales          5.277458
Success               0.000000
dtype: float64


In [33]:
# Clean data to predict 'Rating' based on 'Success', 'Number of Reviews', 'Plays', and 'Playing'
clean_rating= vid_game_df.drop(columns=['rank', 'platform', 'genre', 'publisher', 'na_sales', 'eu_sales', 'jp_sales', 'other_sales', 'global_sales'])
clean_rating.head()
clean_rating.tail()

Unnamed: 0,Title,Release Date,Team,Rating,Number of Reviews,Plays,Playing,Success
12141,Chou Ezaru wa Akai Hana: Koi wa Tsuki ni Shiru...,,,,,,,0
12142,Eiyuu Densetsu: Sora no Kiseki Material Collec...,,,,,,,0
12143,Plushees,,,,,,,0
12144,Woody Woodpecker in Crazy Castle 5,,,,,,,0
12145,Know How 2,,,,,,,0


In [17]:
# Drop the rows with missing data, see what the total row count is afterwards
cleaned_df= clean_rating.dropna()
print(cleaned_df.shape)

(1085, 8)


In [22]:
# To try the logisitic regression, I will end up having to drop
# the 'Title', 'Release Date', and 'Team' as logisitic regressions cannot compute categorical data
cleaned_df=cleaned_df.drop(columns=['Title'])
cleaned_df=cleaned_df.drop(columns=['Release Date', 'Team'])
cleaned_df.head()

Unnamed: 0,Rating,Number of Reviews,Plays,Playing,Success
0,4.5,3.9K,17K,3.8K,1
1,4.3,2.9K,21K,3.2K,1
2,4.4,4.3K,30K,2.5K,1
3,4.2,3.5K,28K,679,1
4,4.4,3K,21K,2.4K,1


In [24]:
# Convert everything to an integer
cleaned_df['Plays'] = cleaned_df['Plays'].apply(lambda x: int(float(x[:-1]) * 1000) if 'K' in x else int(x))
cleaned_df['Playing'] = cleaned_df['Playing'].apply(lambda x: int(float(x[:-1]) * 1000) if 'K' in x else int(x))
cleaned_df['Number of Reviews'] = cleaned_df['Number of Reviews'].apply(lambda x: int(float(x[:-1]) * 1000) if 'K' in x else int(x))
cleaned_df.head()

Unnamed: 0,Rating,Number of Reviews,Plays,Playing,Success
0,4.5,3900,17000,3800,1
1,4.3,2900,21000,3200,1
2,4.4,4300,30000,2500,1
3,4.2,3500,28000,679,1
4,4.4,3000,21000,2400,1


In [18]:
# import additional necessary libraries
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [25]:
# Define X and y
X = cleaned_df.drop('Success', axis=1)
y = cleaned_df['Success']

# Split the preprocessed data into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create logisitic regression model
logreg = LogisticRegression()

# Fit the model using the training data
logreg.fit(X_train, y_train)

# Make predicitions using the testing data
y_pred= logreg.predict(X_test)

# Evaluate the performance of teh model using the accuracy score
accuracy= accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9969325153374233


<h2 style="font-size: 24px;">Using Decision Tree To Predict Video Game Success</h2>

In [26]:
# import additional libraries
from sklearn.tree import DecisionTreeClassifier

In [34]:
# Drop title from cleaned_df since it is a string and cannot be used.
tree_clean= clean_rating.drop('Title', axis=1)

In [36]:
# Looks like I need to drop 'Release Date' too.
tree_clean.drop('Release Date', axis=1, inplace=True)

In [38]:
# And 'Team' it seems
tree_clean.drop('Team', axis=1, inplace=True)

In [44]:
# Drop NaN datapoints
tree_clean.dropna(inplace=True)
print(tree_clean.shape)

(1086, 5)


In [45]:
# Don't forget to convert everything to an integer
tree_clean['Plays'] = tree_clean['Plays'].apply(lambda x: int(float(x[:-1]) * 1000) if 'K' in x else int(x))
tree_clean['Playing'] = tree_clean['Playing'].apply(lambda x: int(float(x[:-1]) * 1000) if 'K' in x else int(x))
tree_clean['Number of Reviews'] = tree_clean['Number of Reviews'].apply(lambda x: int(float(x[:-1]) * 1000) if 'K' in x else int(x))
tree_clean.head()

Unnamed: 0,Rating,Number of Reviews,Plays,Playing,Success
0,4.5,3900,17000,3800,1
1,4.3,2900,21000,3200,1
2,4.4,4300,30000,2500,1
3,4.2,3500,28000,679,1
4,4.4,3000,21000,2400,1


In [46]:
# Separate target variable and the features
X = tree_clean.drop('Success', axis=1)
y = tree_clean['Success']

# Split the data inot training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the decision tree
dtc= DecisionTreeClassifier()

# Train the model on the training data
dtc.fit(X_train, y_train)

# Predict on the test data
y_pred = dtc.predict(X_test)

# Evaluate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0
