In [1]:
# Import our dependencies
import pandas as pd
import sklearn as skl
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read in our ramen data
video_games_df = pd.read_csv("finalClean_games.csv")
video_games_df = video_games_df.dropna()
video_games_df = video_games_df.drop(['Game Name'], axis=1)
video_games_df.head()

Unnamed: 0,Franchise,Platform,Release Year,Years Since,First Release Year,Main Genre,Tier by Score,Critics,Critic_Score,Users,User Score,Developer,Global Sales,Other Info
0,#IDARB,Xbox,2015,0,2015,Action,Tier 3,31,77,88,6.9,OtherOceanInteractive,0.09,"Action,General"
1,007: Quantum of Solace,PlayStation,2008,0,2008,Action,Tier 4,42,65,59,6.6,Treyarch,1.14,"Action,General,Shooter,Shooter,First-Person,Mo..."
2,007: Quantum of Solace,PC,2008,0,2008,Action,Tier 3,18,70,64,6.1,"Treyarch,Beenox",0.03,"Action,General,Shooter,Shooter,First-Person,Mo..."
4,10 Second Ninja,PC,2014,0,2014,Action,Tier 3,13,72,16,7.1,GameDesignDan,0.02,"Action,Platformer,2D"
5,10 Second Ninja,Xbox,2016,2,2014,Action,Tier 2,10,82,8,4.5,GameDesignDan,0.11,"Action,Platformer,2D"


In [3]:
# Print out value counts for columns with many unique values
developer_counts = video_games_df.Developer.value_counts()
developer_counts

Codemasters                     33
UbisoftMontreal                 31
EASports                        26
EACanada                        24
Capcom                          22
                                ..
IRGurus                          1
IoInteractive,NixxesSoftware     1
ClapHanz                         1
1C:MaddoxGames                   1
SCE/WWS,SCEJapanStudio           1
Name: Developer, Length: 411, dtype: int64

In [4]:
# Determine which values to replace
replace_devs = list(developer_counts[developer_counts < 10].index)

# Replace in DataFrame
for devs in replace_devs:
    video_games_df.Developer = video_games_df.Developer.replace(devs,"Other")


# Check to make sure binning was successful
video_games_df.Developer.value_counts()

Other                    853
Codemasters               33
UbisoftMontreal           31
EASports                  26
EACanada                  24
Capcom                    22
VisualConcepts            21
Konami                    19
Traveller'sTales          18
EATiburon                 17
BlizzardEntertainment     17
Maxis                     16
Ubisoft                   16
BioWare                   15
EADICE                    15
Techland                  14
TTGames                   14
CreativeAssembly          14
GearboxSoftware           13
Relic                     13
InfinityWard              13
Treyarch                  11
FiraxisGames              11
Yuke's                    11
OmegaForce                10
ElectronicArts            10
SportsInteractive         10
Name: Developer, dtype: int64

In [5]:
# Generate our categorical variable lists
game_cat = video_games_df.dtypes[video_games_df.dtypes == "object"].index.tolist()
game_cat

['Franchise',
 'Platform',
 'Main Genre',
 'Tier by Score',
 'Developer',
 'Other Info']

In [6]:
# Check the number of unique values in each column
video_games_df[game_cat].nunique()

Franchise        424
Platform           3
Main Genre        13
Tier by Score      4
Developer         27
Other Info       398
dtype: int64

In [7]:
# Create the OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(video_games_df[game_cat]))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(game_cat)
encode_df.head()



Unnamed: 0,Franchise_#IDARB,Franchise_007: Quantum of Solace,Franchise_10 Second Ninja,Franchise_1701 A.D.,Franchise_3D Dot Game Heroes,Franchise_50 Cent: Blood on the Sand,Franchise_A Plague Tale: Innocence,Franchise_Ace Combat,Franchise_Agatha Christie,Franchise_Age of Conan,...,"Other Info_Strategy,Turn-Based,Tactics","Other Info_Strategy,Turn-Based,Wargame,Turn-Based,Fantasy,Artillery","Other Info_Strategy,Tycoon,Management,Business / Tycoon","Other Info_Strategy,Tycoon,Management,Tycoon,Business / Tycoon","Other Info_Strategy,Tycoon,Management,Tycoon,Government","Other Info_Strategy,Wargame,Turn-Based","Other Info_Traditional,Sports,Team,Baseball,Sim","Other Info_Tycoon,Business / Tycoon,Strategy,Management,Government","Other Info_Tycoon,Strategy,Management,Tycoon,Government,Business / Tycoon","Other Info_Virtual Life,Simulation,Miscellaneous,Virtual,Virtual Life"
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# Merge one-hot encoded features and drop the originals
video_games_df = video_games_df.merge(encode_df,left_index=True, right_index=True)
video_games_df = video_games_df.drop(game_cat,1)
video_games_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Release Year,Years Since,First Release Year,Critics,Critic_Score,Users,User Score,Global Sales,Franchise_#IDARB,Franchise_007: Quantum of Solace,...,"Other Info_Strategy,Turn-Based,Tactics","Other Info_Strategy,Turn-Based,Wargame,Turn-Based,Fantasy,Artillery","Other Info_Strategy,Tycoon,Management,Business / Tycoon","Other Info_Strategy,Tycoon,Management,Tycoon,Business / Tycoon","Other Info_Strategy,Tycoon,Management,Tycoon,Government","Other Info_Strategy,Wargame,Turn-Based","Other Info_Traditional,Sports,Team,Baseball,Sim","Other Info_Tycoon,Business / Tycoon,Strategy,Management,Government","Other Info_Tycoon,Strategy,Management,Tycoon,Government,Business / Tycoon","Other Info_Virtual Life,Simulation,Miscellaneous,Virtual,Virtual Life"
0,2015,0,2015,31,77,88,6.9,0.09,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2008,0,2008,42,65,59,6.6,1.14,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2008,0,2008,18,70,64,6.1,0.03,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2014,0,2014,13,72,16,7.1,0.02,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,2016,2,2014,10,82,8,4.5,0.11,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Split our preprocessed data into our features and target arrays
y = video_games_df["Critic_Score"].values
X = video_games_df.drop(["Critic_Score"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### NEURAL NET

In [22]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="sigmoid")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="relu"))

# Check the structure of the model
nn.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 8)                 7016      
                                                                 
 dense_8 (Dense)             (None, 5)                 45        
                                                                 
 dense_9 (Dense)             (None, 1)                 6         
                                                                 
Total params: 7,067
Trainable params: 7,067
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer="adam", metrics=["accuracy"])

In [28]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=25)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [14]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: -1.2278e+06 - accuracy: 0.0000e+00 - 145ms/epoch - 48ms/step
Loss: -1227759.625, Accuracy: 0.0


### RANDOM FOREST 

In [15]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.117
