In [83]:
# Import our dependencies
import pandas as pd
import sklearn as skl
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [69]:
# Read in our ramen data
video_games_df = pd.read_csv("test_table_5.csv")
video_games_df = video_games_df.dropna()
video_games_df.head()

Unnamed: 0,title,platform,release-date,score,user_score,developer,genre,critics,users,franchise,global_sales
0,.hack//G.U. Last Recode,PlayStation4,3-Nov-17,76,8.1,CyberConnect2,"Miscellaneous,Compilation",21,64,hack,0.47
2,#IDARB,XboxOne,30-Jan-15,77,6.9,OtherOceanInteractive,"Action,General",31,88,#IDARB,0.09
3,007: Quantum of Solace,PlayStation3,4-Nov-08,65,6.6,Treyarch,"Action,General,Shooter,Shooter,First-Person,Mo...",42,59,007: Quantum of Solace,1.14
4,007: Quantum of Solace,PC,4-Nov-08,70,6.1,"Treyarch,Beenox","Action,General,Shooter,Shooter,First-Person,Mo...",18,64,007: Quantum of Solace,0.03
6,10 Second Ninja,PC,5-Mar-14,72,7.1,GameDesignDan,"Action,Platformer,2D",13,16,10 Second Ninja,0.02


In [70]:
# Print out value counts for columns with many unique values
developer_counts = video_games_df.developer.value_counts()
developer_counts

Codemasters                     33
UbisoftMontreal                 31
EASports                        26
EACanada                        24
Capcom                          22
                                ..
Aspyr                            1
EABrightLight                    1
IRGurus                          1
IoInteractive,NixxesSoftware     1
BlueFangGames                    1
Name: developer, Length: 411, dtype: int64

In [71]:
# Determine which values to replace
replace_devs = list(developer_counts[developer_counts < 20].index)

# Replace in DataFrame
for devs in replace_devs:
    video_games_df.developer = video_games_df.developer.replace(devs,"Other")


# Check to make sure binning was successful
video_games_df.developer.value_counts()

Other              1130
Codemasters          33
UbisoftMontreal      31
EASports             26
EACanada             24
Capcom               22
VisualConcepts       21
Name: developer, dtype: int64

In [72]:
# Generate our categorical variable lists
game_cat = video_games_df.dtypes[video_games_df.dtypes == "object"].index.tolist()
game_cat

['title', 'platform', 'release-date', 'developer', 'genre', 'franchise']

In [73]:
# Check the number of unique values in each column
video_games_df[game_cat].nunique()

title           951
platform          4
release-date    612
developer         7
genre           398
franchise       424
dtype: int64

In [74]:
# Create the OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(video_games_df[game_cat]))

# Rename encoded columns
encode_df.columns = enc.get_feature_names(game_cat)
encode_df.head()



Unnamed: 0,title_#IDARB,title_.hack//G.U. Last Recode,title_007: Quantum of Solace,title_10 Second Ninja,title_10 Second Ninja X,title_1701 A.D.,title_2010 FIFA World Cup South Africa,title_2014 FIFA World Cup Brazil,title_3D Dot Game Heroes,title_50 Cent: Blood on the Sand,...,franchise_World of Warcraft,franchise_Worldwide Soccer Manager,franchise_Worms,franchise_X,franchise_XCOM,franchise_Yakuza,franchise_You Don't Know Jack,franchise_Zoo Tycoon,franchise_echochrome,franchise_hack
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
# Merge one-hot encoded features and drop the originals
video_games_df = video_games_df.merge(encode_df,left_index=True, right_index=True)
video_games_df = video_games_df.drop(game_cat,1)
video_games_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,score,user_score,critics,users,global_sales,title_#IDARB,title_.hack//G.U. Last Recode,title_007: Quantum of Solace,title_10 Second Ninja,title_10 Second Ninja X,...,franchise_World of Warcraft,franchise_Worldwide Soccer Manager,franchise_Worms,franchise_X,franchise_XCOM,franchise_Yakuza,franchise_You Don't Know Jack,franchise_Zoo Tycoon,franchise_echochrome,franchise_hack
0,76,8.1,21,64,0.47,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,77,6.9,31,88,0.09,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,65,6.6,42,59,1.14,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,70,6.1,18,64,0.03,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,72,7.1,13,16,0.02,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
# Split our preprocessed data into our features and target arrays
y = video_games_df["score"].values
X = video_games_df.drop(["score"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

  This is separate from the ipykernel package so we can avoid doing imports until


In [77]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

### NEURAL NET

In [78]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 8)                 19208     
                                                                 
 dense_10 (Dense)            (None, 5)                 45        
                                                                 
 dense_11 (Dense)            (None, 1)                 6         
                                                                 
Total params: 19,259
Trainable params: 19,259
Non-trainable params: 0
_________________________________________________________________


In [79]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer="adam", metrics=["accuracy"])

In [80]:
# Train the model
fit_model = nn.fit(X_train,y_train,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [81]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

3/3 - 0s - loss: -1.7235e+05 - accuracy: 0.0000e+00 - 103ms/epoch - 34ms/step
Loss: -172350.171875, Accuracy: 0.0


### RANDOM FOREST 

In [84]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.147
