In [1]:
# -- SQL_Connection to local Postgres SQL Database

# Resources for connection help:
# -- https://medium.com/analytics-vidhya/postgresql-integration-with-jupyter-notebook-deb97579a38d
# -- https://overiq.com/sqlalchemy-101/installing-sqlalchemy-and-connecting-to-database/

# Load ipython-sql to use and store SQL commands in notebook
%load_ext sql

In [2]:
# Import dependencies 
import pandas as pd 
import numpy as np 
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from path import Path 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.preprocessing import StandardScaler 

# Import connection string from config file
from config import db_password
from config import port_number 
from config import db_name

In [3]:
# Connect sqlalchemy to database using connection string

db_string = f"postgres://postgres:{db_password}@localhost:{port_number}/{db_name}"

# -- Alternate db_string for engine connection, update host and DB connection as needed. 
# db_string = f"postgres://postgres:{db_password}@127.0.0.1:54642/SteamDB"

engine = create_engine(db_string)

print(engine)

Engine(postgres://postgres:***@localhost:5432/steam_db)


In [4]:
# Create Game Genre dataframe from SQL table 

genre_df = pd.read_sql('SELECT * FROM gamegenres', engine)

genre_df.head()

Unnamed: 0,game_name,percent_positive_reviews,genre,action,adventure,casual,design_and_illustration,early_access,free_to_play,indie,massively_multiplayer,rpg,racing,simulation,sports,strategy,utilities
0,COUNTER-STRIKE,97.0,Action,True,False,False,False,False,False,False,False,False,False,False,False,False,False
1,TEAM FORTRESS CLASSIC,84.0,Action,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2,DAY OF DEFEAT,90.0,Action,True,False,False,False,False,False,False,False,False,False,False,False,False,False
3,DEATHMATCH CLASSIC,83.0,Action,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,HALF-LIFE: OPPOSING FORCE,95.0,Action,True,False,False,False,False,False,False,False,False,False,False,False,False,False


In [5]:
# Define bins as 0 to 80 for unpopular, 80 - 100 as popular
bins = [0,75,100]

In [6]:
# 0 = unpopular, 1 = popular
group_names = [0,1]

In [7]:
# Cut the data based on the percent_positive_reviews and bin them into a column named classification
genre_df['classification'] = pd.cut(genre_df['percent_positive_reviews'], bins, labels = group_names)

In [8]:
# 1977 games were classified as unpopular and 4210 games were classified as popular
genre_df['classification'].value_counts()

1    4210
0    1977
Name: classification, dtype: int64

In [9]:
#drop the columns that have strings
genre_df = genre_df.drop(['game_name', 'percent_positive_reviews', 'genre'], axis = 1)

In [10]:
## Separate the Features (X) from the Target (y)
y = genre_df['classification']
X = genre_df.drop(columns='classification')

In [11]:
 ## Split our data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(4640, 14)

In [12]:
## Create a Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [13]:
## Fit (train) or model using the training data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

In [14]:
 ## Make predictions
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,0
1,1,0
2,1,1
3,1,1
4,1,1
5,1,0
6,1,1
7,1,1
8,1,1
9,1,1


In [15]:
# Predict Accuracy Score
print(accuracy_score(y_test, y_pred))

0.7045895281189399


## The logistic regression model gave us an accuracy score of about 70%. 

In [16]:
#confusion_matrix
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[  64  430]
 [  27 1026]]


In [17]:
#classification_report
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.70      0.13      0.22       494
           1       0.70      0.97      0.82      1053

    accuracy                           0.70      1547
   macro avg       0.70      0.55      0.52      1547
weighted avg       0.70      0.70      0.63      1547



## According to the classification report, the precision of the machine learning model was roughly 70% for predicting both popular games and unpopular games via genre.

## The next test we decided to run was the Random Forest Model.

In [18]:
# Define the features set
X = genre_df.copy()
X = X.drop("classification", axis=1)
X.head()

Unnamed: 0,action,adventure,casual,design_and_illustration,early_access,free_to_play,indie,massively_multiplayer,rpg,racing,simulation,sports,strategy,utilities
0,True,False,False,False,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False,False,False,False


In [19]:
# Define the target set
y = genre_df["classification"].values
y[:5]

[1, 1, 1, 1, 1]
Categories (2, int64): [0 < 1]

In [20]:
# Split the Data into training and testing Sets 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [21]:
# Determine the shape of our training and testing sets
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4640, 14)
(1547, 14)
(4640,)
(1547,)


In [22]:
# Splitting into train and test sets into an 80/20 split
X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y, random_state=78, train_size=0.80)

In [23]:
# Determine the shape of our training and testing sets
print(X_train2.shape)
print(X_test2.shape)
print(y_train2.shape)
print(y_test2.shape)


(4949, 14)
(1238, 14)
(4949,)
(1238,)


In [24]:
# Creating a StandardScaler instance
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [25]:
from sklearn import tree
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [26]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)
predictions

array([1, 1, 1, ..., 1, 1, 1])

In [27]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,111,381
Actual 1,121,934


In [28]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.6755009696186167

In [29]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,111,381
Actual 1,121,934


Accuracy Score : 0.6755009696186167
Classification Report
              precision    recall  f1-score   support

           0       0.48      0.23      0.31       492
           1       0.71      0.89      0.79      1055

    accuracy                           0.68      1547
   macro avg       0.59      0.56      0.55      1547
weighted avg       0.64      0.68      0.64      1547

