In [174]:
# Relative path to the cleaned popular-tags CSV that the notebook reads below
file_to_load = "../resources/Popular_Tags.csv"

In [175]:
# Import our Dependencies 
import pandas as pd 
import numpy as np 

In [176]:
# Read the cleaned CSV into a DataFrame, then preview its first five rows
popular_tags = pd.read_csv(filepath_or_buffer=file_to_load)

popular_tags.head(n=5)

Unnamed: 0,name,percent_positive_reviews,popular_tags,Tag_1980s,Tag_1990s,Tag_2_5D,Tag_2D,Tag_2D_Fighter,Tag_3D,Tag_3D_Platformer,...,Video_Production,Violent,Visual_Novel,Voxel,Walking_Simulator,War,Wargame,Web_Publishing,World_War_II,Zombies
0,COUNTER-STRIKE,97.0,"Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...",1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TEAM FORTRESS CLASSIC,84.0,"Action,FPS,Multiplayer,Classic,Shooter,Team-Ba...",0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,DAY OF DEFEAT,90.0,"FPS,World War II,Multiplayer,Shooter,Action,Wa...",0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,DEATHMATCH CLASSIC,83.0,"Action,FPS,Classic,Multiplayer,Shooter,First-P...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HALF-LIFE: OPPOSING FORCE,95.0,"FPS,Action,Classic,Sci-fi,Singleplayer,Shooter...",0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## For this dataset, I decided to build a classification model. I first used the `pd.cut` method to see how best to classify the `percent_positive_reviews` column. After running `value_counts()`, I noticed that the data was significantly skewed. I then resorted to using `pd.qcut` instead, in which pandas automatically cuts the data into quantiles containing roughly equal numbers of rows. The commented cells below show the method I was trying with the regular `pd.cut` bins, and after that I used `pd.qcut`. 

In [177]:
# Bin edges for pd.cut: (0, 75] is labelled unpopular, (75, 100] popular
bins = [0, 75, 100]

In [178]:
# Class labels, one per bin in ascending order: 0 = unpopular, 1 = popular
group_names = [0, 1]

In [179]:
# Bin percent_positive_reviews into the `classification` label column using the
# edges in `bins` and the labels in `group_names`.
# pd.cut builds right-closed intervals and, by default, leaves the lowest edge
# open, so a game with exactly 0% positive reviews would fall outside every bin
# and receive NaN instead of the "unpopular" label. include_lowest=True closes
# the first interval on the left so that value is labelled like any other.
popular_tags['classification'] = pd.cut(
    popular_tags['percent_positive_reviews'],
    bins,
    labels=group_names,
    include_lowest=True,
)

In [180]:
# Preview the frame now that the classification column has been appended
popular_tags.head(n=5)

Unnamed: 0,name,percent_positive_reviews,popular_tags,Tag_1980s,Tag_1990s,Tag_2_5D,Tag_2D,Tag_2D_Fighter,Tag_3D,Tag_3D_Platformer,...,Violent,Visual_Novel,Voxel,Walking_Simulator,War,Wargame,Web_Publishing,World_War_II,Zombies,classification
0,COUNTER-STRIKE,97.0,"Action,FPS,Multiplayer,Shooter,Classic,Team-Ba...",1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,TEAM FORTRESS CLASSIC,84.0,"Action,FPS,Multiplayer,Classic,Shooter,Team-Ba...",0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,DAY OF DEFEAT,90.0,"FPS,World War II,Multiplayer,Shooter,Action,Wa...",0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
3,DEATHMATCH CLASSIC,83.0,"Action,FPS,Classic,Multiplayer,Shooter,First-P...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,HALF-LIFE: OPPOSING FORCE,95.0,"FPS,Action,Classic,Sci-fi,Singleplayer,Shooter...",0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [181]:
# Class balance check: 4224 games fall in the popular class (1) and
# 1989 in the unpopular class (0)
popular_tags["classification"].value_counts()

1    4224
0    1989
Name: classification, dtype: int64

In [182]:
# Remove the columns that must not be used as features: the two string columns
# (name, popular_tags) and percent_positive_reviews, which the classification
# label was derived from
non_feature_columns = ['name', 'percent_positive_reviews', 'popular_tags']
popular_tags = popular_tags.drop(columns=non_feature_columns)

In [183]:
# Preview the remaining tag columns plus the classification label
popular_tags.head(n=5)

Unnamed: 0,Tag_1980s,Tag_1990s,Tag_2_5D,Tag_2D,Tag_2D_Fighter,Tag_3D,Tag_3D_Platformer,Tag_4_Player_Local,Tag_4X,Abstract,...,Violent,Visual_Novel,Voxel,Walking_Simulator,War,Wargame,Web_Publishing,World_War_II,Zombies,classification
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [184]:
## Separate the Features (X) from the Target (y)

In [185]:
# Features: every column except the label. Target: the binned classification.
X = popular_tags.drop('classification', axis=1)
y = popular_tags['classification']

In [186]:
# Confirm the target kept the same class counts after the split from X
y.value_counts(normalize=False)

1    4224
0    1989
Name: classification, dtype: int64

In [187]:
 ## Split our data into training and testing

In [188]:
from sklearn.model_selection import train_test_split

# Stratify on y so both splits keep the same class proportions; the fixed
# random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

(4659, 224)

In [189]:
 ## Create a Logistic Regression Model

In [190]:
from sklearn.linear_model import LogisticRegression
# Logistic regression using the lbfgs solver, capped at 200 iterations,
# with a fixed seed
classifier = LogisticRegression(random_state=1, max_iter=200, solver='lbfgs')

In [191]:
 ## Fit (train) our model using the training data

In [192]:
# Train the classifier on the training split only
classifier.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=200, random_state=1)

In [193]:
 ## Make predictions

In [194]:
# Predict labels for the held-out rows and line them up against the true labels
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
# Replace the shuffled original row labels with a clean 0..n-1 index
results = results.reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,1
1,1,1
2,1,1
3,1,0
4,1,1
5,0,1
6,0,1
7,0,1
8,1,1
9,1,0


In [195]:
# Predict Accuracy Score
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the actual label
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)

0.7265122265122265


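## The logistic regression model gave us an accuracy score of about 73%. For context, always predicting the majority class (popular) would already score about 68% (4224 of 6213 games), so the model improves only modestly on that baseline. 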