In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import graphviz
import seaborn as sns

from sklearn import svm 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer

In [2]:
# Load the App Store games export, discard the columns this analysis does not
# use, and replace missing values so no column contains NaN afterwards.
# NOTE(review): the displayed rows show "Languages" holding strings, so filling
# it with the integer 0 yields a mixed-type column -- confirm this is intended.
dataset = (
    pd.read_csv("./appstore_games.csv")
    .drop(columns=["URL", "Icon URL", "Subtitle", "ID", "Age Rating"])
    .fillna(
        {
            "Name": "None",
            "In-app Purchases": 0,
            "Average User Rating": 0,
            "User Rating Count": 0,
            "Price": 0,
            "Languages": 0,
            "Size": 0,
        }
    )
)
dataset.head()

Unnamed: 0,Name,Average User Rating,User Rating Count,Price,In-app Purchases,Description,Developer,Languages,Size,Primary Genre,Genres,Original Release Date,Current Version Release Date
0,Sudoku,4.0,3553.0,2.99,0,"Join over 21,000,000 of our fans and download ...",Mighty Mighty Good Games,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",15853568.0,Games,"Games, Strategy, Puzzle",11/07/2008,30/05/2017
1,Reversi,3.5,284.0,1.99,0,"The classic game of Reversi, also known as Oth...",Kiss The Machine,EN,12328960.0,Games,"Games, Strategy, Board",11/07/2008,17/05/2018
2,Morocco,3.0,8376.0,0.0,0,Play the classic strategy game Othello (also k...,Bayou Games,EN,674816.0,Games,"Games, Board, Strategy",11/07/2008,5/09/2017
3,Sudoku (Free),3.5,190394.0,0.0,0,"Top 100 free app for over a year.\nRated ""Best...",Mighty Mighty Good Games,"DA, NL, EN, FI, FR, DE, IT, JA, KO, NB, PL, PT...",21552128.0,Games,"Games, Strategy, Puzzle",23/07/2008,30/05/2017
4,Senet Deluxe,3.5,28.0,2.99,0,"""Senet Deluxe - The Ancient Game of Life and A...",RoGame Software,"DA, NL, EN, FR, DE, EL, IT, JA, KO, NO, PT, RU...",34689024.0,Games,"Games, Strategy, Board, Education",18/07/2008,22/07/2018


In [3]:
# Separate the independent (feature) columns from the dependent (target) column.
# The target is the number of user ratings each app received.
independent_data = dataset.loc[:, ["Price", "Average User Rating", "Name", "Description", "Developer", "Size", "Primary Genre"]]
X = independent_data.values
y = dataset["User Rating Count"].values

# Boolean masks over the feature columns: float64 columns are treated as
# numerical, every other column (the text columns) as categorical.
numerical_features = independent_data.dtypes == "float64"
categorical_features = ~numerical_features

# Hold out 30% of the rows for evaluation. The fixed seed makes the split --
# and therefore every downstream score -- reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Column-wise preprocessing:
#   * categorical columns are one-hot encoded. handle_unknown="ignore" is
#     required because free-text columns such as Name and Description contain
#     values in the held-out rows that never occur in the training rows; the
#     default ("error") makes transform raise on them.
#   * numerical columns are imputed (SimpleImputer defaults) and standardised.
preprocess = make_column_transformer(
    (OneHotEncoder(handle_unknown="ignore"), categorical_features),
    (make_pipeline(SimpleImputer(), StandardScaler()), numerical_features),
)

In [4]:
# Sanity check: report, per column, whether any missing value remains.
dataset.isna().any()

Name                            False
Average User Rating             False
User Rating Count               False
Price                           False
In-app Purchases                False
Description                     False
Developer                       False
Languages                       False
Size                            False
Primary Genre                   False
Genres                          False
Original Release Date           False
Current Version Release Date    False
dtype: bool

In [None]:
# Define the random forest classifier: 800 trees, trained on all CPU cores,
# with a fixed seed so repeated runs build the same forest.
# NOTE(review): y is the "User Rating Count" column, a count-valued target; a
# classifier treats every distinct count as its own class -- confirm that a
# regressor is not what was intended here.
classifier = RandomForestClassifier(n_estimators=800, n_jobs=-1, random_state=42)

# Chain the preprocessing step and the classifier into a single pipeline.
model = make_pipeline(preprocess, classifier)

# Held-out rows contain category values that never occur in the training rows;
# make the one-hot step encode them as all zeros instead of raising when the
# test set is scored. (No-op if the encoder is already configured this way.)
model.set_params(columntransformer__onehotencoder__handle_unknown="ignore")

# Fit the model to the training set.
model.fit(X_train, y_train)

# Report accuracy on the training rows and on the held-out rows. The test score
# must be computed on X_test / y_test, otherwise it just repeats the training
# score and says nothing about generalisation.
print("Training set Score: ", model.score(X_train, y_train))
print("Testing set Score: ", model.score(X_test, y_test))

Training set Score:  1.0
