In [3]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings(action='ignore')

In [4]:
train_df = pd.read_csv('Video_games_esrb_rating.csv')
test_df = pd.read_csv('test_esrb.csv')

In [5]:
train_df

Unnamed: 0,title,console,alcohol_reference,animated_blood,blood,blood_and_gore,cartoon_violence,crude_humor,drug_reference,fantasy_violence,intense_violence,language,lyrics,mature_humor,mild_blood,mild_cartoon_violence,mild_fantasy_violence,mild_language,mild_lyrics,mild_suggestive_themes,mild_violence,no_descriptors,nudity,partial_nudity,sexual_content,sexual_themes,simulated_gambling,strong_janguage,strong_sexual_content,suggestive_themes,use_of_alcohol,use_of_drugs_and_alcohol,violence,esrb_rating
0,Monster Jam Steel Titans 2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,E
1,Subnautica: Below Zero,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,ET
2,NIER REPLICANT VER.1.22474487139…,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,M
3,Jamestown+,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,ET
4,Neptunia Virtual Stars,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,T
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1890,SENRAN KAGURA Peach Beach Splash,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,M
1891,Sneaky Bears,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,T
1892,SPARC,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,E
1893,Still Time,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,T


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1895 entries, 0 to 1894
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   title                     1895 non-null   object
 1   console                   1895 non-null   int64 
 2   alcohol_reference         1895 non-null   int64 
 3   animated_blood            1895 non-null   int64 
 4   blood                     1895 non-null   int64 
 5   blood_and_gore            1895 non-null   int64 
 6   cartoon_violence          1895 non-null   int64 
 7   crude_humor               1895 non-null   int64 
 8   drug_reference            1895 non-null   int64 
 9   fantasy_violence          1895 non-null   int64 
 10  intense_violence          1895 non-null   int64 
 11  language                  1895 non-null   int64 
 12  lyrics                    1895 non-null   int64 
 13  mature_humor              1895 non-null   int64 
 14  mild_blood              

In [7]:
train_df.isnull().sum()

title                       0
console                     0
alcohol_reference           0
animated_blood              0
blood                       0
blood_and_gore              0
cartoon_violence            0
crude_humor                 0
drug_reference              0
fantasy_violence            0
intense_violence            0
language                    0
lyrics                      0
mature_humor                0
mild_blood                  0
mild_cartoon_violence       0
mild_fantasy_violence       0
mild_language               0
mild_lyrics                 0
mild_suggestive_themes      0
mild_violence               0
no_descriptors              0
nudity                      0
partial_nudity              0
sexual_content              0
sexual_themes               0
simulated_gambling          0
strong_janguage             0
strong_sexual_content       0
suggestive_themes           0
use_of_alcohol              0
use_of_drugs_and_alcohol    0
violence                    0
esrb_ratin

In [8]:
# Preprocessing

In [9]:
def preprocess_inputs(df, scaler, split='train'):
    df = df.copy()
    
    # Drop title column
    df = df.drop('title', axis=1)
    
    # Shuffle the data
    df = df.sample(frac=1.0, random_state=1).reset_index(drop=True)
    
    # Split df into X and y
    y = df['esrb_rating']
    X = df.drop('esrb_rating', axis=1)
    
    # Scale X
    if split == 'train':
        scaler.fit(X)
    X = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)
    
    return X, y

In [10]:
scaler = StandardScaler()

X_train, y_train = preprocess_inputs(train_df, scaler=scaler, split='train')
X_test, y_test = preprocess_inputs(test_df, scaler=scaler, split='test')

In [11]:
X_train

Unnamed: 0,console,alcohol_reference,animated_blood,blood,blood_and_gore,cartoon_violence,crude_humor,drug_reference,fantasy_violence,intense_violence,language,lyrics,mature_humor,mild_blood,mild_cartoon_violence,mild_fantasy_violence,mild_language,mild_lyrics,mild_suggestive_themes,mild_violence,no_descriptors,nudity,partial_nudity,sexual_content,sexual_themes,simulated_gambling,strong_janguage,strong_sexual_content,suggestive_themes,use_of_alcohol,use_of_drugs_and_alcohol,violence
0,1.050342,-0.229734,-0.100638,-0.543400,2.632275,-0.141117,-0.239745,-0.189961,-0.531983,-0.36613,-0.348284,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,-0.223297,-0.452443,-0.122464,-0.115624,-0.188465,-0.247043,-0.268016,-0.366130,-0.192924,-0.365203,-0.12683,-0.126830,-0.261166
1,1.050342,-0.229734,-0.100638,-0.543400,-0.379900,-0.141117,4.171098,-0.189961,-0.531983,-0.36613,-0.348284,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,-0.223297,-0.452443,-0.122464,-0.115624,-0.188465,-0.247043,3.731120,-0.366130,-0.192924,-0.365203,-0.12683,-0.126830,-0.261166
2,-0.952071,-0.229734,-0.100638,1.840265,-0.379900,-0.141117,-0.239745,-0.189961,-0.531983,-0.36613,-0.348284,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,-0.223297,-0.452443,-0.122464,-0.115624,-0.188465,-0.247043,-0.268016,-0.366130,-0.192924,-0.365203,-0.12683,-0.126830,-0.261166
3,1.050342,-0.229734,-0.100638,-0.543400,-0.379900,-0.141117,-0.239745,-0.189961,-0.531983,-0.36613,-0.348284,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,-0.223297,-0.452443,-0.122464,-0.115624,-0.188465,-0.247043,-0.268016,-0.366130,-0.192924,-0.365203,-0.12683,-0.126830,3.828989
4,-0.952071,-0.229734,-0.100638,-0.543400,2.632275,-0.141117,-0.239745,-0.189961,-0.531983,-0.36613,-0.348284,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,4.478343,-0.452443,-0.122464,-0.115624,5.306020,-0.247043,-0.268016,-0.366130,-0.192924,-0.365203,-0.12683,-0.126830,3.828989
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1890,-0.952071,-0.229734,-0.100638,1.840265,-0.379900,-0.141117,-0.239745,-0.189961,-0.531983,-0.36613,-0.348284,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,-0.223297,-0.452443,-0.122464,-0.115624,-0.188465,-0.247043,-0.268016,2.731267,-0.192924,-0.365203,-0.12683,-0.126830,-0.261166
1891,1.050342,-0.229734,-0.100638,-0.543400,-0.379900,-0.141117,-0.239745,-0.189961,-0.531983,-0.36613,2.871220,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,-0.223297,-0.452443,-0.122464,-0.115624,-0.188465,-0.247043,-0.268016,-0.366130,-0.192924,-0.365203,-0.12683,-0.126830,-0.261166
1892,1.050342,-0.229734,-0.100638,-0.543400,-0.379900,-0.141117,-0.239745,-0.189961,-0.531983,-0.36613,-0.348284,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,-0.223297,2.210225,-0.122464,-0.115624,-0.188465,-0.247043,-0.268016,-0.366130,-0.192924,-0.365203,-0.12683,-0.126830,-0.261166
1893,-0.952071,-0.229734,-0.100638,-0.543400,-0.379900,-0.141117,-0.239745,-0.189961,1.879759,-0.36613,-0.348284,-0.185442,-0.108378,-0.273628,-0.155963,-0.224596,-0.148709,-0.288923,-0.215368,-0.223297,-0.452443,-0.122464,-0.115624,-0.188465,-0.247043,-0.268016,-0.366130,-0.192924,-0.365203,-0.12683,7.884584,-0.261166


In [12]:
X_train.shape

(1895, 32)

In [13]:
y_train

0        M
1        T
2        T
3        T
4        T
        ..
1890     M
1891     T
1892     E
1893    ET
1894     M
Name: esrb_rating, Length: 1895, dtype: object

In [14]:
# Training

In [18]:
from sklearn.preprocessing import LabelEncoder
# Create a label encoder
label_encoder = LabelEncoder()

# Fit and transform the target variable for the training set
y_train_encoded = label_encoder.fit_transform(y_train)

In [19]:
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier(),
    "                               XGBoost": XGBClassifier(eval_metric='mlogloss'),
    "                              LightGBM": LGBMClassifier(),
    "                              CatBoost": CatBoostClassifier(verbose=0)
}

for name, model in models.items():
    model.fit(X_train, y_train_encoded)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
                              LightGBM trained.
                              CatBoost trained.


In [20]:
# Result

In [26]:
for name, model in models.items():
    print(name + " Accuracy: {:.3f}%".format(model.score(X_train, y_train_encoded) * 100))

                   Logistic Regression Accuracy: 86.438%
                   K-Nearest Neighbors Accuracy: 85.224%
                         Decision Tree Accuracy: 92.454%
Support Vector Machine (Linear Kernel) Accuracy: 85.541%
   Support Vector Machine (RBF Kernel) Accuracy: 90.607%
                        Neural Network Accuracy: 92.454%
                         Random Forest Accuracy: 92.454%
                     Gradient Boosting Accuracy: 88.179%
                               XGBoost Accuracy: 91.979%
                              LightGBM Accuracy: 91.926%
                              CatBoost Accuracy: 92.243%
