In [1]:

import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Carrega o dataset como dataframe
df = pd.read_csv('dataset/video_games_2016.csv')

In [3]:
# Remove outliers
Q1 = df['Global_Sales'].quantile(0.00)
Q3 = df['Global_Sales'].quantile(0.98)

df = df[(df['Global_Sales'] >= Q1 ) & (df['Global_Sales'] <= Q3 )]

In [4]:
# Cria coluna com a região que mais vendeu
column_to_number = {
    'NA_Sales':    "NA",
    'EU_Sales':    "EU",
    'JP_Sales':    "JP",
    'Other_Sales': "Outro" 
}

df['Greatest'] = df[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales']].idxmax(axis=1).map(column_to_number)

In [35]:
# Transforma colunas categóricas em valores numéricos
# necessários para utilização de árvore de decisão

df["Publisher"] = df["Publisher"].fillna("NA").astype('category').cat.codes
df["Platform"]  = df["Platform"].fillna("NA").astype('category').cat.codes
df["Genre"]     = df["Genre"].fillna("NA").astype('category').cat.codes
df["Developer"] = df["Developer"].fillna("NA").astype('category').cat.codes
# df["Greatest"]  = df["Greatest"].astype('category').cat.codes
df["Rating"]    = df["Rating"].astype('category').cat.codes

# Preenche os NAs com 0
df["Critic_Score"] = df["Critic_Score"].fillna(0)
df["Critic_Count"] = df["Critic_Count"].fillna(0)
df["JP_Sales"]     = df["JP_Sales"].fillna(0)

# df = df.dropna()

df['Critic_Classification'] = pd.cut(df['Critic_Score'], bins=10, labels=[x for x in range(0, 10)] )
df['Critic_Count_Classification'] = pd.cut(df['Critic_Count'], bins=4, labels=[1, 2, 3, 4])

# df.fillna("NA")
df["Year_of_Release"] = df["Year_of_Release"].fillna(2000);

print( df[["Critic_Score", "Critic_Classification"]] )


       Critic_Score Critic_Classification
335            85.0                     8
336            91.0                     9
337            88.0                     8
338            81.0                     8
339             0.0                     0
...             ...                   ...
16714           0.0                     0
16715           0.0                     0
16716           0.0                     0
16717           0.0                     0
16718           0.0                     0

[16384 rows x 2 columns]


In [36]:
# Arvore baseada em
X = np.array(df[[
    "Publisher",
    "Platform", 
    "Genre",
    "Developer",
    # "Rating",
    "Year_of_Release",
    "Critic_Classification"
]])

# Saída : NA | EU | JP | Outros
y = np.array(df['Greatest'])

In [37]:
from sklearn import tree
from sklearn.model_selection import train_test_split

# Dividindo os conjuntos de treinamento e teste
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(X, y, test_size=0.3)

modeloarvoredecisao = tree.DecisionTreeClassifier(criterion='entropy', class_weight="balanced")     

In [38]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

gradient_boost = GradientBoostingClassifier(
    n_estimators=400,
    learning_rate=0.1,
    max_depth=3,
    random_state=0
)

gradient_boost.fit(X_treinamento, y_treinamento)
y_pred = gradient_boost.predict(X_teste) 

accuracy = accuracy_score(y_teste, y_pred)

print("Gradient Model Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_teste, y_pred))

Gradient Model Accuracy: 0.8342148087876322

Classification Report:
               precision    recall  f1-score   support

          EU       0.74      0.54      0.62       678
          JP       0.82      0.82      0.82      1196
          NA       0.86      0.91      0.88      3027
       Outro       0.31      0.33      0.32        15

    accuracy                           0.83      4916
   macro avg       0.68      0.65      0.66      4916
weighted avg       0.83      0.83      0.83      4916



In [39]:
modeloarvoredecisao.fit(X_treinamento, y_treinamento)
modeloarvoredecisao.score(X_treinamento, y_treinamento)

0.9843041506801534

In [40]:
from sklearn import metrics


y_predicao = modeloarvoredecisao.predict(X_teste)

print("Acurácia:",metrics.accuracy_score(y_teste, y_predicao))
print("Precisão:",metrics.precision_score(y_teste, y_predicao, average=None))
print("Recall:",metrics.recall_score(y_teste, y_predicao, average=None))
print("F1-score:",metrics.f1_score(y_teste, y_predicao, average=None))

Acurácia: 0.7711554109031733
Precisão: [0.55538695 0.74062251 0.83529017 0.30434783]
Recall: [0.53982301 0.77591973 0.82259663 0.46666667]
F1-score: [0.54749439 0.75786035 0.82889481 0.36842105]


In [None]:
labels = ["NA", "EU", "JP", "Outro"]

matrizconfusao = metrics.confusion_matrix(y_teste, y_predicao, labels=labels)

ax = plt.subplot()

sns.heatmap(matrizconfusao, annot=True, ax = ax);

# labels, title and ticks
ax.set_xlabel('Predição');
ax.set_ylabel('Valor Real'); 
ax.set_title('Matriz de Confusão'); 
ax.xaxis.set_ticklabels( labels ); 
ax.yaxis.set_ticklabels( labels );