## Esse arquivo é dedicado aos testes com os dados agrupados

#### O arquivo utilizado para apresentar a regressão agrupada é grouped-regression

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the video game sales CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='dataset/video_games_2016.csv')

In [None]:
# Trim outliers: keep only rows whose Global_Sales lies between the 0% and
# 98% quantiles (both bounds inclusive). The 0% quantile is the minimum, so
# in practice only the upper tail is cut.
Q1 = df['Global_Sales'].quantile(q=0.00)
Q3 = df['Global_Sales'].quantile(q=0.98)

sales_in_range = (df['Global_Sales'] >= Q1) & (df['Global_Sales'] <= Q3)
df = df.loc[sales_in_range]

In [None]:
# Add a 'Greatest' column holding the region with the highest sales per row.
# Keys are the regional sales columns; values are the labels stored in
# 'Greatest'.
column_to_number = dict(
    NA_Sales="NA",
    EU_Sales="EU",
    JP_Sales="JP",
    Other_Sales="Outro",
)

# idxmax(axis=1) returns, for each row, the name of the column with the
# largest value; map() then turns that column name into its region label.
df['Greatest'] = df[list(column_to_number)].idxmax(axis=1).map(column_to_number)

In [None]:
# Print how many values are missing in Critic_Score and then Critic_Count,
# one count per line.
print(*df[["Critic_Score", "Critic_Count"]].isna().sum(), sep="\n")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

#
# Regressions used to fill the missing values (NaN) of
# Critic_Score and Critic_Count
#

fdf = df.copy()

# For each target column, rows where the target is known are used to fit the
# model and rows where it is missing are the ones to predict.
train_data = fdf[fdf['Critic_Score'].notna()]
predict_data = fdf[fdf['Critic_Score'].isna()]

train_data_count   = fdf[fdf['Critic_Count'].notna()]
predict_data_count = fdf[fdf['Critic_Count'].isna()]

# Categorical predictors (one-hot encoded by the preprocessor)
categoricos = [
    'Genre',
    'Platform',
    'Greatest',
    'Publisher',
]

# Numeric predictors (passed through unchanged)
numericos = [
    'Global_Sales',
    'NA_Sales',
    'EU_Sales',
    'JP_Sales',
    'Other_Sales'
]

preprocessor = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': categories unseen during fit do not raise
        # at prediction time
        ('cat', OneHotEncoder(handle_unknown='ignore'), categoricos),
        ('num', 'passthrough', numericos)
    ]
)

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42))
])

# ---- Critic_Score ----
X_train = train_data[categoricos + numericos]
y_train = train_data['Critic_Score']

model.fit(X_train, y_train)

X_predict = predict_data[categoricos + numericos]
predicted_scores = model.predict(X_predict)

# In-sample R² (scored on the same rows the model was fitted on), so this is
# an optimistic figure, not a held-out estimate.
print(r2_score( y_train, model.predict( X_train ) ))

# ---- Critic_Count ---- (the same pipeline object is re-fitted)
X_train = train_data_count[categoricos + numericos]
y_train = train_data_count['Critic_Count']

model.fit(X_train, y_train)

# Predict for the rows that are missing Critic_Count. Using the rows missing
# Critic_Score here would produce values for the wrong rows (and a length
# mismatch in the assignment below) whenever the two NaN masks differ.
X_predict = predict_data_count[categoricos + numericos]
predicted_count = model.predict(X_predict)

print(r2_score( y_train, model.predict( X_train ) ))

# Write the predictions into the rows that were missing each value
fdf.loc[fdf['Critic_Score'].isna(), 'Critic_Score'] = predicted_scores
fdf.loc[fdf['Critic_Count'].isna(), 'Critic_Count'] = predicted_count

In [None]:
# Isso foi usado nos slides para demonstrar como
# usar a média ou zero não fazia muito sentido

# fill_0 = [
#     "Critic_Score",
#     "Critic_Count",
#     "User_Score",
#     "User_Count",
# ]

# fill_mean = [
#     'Year_of_Release',
#     'Global_Sales'
# ]

# for c in fill_0 :
#     df[c] = df[c].fillna(0)

# for c in fill_mean :
#     df[c] = df[c].fillna(df[c].mean())


In [None]:
# Encode the platform name as an integer code (pandas categorical codes)
platform_categories = df["Platform"].astype("category")
df["Platform_Code"] = platform_categories.cat.codes

In [None]:
# Fresh working copy of df for the per-game aggregation.
# NOTE(review): this starts again from df, so the Critic_Score / Critic_Count
# values filled into the earlier fdf copy are not present here - confirm that
# this is intended.
fdf = df.copy(deep=True)

def reduce_platforms( l ):
    """Fold an iterable of integer platform codes into a single bitmask.

    Each code ``i`` sets bit ``i + 1`` of the result, so a code of ``-1``
    sets bit 0. Duplicate codes have no additional effect.

    Args:
        l: Iterable of integer codes (Python or NumPy integers).

    Returns:
        A Python ``int`` with one bit set per distinct code; ``0`` for an
        empty iterable.
    """
    s = 0
    for i in l:
        # Coerce to a Python int first: shifting by a fixed-width NumPy
        # integer (e.g. int8) can be evaluated in that narrow type and
        # overflow to 0 for larger codes.
        s |= 1 << (int(i) + 1)
    return s

# Collapse the rows that share a Name into a single row per game title.
grouped = fdf.groupby("Name")

a = grouped.agg(**{
    # Descriptive fields: keep the first value found in each group
    **{
        field: (field, "first")
        for field in ("Genre", "Developer", "Publisher", "Year_of_Release", "Rating")
    },

    # Number of rows in the group (presumably one per platform release)
    "Count_Platforms": ("Name", "count"),

    # Bitmask built by reduce_platforms from the group's platform codes, as a
    # way to record which platforms the game is available on.
    # It did not end up giving much of a result.
    "Platform_Codes": ("Platform_Code", reduce_platforms),

    # Sales figures are summed over the group
    **{
        field: (field, "sum")
        for field in ("Global_Sales", "NA_Sales", "EU_Sales", "JP_Sales", "Other_Sales")
    },

    # Scores: keep the highest value in the group. A Critic_Count-weighted
    # mean of Critic_Score was also tried for True_Score and left disabled.
    "True_Score": ("Critic_Score", "max"),
    "User_Score": ("User_Score", "max"),

    # Review counts are summed over the group
    "User_Count": ("User_Count", "sum"),
    "Critic_Count": ("Critic_Count", "sum"),
}).reset_index()

# Show the four titles with the most rows per name
a.sort_values(by="Count_Platforms", ascending=False).head(n=4)

In [None]:
# Recompute 'Greatest' on the grouped frame: the region with the highest
# summed sales for each game. Keys are the regional sales columns; values are
# the labels stored in 'Greatest'.
column_to_number = dict(
    NA_Sales="NA",
    EU_Sales="EU",
    JP_Sales="JP",
    Other_Sales="Outro",
)

# idxmax(axis=1) picks the column name with the largest value per row; map()
# converts that column name into its region label.
a['Greatest'] = a[list(column_to_number)].idxmax(axis=1).map(column_to_number)

In [None]:
# Mean global sales for each number of supported platforms
fdf = a
fdf = fdf.groupby("Count_Platforms")["Global_Sales"].mean().reset_index()


plt.figure(figsize=(5, 4))
# No `c=` values are passed, so there is nothing to colour-map: a `cmap`
# argument would be ignored and only trigger a matplotlib warning.
plt.scatter(
  fdf['Count_Platforms'],
  fdf['Global_Sales'],

  alpha=.4
)

plt.ylabel('Vendas globais')
plt.xlabel('No de plataforma')

plt.title('Vendas X Plataformas suportadas')

plt.show()

In [None]:

fdf = a

# Lowest global sales observed for each distinct True_Score value
min_sales = fdf.groupby('True_Score')['Global_Sales'].min().reset_index()

plt.figure(figsize=(5, 4))
# No `c=` values are passed, so there is nothing to colour-map: a `cmap`
# argument would be ignored and only trigger a matplotlib warning.
plt.scatter(
  min_sales['True_Score'],
  min_sales['Global_Sales'],

  alpha=.4
)

plt.ylabel('Vendas')
# The x axis shows True_Score (the critic score), not a critic count
plt.xlabel('Avaliação da Crítica')

plt.title('Critic X Vendas')

plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Column the model predicts
target = 'Global_Sales'

numeric_features = [
    "Count_Platforms",
    'True_Score',
    'Critic_Count',
    'Platform_Codes'
]

categorical_features = [
    'Genre',
    'Year_of_Release',
    'Publisher',
    'Developer',
    'Rating'
]


def norm(col):
    """Min-max scale a column: (col - min) / (max - min)."""
    return (col - col.min()) / (col.max() - col.min())


# Columns rescaled with norm() before modelling
to_norm = [
    "Platform_Codes",
    "Critic_Count",
    "True_Score",
]

# Copy first, then normalise the copy: the grouped frame `a` keeps its raw
# values, and this cell no longer depends on what `fdf` was bound to by the
# previously executed cell.
fdf = a.copy()

for col in to_norm:
    fdf[col] = norm(fdf[col])

# Missing categories become the literal 'NA'. Casting to str keeps every
# categorical column uniformly typed: Year_of_Release would otherwise mix
# numbers with the 'NA' string, which OneHotEncoder rejects.
fdf[categorical_features] = fdf[categorical_features].fillna('NA').astype(str)

# NOTE(review): True_Score is a per-group max and can still be NaN when a game
# has no critic score at all; the estimators used later do not accept NaN -
# confirm the input has been imputed before this point.
X = fdf[numeric_features + categorical_features]
y = fdf[ target ]
# Model log(1 + sales) instead of raw sales
y = np.log1p(y)

In [None]:
# Hold out 30% of the rows for evaluation. The split is seeded so that the
# reported metrics are reproducible, like the seeded estimators below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

preprocessor = ColumnTransformer(
    transformers=[
        # Numeric columns plus their interaction terms (products of distinct
        # features) up to degree 3, without a bias column
        ('num', PolynomialFeatures(degree=3, include_bias=False, interaction_only=True) , numeric_features),
        # ('num', "passthrough" , numeric_features),
        # One-hot encode categoricals, dropping the first level of each;
        # categories unseen during fit are ignored at prediction time
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

gdboost = GradientBoostingRegressor(
    random_state=0,
    learning_rate=0.05,
    max_depth=8,
    n_estimators=800,
    min_samples_split=15,
    min_samples_leaf=10
)

# Alternative regressor; not used by the pipeline unless swapped in below
rforest = RandomForestRegressor(
    random_state=42,
    max_depth=9,
    n_estimators=800
)

# Swap the active 'regressor' line to try a different estimator
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', gdboost )
                                #  ('regressor', rforest)
                                #  ('regressor', LinearRegression())
                                #  ('regressor', Ridge())
                                ])


In [None]:
# Fit the whole pipeline on the training split, then predict the held-out
# split (Pipeline.fit returns the fitted pipeline itself).
y_pred = model_pipeline.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out predictions. y was log1p-transformed before the split,
# so both metrics are on that transformed scale.
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")


In [None]:
# Overlay the actual and the predicted targets of the held-out rows against
# the critic score.
plt.figure(figsize=(5, 4))

plt.scatter(x=X_test['True_Score'], y=y_test, label='Real', alpha=0.2)
plt.scatter(x=X_test["True_Score"], y=y_pred, label='Previsão', alpha=0.2, color='red')

plt.title('Regressão Polinomial')
plt.xlabel('Avaliação da Crítica')
plt.ylabel('Vendas globais')

plt.legend()
plt.show()

In [None]:
# Residual plot: prediction error (actual - predicted) of the held-out rows
# against the critic score.
y_pred = model_pipeline.predict(X_test)
residuals = y_test - y_pred
x_axis_values = X_test['True_Score']

plt.figure(figsize=(8, 6))

# Dashed reference line at zero error, then the residuals themselves
plt.axhline(y=0, linestyle='--', color='red', label='Zero Error Line')
plt.scatter(x=x_axis_values, y=residuals, color='blue', alpha=0.5, label='Residuals')

plt.xlabel('critica')
plt.ylabel('..')
plt.title('residuos')

# Keep the legend in the original order: residuals first, zero line second
handles, labels = plt.gca().get_legend_handles_labels()
plt.legend(handles[::-1], labels[::-1])
plt.show()
