# Tarea 4

In [1]:
import os
from functools import partial

import numpy as np
import pandas as pd
from dotenv import load_dotenv

In [2]:
# Load the raw SMM2 level dataset. The CSV location is read from the
# DATASET_STORE_FILE environment variable (load_dotenv() lets it come from a
# local .env file).
load_dotenv()
dataset_path = os.getenv("DATASET_STORE_FILE")
if not dataset_path:
    # Fail early with an actionable message instead of the opaque error that
    # pd.read_csv raises when it is handed None.
    raise RuntimeError(
        "DATASET_STORE_FILE is not set; define it in the environment or in a .env file."
    )
smm2_df = pd.read_csv(dataset_path)

# Columns excluded from the analysis frame.
columns_to_drop = ['data_id', 'name', 'description', 'gamestyle', 'theme', 'tag1', 'tag2', 'clear_condition', 'clear_condition_magnitude', 'weekly_likes', 'weekly_plays', 'uploader_pid', 'first_completer_pid', 'record_holder_pid']
smm2_df_clean = smm2_df.drop(columns=columns_to_drop)

# Derived feature: clears divided by unique players, capped at 1.0.
# NOTE(review): a row with unique_players_and_versus == 0 would produce NaN
# (0/0) or be capped to 1.0 (x/0) -- confirm no such rows exist in the dataset.
smm2_df_clean['unique_clear_rate'] = np.minimum(smm2_df_clean['clears'] / smm2_df_clean['unique_players_and_versus'], 1.0)
smm2_df_clean.head(5)

Unnamed: 0,uploaded,created,difficulty,game_version,world_record,upload_time,upload_attempts,num_comments,timer,autoscroll_speed,clears,attempts,clear_rate,plays,versus_matches,coop_matches,likes,boos,unique_players_and_versus,unique_clear_rate
0,1621061069,1621092960,2,5,15283,15266,1,0,20,0,23,396,5.808081,45,0,0,3,2,38,0.605263
1,1621077412,1621080840,1,5,47217,62592,1,0,500,0,16,59,27.118644,26,0,0,1,1,24,0.666667
2,1621051239,1621033080,1,5,53933,57333,1,0,200,0,1,12,8.333333,4,0,0,0,0,4,0.25
3,1621064079,1620710520,1,5,223387,198789,1,1,500,0,6,59,10.169492,22,0,0,1,0,21,0.285714
4,1621059182,1620229080,0,5,20533,32690,3,2,300,0,84,160,52.5,95,2,0,13,4,86,0.976744


## Métodos de filtro

El artículo *What Makes a Level Hard in Super Mario Maker 2?* (Carlo A. Furia, Andrea Mocci) utiliza regresión lineal para identificar las variables importantes dentro del dataset. Sin embargo, en su artículo ellos tratan de predecir el *clear_rate* (variable continua), mientras que yo trato de predecir *difficulty* (variable categórica). Por eso aquí se usan puntuaciones de filtro pensadas para clasificación: ANOVA F (`f_classif`) e información mutua (`mutual_info_classif`).

In [3]:
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

In [4]:
# Target is the `difficulty` column; predictors are every other column of the
# cleaned frame.
smm2_y = smm2_df_clean['difficulty']
smm2_x = smm2_df_clean.drop(columns='difficulty')

In [5]:
X, y = smm2_x, smm2_y

# Score every predictor against the target with the ANOVA F-test and keep the
# 10 highest-scoring columns.
selector_f = SelectKBest(k=10, score_func=f_classif)
selector_f.fit(X, y)
X_new_f = selector_f.transform(X)

mask_f = selector_f.get_support()
scores_f = selector_f.scores_

# One row per feature, best F-score first; `selected` marks the kept columns.
df_scores_f = (
    pd.DataFrame({'feature': X.columns, 'f_score': scores_f, 'selected': mask_f})
    .sort_values(by='f_score', ascending=False)
)
df_scores_f

Unnamed: 0,feature,f_score,selected
11,clear_rate,7044.98925,True
18,unique_clear_rate,5186.731661,True
5,upload_attempts,834.32261,True
3,world_record,203.638779,True
4,upload_time,168.53213,True
7,timer,63.222419,True
10,attempts,19.326281,True
9,clears,8.572547,True
16,boos,7.234048,True
12,plays,3.621015,True


In [6]:
X = smm2_x
y = smm2_y

# mutual_info_classif adds small random noise to continuous features, so its
# scores (and potentially the selected set) change from run to run unless the
# random state is pinned. Fix the seed so Restart & Run All is reproducible.
MI_SEED = 42

# Score every predictor by mutual information with the target and keep the
# 10 highest-scoring columns.
selector_mi = SelectKBest(
    score_func=partial(mutual_info_classif, random_state=MI_SEED),
    k=10,
)
X_new_mi = selector_mi.fit_transform(X, y)

scores_mi = selector_mi.scores_
mask_mi = selector_mi.get_support()

# One row per feature, best MI score first; `selected` marks the kept columns.
df_scores_mi = pd.DataFrame({
    'feature': X.columns,
    'mi_score': scores_mi,
    'selected': mask_mi,
}).sort_values('mi_score', ascending=False)
df_scores_mi

Unnamed: 0,feature,mi_score,selected
11,clear_rate,0.728994,True
18,unique_clear_rate,0.475217,True
5,upload_attempts,0.192704,True
3,world_record,0.151157,True
9,clears,0.144036,True
10,attempts,0.082553,True
13,versus_matches,0.064142,True
4,upload_time,0.033742,True
17,unique_players_and_versus,0.032349,True
16,boos,0.030715,True


In [7]:
# Column names kept by each selector, in the original column order of smm2_x.
seltectec_col_f = smm2_x.columns[selector_f.get_support()]
seltectec_col_mi = smm2_x.columns[selector_mi.get_support()]

chosen_f = set(seltectec_col_f)
chosen_mi = set(seltectec_col_mi)

# Sort before printing: the iteration order of a set of strings can differ
# between interpreter sessions, which made this output change across re-runs.
print(f'Diferencias de Selección: {sorted(chosen_f ^ chosen_mi)}')
print(f'En ANOVA F pero no en Mutual Information: {sorted(chosen_f - chosen_mi)}')
print(f'En Mutual Information pero no en ANOVA F: {sorted(chosen_mi - chosen_f)}')

Diferencias de Selección: {'plays', 'unique_players_and_versus', 'versus_matches', 'timer'}
En ANOVA F pero no en Mutual Information: ['plays', 'timer']
En Mutual Information pero no en ANOVA F: ['unique_players_and_versus', 'versus_matches']


## Métodos de envoltura

La regresión logística es el mejor estimador para las variables dependientes categóricas, pero los datos deben estar estandarizados para que el estimador funcione correctamente.

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import ExhaustiveFeatureSelector, SequentialFeatureSelector

In [9]:
# Standardise the columns chosen by the ANOVA F selector.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(smm2_x[seltectec_col_f])

# Rebuild a labelled frame from the scaled array. Carry over the original row
# index so X stays label-aligned with y even if the source frame is ever
# filtered or re-indexed (previously X silently received a fresh RangeIndex).
X = pd.DataFrame(X_scaled, columns=seltectec_col_f, index=smm2_x.index)
y = smm2_y

# NOTE(review): the scaler is fitted on every row before the cross-validated
# selectors run, so each validation fold contributes to the scaling statistics.
# Consider wrapping the scaler and the model in a Pipeline if strict fold
# isolation is required.
estimator = LogisticRegression(max_iter=2000)
estimator


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000


In [10]:
# Evaluate every feature subset of size 1 or 2 with 5-fold cross-validation,
# scoring each one by negative mean absolute error on the difficulty label.
efs = ExhaustiveFeatureSelector(
    estimator=estimator,
    min_features=1,
    max_features=2,
    scoring='neg_mean_absolute_error',
    cv=5,
).fit(X, y)

# One row per evaluated subset, best (least negative) average score first.
efs_result = (
    pd.DataFrame(efs.get_metric_dict())
    .T
    .sort_values(by='avg_score', ascending=False)
)
efs_result

Features: 55/55

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
30,"(2, 6)","[-0.191, -0.1795, -0.1725, -0.191, -0.1805]",-0.1829,"(upload_attempts, clear_rate)",0.009209,0.007165,0.003583
36,"(3, 6)","[-0.197, -0.1865, -0.1785, -0.2005, -0.1845]",-0.1894,"(timer, clear_rate)",0.010477,0.008151,0.004076
49,"(6, 7)","[-0.1965, -0.1875, -0.1825, -0.2005, -0.187]",-0.1908,"(clear_rate, plays)",0.008541,0.006645,0.003323
45,"(5, 6)","[-0.1965, -0.1855, -0.183, -0.202, -0.187]",-0.1908,"(attempts, clear_rate)",0.009292,0.007229,0.003615
50,"(6, 8)","[-0.1955, -0.187, -0.183, -0.203, -0.1875]",-0.1912,"(clear_rate, boos)",0.009202,0.00716,0.00358
6,"(6,)","[-0.1965, -0.188, -0.183, -0.201, -0.1875]",-0.1912,"(clear_rate,)",0.008434,0.006562,0.003281
15,"(0, 6)","[-0.1955, -0.193, -0.1815, -0.2015, -0.185]",-0.1913,"(world_record, clear_rate)",0.009274,0.007215,0.003608
23,"(1, 6)","[-0.1965, -0.1895, -0.185, -0.2005, -0.186]",-0.1915,"(upload_time, clear_rate)",0.007765,0.006042,0.003021
41,"(4, 6)","[-0.1965, -0.189, -0.1835, -0.201, -0.1875]",-0.1915,"(clears, clear_rate)",0.008159,0.006348,0.003174
51,"(6, 9)","[-0.205, -0.1965, -0.1985, -0.2025, -0.191]",-0.1987,"(clear_rate, unique_clear_rate)",0.006252,0.004864,0.002432


In [11]:
# Forward sequential selection over subset sizes in the range 1 to 5, using
# 5-fold cross-validation and negative mean absolute error as the score.
sfs = SequentialFeatureSelector(
    estimator=estimator,
    k_features=(1, 5),
    forward=True,
    scoring='neg_mean_absolute_error',
    cv=5,
).fit(X, y)

# One row per subset size visited, best (least negative) average score first.
sfs_result = (
    pd.DataFrame(sfs.get_metric_dict())
    .T
    .sort_values(by='avg_score', ascending=False)
)
sfs_result

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names,ci_bound,std_dev,std_err
5,"(2, 3, 4, 6, 7)","[-0.187, -0.176, -0.169, -0.1875, -0.1765]",-0.1792,"(upload_attempts, timer, clears, clear_rate, p...",0.009112,0.007089,0.003545
3,"(2, 3, 6)","[-0.187, -0.1775, -0.1715, -0.1865, -0.176]",-0.1797,"(upload_attempts, timer, clear_rate)",0.007824,0.006088,0.003044
4,"(2, 3, 6, 7)","[-0.1875, -0.1765, -0.171, -0.188, -0.1765]",-0.1799,"(upload_attempts, timer, clear_rate, plays)",0.008635,0.006719,0.003359
2,"(2, 6)","[-0.191, -0.1795, -0.1725, -0.191, -0.1805]",-0.1829,"(upload_attempts, clear_rate)",0.009209,0.007165,0.003583
1,"(6,)","[-0.1965, -0.188, -0.183, -0.201, -0.1875]",-0.1912,"(clear_rate,)",0.008434,0.006562,0.003281
