In [16]:
import pandas as pd

# Read the prepared feature table from the working directory.
df = pd.read_csv('features.csv')

# Keep a separate handle on the label column; a later cell re-attaches it
# after the feature selection has dropped it from the frame.
y = df.loc[:, 'target']

# Quick sanity check: first five rows and the full column index.
print(df.head(n=5))
print(df.columns)

   age  Pstatus  Medu  Fedu  Mjob  Fjob  guardian  traveltime  studytime  \
0   18        0     4     4     0     4         1           2          2   
1   17        1     1     1     0     2         0           1          2   
2   15        1     1     1     0     2         1           1          2   
3   15        1     4     2     1     3         1           1          3   
4   16        1     3     3     2     2         0           1          2   

   failures  ...  freetime  goout  Dalc  Walc  health  absences  target  \
0         0  ...         3      4     1     1       3         4      11   
1         0  ...         3      3     1     1       3         2      11   
2         0  ...         3      2     2     3       3         6      12   
3         0  ...         2      2     1     1       5         0      14   
4         0  ...         3      2     1     2       5         0      13   

   is_male  urban_address  big_family  
0        0              1           1  
1        0  

In [17]:
# Feature engineering on the label-encoded student table. All derived columns
# are added to a copy, so the frame bound to `df` is only replaced by the
# final rebinding at the end of this cell.
df_features = df.copy()

# --- Parental education -------------------------------------------------------
# Mean education level of the two parents
df_features['parents_avg_edu'] = (df_features['Medu'] + df_features['Fedu']) / 2

# Absolute gap between the parents' education levels
df_features['parents_edu_diff'] = (df_features['Medu'] - df_features['Fedu']).abs()

# High parental education flag (both parents at level 3 or above)
df_features['high_parent_edu'] = ((df_features['Medu'] >= 3) & (df_features['Fedu'] >= 3)).astype(int)

# Low parental education flag (both parents at level 2 or below)
df_features['low_parent_edu'] = ((df_features['Medu'] <= 2) & (df_features['Fedu'] <= 2)).astype(int)

# --- Support and extra resources ----------------------------------------------
# Total support score (school + family)
df_features['total_support'] = df_features['schoolsup'] + df_features['famsup']

# Extra educational resources score
df_features['extra_resources'] = df_features['paid'] + df_features['activities'] + df_features['nursery']

# --- Alcohol and social life --------------------------------------------------
# Total alcohol consumption score (weighted sum: weekend use weighs 1.5x)
df_features['alcohol_total'] = df_features['Dalc'] + (df_features['Walc'] * 1.5)

# Problematic alcohol consumption flag
df_features['alcohol_problem'] = ((df_features['Dalc'] >= 3) | (df_features['Walc'] >= 4)).astype(int)

# Social life score (goout + freetime)
df_features['social_score'] = df_features['goout'] + df_features['freetime']

# Flag for a busy social life combined with little study time
df_features['social_vs_study'] = ((df_features['goout'] >= 4) & (df_features['studytime'] <= 2)).astype(int)

# --- Academic behaviour -------------------------------------------------------
# Dedicated student flag (high study time + few absences)
df_features['dedicated_student'] = ((df_features['studytime'] >= 3) & (df_features['absences'] <= 5)).astype(int)

# At-risk student flag (any of: past failures, many absences, minimal study time)
df_features['at_risk_student'] = ((df_features['failures'] >= 1) |
                                  (df_features['absences'] >= 10) |
                                  (df_features['studytime'] <= 1)).astype(int)

# Academic commitment score; the last term penalises absences
df_features['academic_commitment'] = (df_features['studytime'] * 2 +
                                      df_features['higher'] * 3 +
                                      (5 - df_features['absences'] / 10))

# --- Family and access to resources -------------------------------------------
# Family stability score: parents together + relationship quality + smaller family
df_features['family_stability'] = (df_features['Pstatus'] +
                                   df_features['famrel'] +
                                   (1 - df_features['big_family']))

# Resource access score: internet + urban address + short travel time.
# The travel-time indicator is cast to int explicitly, like every other flag here.
df_features['resource_access'] = (df_features['internet'] +
                                  df_features['urban_address'] +
                                  (df_features['traveltime'] <= 2).astype(int))

# --- Age ----------------------------------------------------------------------
# Appropriate-age flag (15-17 is treated as the normal range)
df_features['appropriate_age'] = ((df_features['age'] >= 15) & (df_features['age'] <= 17)).astype(int)

# Likely-repeated flag (old for the grade level)
df_features['likely_repeated'] = (df_features['age'] >= 19).astype(int)

# --- Wellbeing ----------------------------------------------------------------
# General wellbeing score; the last term penalises alcohol consumption
df_features['wellbeing_score'] = (df_features['health'] +
                                  df_features['famrel'] +
                                  (5 - df_features['alcohol_total'] / 2))

# Compromised health flag
df_features['health_issues'] = (df_features['health'] <= 2).astype(int)

# --- Parental occupation ------------------------------------------------------
# Prestige per job category (teacher and health rank highest). Mjob/Fjob hold
# integer codes. The rows printed when features.csv was loaded (Mjob 0,0,0,1,2
# and Fjob 4,2,2,3,2) only line up with the first records of the UCI
# student-performance data if the codes were assigned in alphabetical order of
# the category names. The previous hand-typed map assumed 0 = teacher, which
# gave at_home the top prestige score. The category list is therefore kept in
# alphabetical (code) order and the code -> prestige map is derived from it.
# NOTE(review): confirm this order against the encoder that produced features.csv.
job_prestige = {'at_home': 1, 'health': 4, 'other': 2, 'services': 3, 'teacher': 4}
job_categories = sorted(job_prestige)
prestige_map = {code: job_prestige[job] for code, job in enumerate(job_categories)}

# Mean prestige of both parents' jobs; unknown codes fall back to 2 (the "other" score)
df_features['parent_job_prestige'] = (df_features['Mjob'].map(prestige_map).fillna(2) +
                                      df_features['Fjob'].map(prestige_map).fillna(2)) / 2

# --- Interaction terms --------------------------------------------------------
# Parental education x support
df_features['edu_support_interaction'] = df_features['parents_avg_edu'] * df_features['total_support']

# Study time x past failures
df_features['study_failure_interaction'] = df_features['studytime'] * (1 + df_features['failures'])

# Age x past failures (grade repetition)
df_features['age_failure_interaction'] = df_features['age'] * (1 + df_features['failures'])

# --- Z-score normalisation ----------------------------------------------------
# Mean and std are computed on this dataset itself; the small epsilon guards
# against division by zero for a constant column.
# NOTE(review): a z-score is a linear transform, so each *_normalized column has
# the same absolute correlation with the target as its raw counterpart.
numeric_features = ['age', 'absences', 'studytime', 'freetime', 'goout']

for feature in numeric_features:
    if feature in df_features.columns:
        mean_val = df_features[feature].mean()
        std_val = df_features[feature].std()
        df_features[f'{feature}_normalized'] = (df_features[feature] - mean_val) / (std_val + 1e-8)

# --- Behavioural profiles -----------------------------------------------------
# Balanced student flag
df_features['balanced_student'] = ((df_features['studytime'] >= 2) &
                                   (df_features['freetime'] >= 2) &
                                   (df_features['goout'] <= 3) &
                                   (df_features['alcohol_total'] <= 4)).astype(int)

# Behavioural risk score: problematic drinking counts double; a romantic
# relationship, frequent going out and many absences add one point each
df_features['behavioral_risk'] = (df_features['alcohol_problem'] * 2 +
                                  df_features['romantic'] +
                                  (df_features['goout'] >= 4).astype(int) +
                                  (df_features['absences'] >= 8).astype(int))

# =============================================================================
# Aggregated features by group
# =============================================================================

# Socioeconomic advantage score.
# NOTE(review): resource_access already contains urban_address and internet,
# so both are effectively counted twice here - confirm this weighting is intended.
df_features['socioeconomic_advantage'] = (df_features['parents_avg_edu'] +
                                          df_features['urban_address'] +
                                          df_features['internet'] +
                                          df_features['resource_access'])

# Expanded total support score
df_features['comprehensive_support'] = (df_features['total_support'] +
                                        df_features['extra_resources'] +
                                        df_features['family_stability'] +
                                        df_features['high_parent_edu'])

# From here on `df` refers to the enriched frame.
df = df_features

In [18]:
def select_best_features(df_features, target_col='target', correlation_threshold=0.05):
    """
    Select the features most correlated with the target.

    Computes the absolute Pearson correlation between every numeric (or
    boolean) column and ``target_col``, keeps those whose value is at least
    ``correlation_threshold``, prints the ranking and returns the names.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Frame holding the candidate features and the target column.
    target_col : str, default 'target'
        Name of the column to correlate against.
    correlation_threshold : float, default 0.05
        Minimum absolute correlation a feature needs to be kept.

    Returns
    -------
    list of str
        Selected feature names, ordered by decreasing absolute correlation.
        The target itself is never included.

    Raises
    ------
    KeyError
        If ``target_col`` is missing or is not a numeric/boolean column.
    """
    # Restrict to columns a Pearson correlation is defined for. Calling
    # DataFrame.corr() on a frame that still holds string/object columns
    # raises on pandas >= 2.0, where numeric_only defaults to False.
    numeric = df_features.select_dtypes(include=['number', 'bool'])
    if target_col not in numeric.columns:
        raise KeyError(
            f"target column {target_col!r} is missing or not numeric"
        )

    # Absolute correlation of each column with the target. Constant columns
    # yield NaN; drop them explicitly rather than relying on NaN >= x being False.
    correlations = (
        numeric.corr()[target_col]
        .abs()
        .dropna()
        .sort_values(ascending=False)
    )

    # Keep features at or above the threshold
    significant_features = correlations[correlations >= correlation_threshold].index.tolist()

    # The target always correlates perfectly with itself; remove it from the result
    if target_col in significant_features:
        significant_features.remove(target_col)

    print(f"Features selecionadas ({len(significant_features)}):")
    for feature in significant_features:
        corr_value = correlations[feature]
        print(f"  {feature}: {corr_value:.3f}")

    return significant_features

# Rank every column of the enriched frame against the label, spelling out the
# function's default target name and 0.05 cutoff.
# NOTE(review): the ranking uses all rows of `df`; if a train/test split happens
# later, confirm the selection should not be fitted on the training part only.
best_features = select_best_features(df, target_col='target', correlation_threshold=0.05)

Features selecionadas (54):
  failures: 0.393
  age_failure_interaction: 0.386
  at_risk_student: 0.356
  academic_commitment: 0.353
  higher: 0.332
  socioeconomic_advantage: 0.278
  studytime: 0.250
  studytime_normalized: 0.250
  parents_avg_edu: 0.249
  Medu: 0.240
  behavioral_risk: 0.222
  Fedu: 0.212
  Dalc: 0.205
  resource_access: 0.204
  alcohol_total: 0.204
  low_parent_edu: 0.203
  dedicated_student: 0.201
  alcohol_problem: 0.195
  balanced_student: 0.192
  high_parent_edu: 0.189
  Walc: 0.177
  urban_address: 0.168
  likely_repeated: 0.165
  internet: 0.150
  Mjob: 0.148
  social_vs_study: 0.140
  is_male: 0.129
  comprehensive_support: 0.128
  traveltime: 0.127
  social_score: 0.127
  freetime: 0.123
  freetime_normalized: 0.123
  study_failure_interaction: 0.116
  appropriate_age: 0.110
  age_normalized: 0.107
  age: 0.107
  health: 0.099
  absences_normalized: 0.091
  absences: 0.091
  romantic: 0.091
  goout: 0.088
  goout_normalized: 0.088
  edu_support_interaction: 

In [20]:
# Keep only the selected predictors and re-attach the label in a single step.
# .assign() returns a new frame, so nothing is written into a slice of the
# previous `df` (the former `df['target'] = y` on a column subset triggers
# pandas' SettingWithCopyWarning).
df = df[best_features].assign(target=y)

# Inspect the resulting schema and a small preview instead of dumping the whole frame.
print(df.columns)
print(df.head())

Index(['failures', 'age_failure_interaction', 'at_risk_student',
       'academic_commitment', 'higher', 'socioeconomic_advantage', 'studytime',
       'studytime_normalized', 'parents_avg_edu', 'Medu', 'behavioral_risk',
       'Fedu', 'Dalc', 'resource_access', 'alcohol_total', 'low_parent_edu',
       'dedicated_student', 'alcohol_problem', 'balanced_student',
       'high_parent_edu', 'Walc', 'urban_address', 'likely_repeated',
       'internet', 'Mjob', 'social_vs_study', 'is_male',
       'comprehensive_support', 'traveltime', 'social_score', 'freetime',
       'freetime_normalized', 'study_failure_interaction', 'appropriate_age',
       'age_normalized', 'age', 'health', 'absences_normalized', 'absences',
       'romantic', 'goout', 'goout_normalized', 'edu_support_interaction',
       'wellbeing_score', 'health_issues', 'guardian', 'family_stability',
       'schoolsup', 'famrel', 'activities', 'famsup', 'paid',
       'parent_job_prestige', 'Fjob'],
      dtype='object')
     

In [10]:
# Write the current `df` to disk; the row index is left out of the file.
df.to_csv(path_or_buf='new_features.csv', index=False)