# **Final Project Task 3 - Census Modeling Regression**

In [3]:
import pandas as pd

In [4]:
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
    "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
    "hours-per-week", "native-country", "income"
]

data = pd.read_csv(data_url, header=None, names=columns, na_values=" ?", skipinitialspace=True)
data.sample(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
20646,19,?,117201,Some-college,10,Never-married,?,Own-child,White,Male,0,0,22,United-States,<=50K
14742,31,?,163890,Some-college,10,Never-married,?,Unmarried,Black,Female,0,0,40,United-States,<=50K
3766,37,Local-gov,264503,HS-grad,9,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,<=50K
22991,36,Private,198841,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
32049,32,Private,313835,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,>50K
12582,31,Private,193285,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K
5228,27,?,330132,HS-grad,9,Never-married,?,Not-in-family,White,Female,0,0,25,United-States,<=50K
8904,38,Federal-gov,307404,Some-college,10,Divorced,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
2302,26,Private,386585,Some-college,10,Divorced,Tech-support,Not-in-family,White,Male,0,0,60,United-States,<=50K
19880,28,Private,186239,Some-college,10,Never-married,Adm-clerical,Unmarried,Black,Female,0,0,40,United-States,<=50K


### Data Preparation

In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor, plot_tree

from sklearn.linear_model import LinearRegression, SGDRegressor, Lasso, HuberRegressor, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

import matplotlib.pyplot as plt
import seaborn as sns

from math import sqrt

In [6]:
#Import the dataset we will be working on 
data = pd.read_csv("C:\\Users\\Simina\\OneDrive\\ADC\\ML1\\ubb-sociology-ml\\final_project\\data_normalized.csv")
data.sample(20)

Unnamed: 0,age,fnlwgt,education,education-num,sex,capital-gain,capital-loss,hours-per-week,income,workclass_Local-gov,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,age_bin
16501,40,1.913489,HS-grad,-0.42006,0,-0.14592,-0.21666,-0.035429,0,False,...,False,False,False,False,False,False,True,False,False,35-44
26428,34,-0.903523,Some-college,-0.03136,1,-0.14592,-0.21666,-0.035429,0,False,...,False,False,False,False,False,False,True,False,False,25-34
4729,69,4.159408,Some-college,-0.03136,0,-0.14592,-0.21666,-1.655225,0,False,...,False,False,False,False,False,False,True,False,False,65+
8943,22,-0.391739,HS-grad,-0.42006,0,-0.14592,-0.21666,-0.035429,0,False,...,False,False,False,False,False,False,True,False,False,18-24
19250,54,-0.16678,Some-college,-0.03136,1,-0.14592,-0.21666,-0.035429,1,False,...,False,False,False,False,False,False,True,False,False,45-54
23960,37,-1.39399,HS-grad,-0.42006,1,-0.14592,-0.21666,-0.035429,0,False,...,False,False,False,False,False,False,True,False,False,35-44
22873,32,0.078539,11th,-1.197459,1,-0.14592,-0.21666,-0.035429,0,False,...,False,False,False,False,False,False,True,False,False,25-34
5573,33,-0.237146,Bachelors,1.134739,1,-0.14592,-0.21666,-0.035429,0,False,...,False,False,False,False,False,False,True,False,False,25-34
10820,39,0.107312,Bachelors,1.134739,0,-0.14592,-0.21666,-0.845327,0,False,...,False,False,False,False,False,False,True,False,False,35-44
7536,43,-0.658479,HS-grad,-0.42006,1,-0.14592,-0.21666,-0.035429,1,False,...,False,False,False,False,False,False,True,False,False,35-44


In [7]:
# Define features and target variable
target_column = 'hours-per-week'
## Separate features (X) and target variable (y)
X = data.drop(columns=['hours-per-week'])
y = data['hours-per-week']

In [8]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

In [9]:
#Split the data in 3 sets for train, validation and test - 60% train, 20% validation, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [10]:
# Preprocessing pipeline for numerical and categorical data 
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])


### Model Training and Experimentation

In [11]:
# Define a function to evaluate models
def evaluate_model(model):
    # Create a pipeline that includes preprocessing, scaling, and model training
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),               
        ('scaler', StandardScaler()),                 
        ('model', model)                              
    ])
    
    # Fit the model on training data
    pipeline.fit(X_train, y_train)
    
    # Predict on validation set
    y_pred_test = pipeline.predict(X_test)
    
    # Calculate metrics
    mse = mean_squared_error(y_test, y_pred_test)
    mae = mean_absolute_error(y_test, y_pred_test)
    r2 = r2_score(y_test, y_pred_test)
    
    return mse, mae, r2, pipeline

# Initialize models with default settings
models = {
    'SGDRegressor': SGDRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'Ridge': Ridge(),
    'Lasso': Lasso()
}

# Evaluate each model and store results
results = {}
pipelines = {}

for name, model in models.items():
    mse, mae, r2, pipeline = evaluate_model(model)
    results[name] = {'MSE': mse, 'MAE': mae, 'R²': r2}
    pipelines[name] = pipeline

# Convert results to DataFrame for better visualization
results_df = pd.DataFrame(results).T
print(results_df)


                            MSE       MAE        R²
SGDRegressor           0.852521  0.641523  0.160513
DecisionTreeRegressor  1.620170  0.871664 -0.595400
RandomForestRegressor  0.937613  0.682281  0.076722
Ridge                  0.847372  0.637008  0.165583
Lasso                  1.015587  0.619802 -0.000060


In [None]:
# Ridge Regression are cel mai mic MSE, ceea ce înseamnă că face cele mai mici erori pătratice în predicții.
# DecisionTreeRegressor are cel mai mare MSE, sugerând overfitting 

# Lasso are cel mai mic MAE, ceea ce înseamnă că, în medie, predicțiile sale sunt cele mai apropiate de valorile reale.

# Ridge Regression are cel mai bun R² (0.165), dar valoarea este totuși destul de mică, ceea ce sugerează că modelul este cel mai bun model de referință.
# SGDRegressor are o performanță similară cu Ridge, ceea ce indică faptul că datele pot fi modelate liniar.
# DecisionTreeRegressor și Lasso au R² negative, indicând o performanță slabă.

-------------------------------------------------------------------------------------------

# Overall, Ridge Regression pare să fie cel mai echilibrat model, având cele mai bune scoruri la MSE și R², ceea ce sugerează o performanță stabilă.

#### Experimentation

In [None]:

from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression

# Function to apply various transformations and evaluate models 
def experiment_with_features(model, poly_degree=None, scaling=None, select_k_best=None):
    steps = [('preprocessor', preprocessor)]
    
    # Adding scaling 
    if scaling == 'standard':
        steps.append(('scaler', StandardScaler()))
    elif scaling == 'minmax':
        steps.append(('scaler', MinMaxScaler()))
    
    # Adding polynomial features 
    if poly_degree:
        steps.append(('poly', PolynomialFeatures(degree=poly_degree, include_bias=False)))
    
    # Selecting the most relevant features 
    if select_k_best:
        steps.append(('feature_selection', SelectKBest(score_func=f_regression, k=select_k_best)))
    
    steps.append(('model', model))
    
    # Creating the pipeline
    pipeline = Pipeline(steps=steps)
    
    # Training the model 
    pipeline.fit(X_train, y_train)
    
    # Making predictions 
    y_pred_test = pipeline.predict(X_test)
    
    # Calculating performance metrics 
    mse = mean_squared_error(y_test, y_pred_test)
    mae = mean_absolute_error(y_test, y_pred_test)
    r2 = r2_score(y_test, y_pred_test)
    
    return mse, mae, r2, pipeline

# Defining models for experimentation 
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'SGDRegressor': SGDRegressor(max_iter=1000, tol=1e-3),
    'RandomForestRegressor': RandomForestRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor()
}

# Transformation configurations for experimentation
experiments = [
    {'poly_degree': None, 'scaling': None, 'select_k_best': None},
    {'poly_degree': 2, 'scaling': 'standard', 'select_k_best': None},
    {'poly_degree': None, 'scaling': 'minmax', 'select_k_best': 10},
    {'poly_degree': 2, 'scaling': 'minmax', 'select_k_best': 5},
    {'poly_degree': 3, 'scaling': 'standard', 'select_k_best': None}
]

# Run experiments for each model
experiment_results = {}

for model_name, model in models.items():
    print(f"\nExperimente pentru {model_name}:")
    model_results = []
    
    for i, config in enumerate(experiments):
        mse, mae, r2, _ = experiment_with_features(model, **config)
        print(f"Configurația {i+1}: Poly Degree={config['poly_degree']}, Scaling={config['scaling']}, Select K Best={config['select_k_best']}")
        print(f"MSE: {mse:.4f}, MAE: {mae:.4f}, R²: {r2:.4f}\n")
        
        model_results.append({
            'Config': config,
            'MSE': mse,
            'MAE': mae,
            'R²': r2
        })
    
    experiment_results[model_name] = model_results


Experimente pentru Ridge:
Configurația 1: Poly Degree=None, Scaling=None, Select K Best=None
MSE: 0.8474, MAE: 0.6370, R²: 0.1656

Configurația 2: Poly Degree=2, Scaling=standard, Select K Best=None
MSE: 0.8029, MAE: 0.6198, R²: 0.2094

Configurația 3: Poly Degree=None, Scaling=minmax, Select K Best=10
MSE: 0.8640, MAE: 0.6420, R²: 0.1492

Configurația 4: Poly Degree=2, Scaling=minmax, Select K Best=5
MSE: 0.9301, MAE: 0.6438, R²: 0.0841

Configurația 5: Poly Degree=3, Scaling=standard, Select K Best=None
MSE: 1.0792, MAE: 0.6536, R²: -0.0627


Experimente pentru Lasso:
Configurația 1: Poly Degree=None, Scaling=None, Select K Best=None
MSE: 1.0154, MAE: 0.6197, R²: 0.0001

Configurația 2: Poly Degree=2, Scaling=standard, Select K Best=None
MSE: 1.0156, MAE: 0.6198, R²: -0.0001

Configurația 3: Poly Degree=None, Scaling=minmax, Select K Best=10
MSE: 1.0156, MAE: 0.6198, R²: -0.0001

Configurația 4: Poly Degree=2, Scaling=minmax, Select K Best=5
MSE: 1.0156, MAE: 0.6198, R²: -0.0001

Co



Configurația 1: Poly Degree=None, Scaling=None, Select K Best=None
MSE: 175512719.5449, MAE: 7803.7458, R²: -172829347.6448

Configurația 2: Poly Degree=2, Scaling=standard, Select K Best=None
MSE: 22315842555526564901027840.0000, MAE: 339508515672.1564, R²: -21974661114779189488123904.0000

Configurația 3: Poly Degree=None, Scaling=minmax, Select K Best=10
MSE: 0.8649, MAE: 0.6402, R²: 0.1483

Configurația 4: Poly Degree=2, Scaling=minmax, Select K Best=5
MSE: 0.9305, MAE: 0.6421, R²: 0.0837

Configurația 5: Poly Degree=3, Scaling=standard, Select K Best=None
MSE: 8855567159274145778496345997312.0000, MAE: 139648446602440.7969, R²: -8720176566043416053393857183744.0000


Experimente pentru RandomForestRegressor:
Configurația 1: Poly Degree=None, Scaling=None, Select K Best=None
MSE: 0.9327, MAE: 0.6807, R²: 0.0815

Configurația 2: Poly Degree=2, Scaling=standard, Select K Best=None
MSE: 0.9554, MAE: 0.6896, R²: 0.0592

Configurația 3: Poly Degree=None, Scaling=minmax, Select K Best=10

### Model Evaluation

In [12]:
best_model_name = results_df['MSE'].idxmin()
print(f"Best model: {best_model_name}")

# Evaluate on test set using the best performing model
pipeline_best = pipelines[best_model_name]
y_pred_test = pipeline_best.predict(X_test)

# Calculate metrics on test set
test_mse = mean_squared_error(y_test, y_pred_test)
test_mae = mean_absolute_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)

print(f"Test Set Metrics for {best_model_name}:")
print(f"MSE: {test_mse}, MAE: {test_mae}, R²: {test_r2}")

Best model: Ridge
Test Set Metrics for Ridge:
MSE: 0.8473721051230994, MAE: 0.6370078257812535, R²: 0.16558315905550725


In [None]:
# Chiar dacă Ridge a fost cel mai performant model dintre cele testate, performanța generală nu este foarte bună. Un R² de 0.166 indică faptul că modelul nu explică bine variația din date.
# Comparativ cu alte modele (precum DecisionTreeRegressor sau RandomForestRegressor), modelul Ridge probabil a reușit să evite suprapotrivirea, dar nu a fost suficient pentru a captura complexitatea relațiilor din date.


In [13]:
if best_model_name == 'Ridge' and hasattr(pipeline_best.named_steps['model'], 'feature_importances_'):
    feature_importances = pipeline_best.named_steps['model'].feature_importances_
    
    # Accesăm preprocessor-ul antrenat
    preprocessor_fitted = pipeline_best.named_steps['preprocessor']

    encoded_feature_names = []
    
    # Căutăm OneHotEncoder în preprocessor
    for name, transformer, cols in preprocessor_fitted.transformers_:
        if name == 'cat':  
            if isinstance(transformer, Pipeline):
                for step_name, step_transformer in transformer.named_steps.items():
                    if isinstance(step_transformer, OneHotEncoder):
                        encoded_feature_names = step_transformer.get_feature_names_out(cols)
                        break
            elif isinstance(transformer, OneHotEncoder):  
                encoded_feature_names = transformer.get_feature_names_out(cols)
            break

    # Combinăm numele caracteristicilor numerice și categoriale
    feature_names = list(encoded_feature_names) + numerical_cols

    # Verificăm dacă lungimea caracteristicilor se potrivește cu importanțele
    if len(feature_names) != len(feature_importances):
        raise ValueError(f"Mismatch: {len(feature_names)} feature names vs {len(feature_importances)} importances.")

    # Creăm DataFrame pentru vizualizare
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Plot Feature Importance
    plt.figure(figsize=(10, 6))
    sns.barplot(y=importance_df['Feature'], x=importance_df['Importance'])
    plt.title('Feature Importance')
    plt.show()
else:
    print(f"Feature importance is not available for {best_model_name}.")

Feature importance is not available for Ridge.
