In [2]:
# Locations of the zipped CSV splits, relative to the notebook's working directory.
PATH_INPUT_TRAIN, PATH_INPUT_TEST = (
    "../files/input/train_data.csv.zip",
    "../files/input/test_data.csv.zip",
)


In [None]:
import pandas as pd

# Read the zipped training CSV; index_col=False tells pandas not to use any
# column of the file as the DataFrame index.
df_train = pd.read_csv(PATH_INPUT_TRAIN, compression="zip", index_col=False)



Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Driven_kms,Fuel_Type,Selling_type,Transmission,Owner
0,jazz,2016,7.4,8.5,15059,Petrol,Dealer,Automatic,0
1,i10,2013,4.0,4.6,30000,Petrol,Dealer,Manual,0
2,TVS Apache RTR 180,2011,0.5,0.826,6000,Petrol,Individual,Manual,0
3,eon,2016,3.15,4.43,15000,Petrol,Dealer,Manual,0
4,Royal Enfield Thunder 350,2013,1.25,1.5,15000,Petrol,Individual,Manual,0


In [15]:

def limpiar_dataset(df, reference_year=2021):
    """Return a cleaned copy of the raw vehicle DataFrame.

    The cleaning adds an ``Age`` column computed as ``reference_year - Year``
    and then removes the ``Year`` and ``Car_Name`` columns (``Car_Name`` is
    skipped silently when absent).

    The input DataFrame is left untouched: the new column is attached with
    ``DataFrame.assign``, which works on a copy, so calling this function
    repeatedly on the same raw frame always yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain a ``Year`` column.
    reference_year : int, default 2021
        Year against which the age is measured.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Age`` added and ``Year`` / ``Car_Name`` removed.

    Raises
    ------
    KeyError
        If ``df`` has no ``Year`` column.
    """
    # assign() returns a new frame, so the caller's DataFrame never gains "Age".
    with_age = df.assign(Age=reference_year - df["Year"])
    return with_age.drop(columns=["Year", "Car_Name"], errors="ignore")

# Load the train and test splits with identical reader options
# (zip-compressed CSV, no column used as the index).
df_train, df_test = (
    pd.read_csv(csv_path, index_col=False, compression="zip")
    for csv_path in (PATH_INPUT_TRAIN, PATH_INPUT_TEST)
)

# Apply the cleaning step to each split, train first and then test.
df_train_clean, df_test_clean = map(limpiar_dataset, (df_train, df_test))

In [16]:

# "Present_Price" is the regression target; every other column is a feature.
y_train = df_train_clean["Present_Price"]
X_train = df_train_clean.drop(columns=["Present_Price"])

y_test = df_test_clean["Present_Price"]
X_test = df_test_clean.drop(columns=["Present_Price"])


In [17]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.linear_model import LinearRegression

# Column groups: the categorical columns are listed explicitly and every other
# column of X_train is treated as numerical (original column order is kept).
categorical_features = ["Fuel_Type", "Selling_type", "Transmission"]
numerical_features = [
    name for name in X_train.columns if name not in categorical_features
]

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and rescale the numerical ones with MinMaxScaler.
# Any column in neither list is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", MinMaxScaler(), numerical_features),
    ],
    remainder="passthrough",
)

# Full model: preprocessing -> univariate feature selection -> linear regression.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("selectk", SelectKBest(f_regression)),
        ("LinearRegression", LinearRegression()),
    ]
)

In [18]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter candidates, addressed as <pipeline step>__<parameter>.
# NOTE(review): confirm that k=15 and k=20 do not exceed the number of columns
# produced by the preprocessor; if they do, those candidates presumably all
# amount to "keep every feature" (or are rejected on older scikit-learn) - verify.
param_grid = {
    "selectk__k": [5, 10, 15, 20],
    "LinearRegression__fit_intercept": [True, False],
}

# Exhaustive search with 10-fold cross-validation, scored by negative mean
# absolute error and run on all available cores.
grid = GridSearchCV(
    pipeline,
    param_grid,
    scoring="neg_mean_absolute_error",
    cv=10,
    n_jobs=-1,
)

grid.fit(X_train, y_train)



0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'LinearRegression__fit_intercept': [True, False], 'selectk__k': [5, 10, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,10
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False

0,1,2
,score_func,<function f_r...0024CB3020720>
,k,15

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [19]:
import gzip
import os
import pickle

# Persist the fitted grid-search object as a gzip-compressed pickle,
# creating the target directory first when it is missing.
os.makedirs("../files/models", exist_ok=True)
with gzip.open("../files/models/model.pkl.gz", "wb") as model_file:
    pickle.dump(grid, model_file)

In [21]:
from sklearn.metrics import (
    r2_score,
    mean_squared_error,
    median_absolute_error
)

# Predict on the train and test splits with the fitted grid search.
y_pred_train = grid.predict(X_train)
y_pred_test = grid.predict(X_test)


def _build_metrics_record(dataset, y_true, y_pred):
    """Return the metrics dictionary for one dataset split.

    Keys, in order: "type", "dataset", "r2" (R^2 score), "mse" (mean squared
    error) and "mad" (median absolute error).
    """
    return {
        "type": "metrics",
        "dataset": dataset,
        "r2": r2_score(y_true, y_pred),
        "mse": mean_squared_error(y_true, y_pred),
        "mad": median_absolute_error(y_true, y_pred),
    }


# One record per split, train first.
metrics = [
    _build_metrics_record("train", y_train, y_pred_train),
    _build_metrics_record("test", y_test, y_pred_test),
]

In [22]:
import json

# Create the output directory if it does not exist yet.
os.makedirs("../files/output", exist_ok=True)

# Write the metrics as JSON Lines: one JSON object per line, in list order.
with open("../files/output/metrics.json", "w") as metrics_file:
    metrics_file.writelines(json.dumps(record) + "\n" for record in metrics)