In [10]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

from itertools import product

from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

from sklearn.preprocessing import OneHotEncoder

from category_encoders import TargetEncoder

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


import warnings
warnings.filterwarnings('ignore')

In [2]:
def normalize_scaler(data):
    data_copy = data.copy()
    for col in data.columns:
        mean_data = data_copy[col].mean()
        range_data = data_copy[col].max()-data_copy[col].min()
        data_copy[col] = data_copy[col].apply(lambda x: (x-mean_data)/range_data)
    return data_copy

In [6]:
def percent_outs(array):
    length = len(array)
    neg_count = sum(array==-1)
    p_outs = neg_count/length*100
    return p_outs

In [12]:
def metricas(y_train, y_train_pred, y_test, y_test_pred):
    metricas = {
    'train': {
        'r2_score': r2_score(y_train, y_train_pred),
        'MAE': mean_absolute_error(y_train, y_train_pred),
        'MSE': mean_squared_error(y_train, y_train_pred),
        'RMSE': np.sqrt(mean_squared_error(y_train, y_train_pred))
    },
    'test': {
        'r2_score': r2_score(y_test, y_test_pred),
        'MAE': mean_absolute_error(y_test, y_test_pred),
        'MSE': mean_squared_error(y_test, y_test_pred),
        'RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred))

    }
}
    return metricas

In [11]:
params_tree = {
    'max_depth': [6, 8, 10, 20],
    'min_samples_split': [10, 50],
    'min_samples_leaf': [10, 50],
    'max_leaf_nodes': [10, 20, 40, 80, 160]
}

In [None]:
def get_ml_model(df, params_tree, scaler_type = "rob", outlier_threshold = 70, outlier_imputer = RandomForestRegressor()):
    df["postalCode"] = df["postalCode"].astype("string")
    df_numeric = df.select_dtypes("number").drop(columns="price")
    
    if scaler_type == "rob":
        scaler = RobustScaler()
    elif scaler_type == "minmax":
        scaler = MinMaxScaler()
    elif scaler_type == "stand":
        scaler = StandardScaler()
    elif scaler_type == "norm":
        numeric_scaled = normalize_scaler(df_numeric)

    numeric_scaled = pd.DataFrame(scaler.fit_transform(df_numeric), columns=df_numeric.columns)
    
    df[df_numeric.columns] = numeric_scaled
    df_numeric = df.select_dtypes('number')
    ests = np.linspace(1,1000, 5, dtype = int)
    conts = np.linspace(0.01,0.2,5)

    forest_arg_combis = list(product(ests, conts))
    print("Applying IFO")
    for n,m in tqdm(forest_arg_combis):
        iforest = IsolationForest(random_state=42, n_estimators=n, contamination=m, n_jobs=12)
        df[f"iforest_{n}_{m:.3f}"] = iforest.fit_predict(X=df_numeric)
    df_forest = df.filter(like="iforest")
    
    percentages = df_forest.apply(percent_outs, axis=1)
    df_filtered_1 = df[percentages<outlier_threshold]

    df_filtered_1 = df_filtered_1.drop(columns = df_filtered_1.filter(like="iforest").columns)

    df_numeric_filtered_1 = df_filtered_1.select_dtypes("number")
    
    neighs = np.linspace(15,45,5, dtype=int)
    lof_combis = list(product(neighs, conts))
    print("Applying LOF")
    for neighbour, contaminacion in tqdm(lof_combis):
        lof = LocalOutlierFactor(n_neighbors=neighbour, contamination=contaminacion, n_jobs=-1)
        df_filtered_1[f"lof_{neighbour}_{contaminacion:.3f}"] = lof.fit_predict(X = df_numeric_filtered_1)

    df_lof = df_filtered_1.filter(like="lof")

    percentages_filter_1 = df_lof.apply(percent_outs, axis=1)
    outliers = df_filtered_1[percentages_filter_1>outlier_threshold]
    normals = df_filtered_1[percentages_filter_1<outlier_threshold]

    df_filtered_1.loc[outliers.index, "powerCV"] = np.nan
    df_filtered_1.loc[outliers.index, "kilometer"] = np.nan

    df_filtered_1.reset_index(drop=True, inplace=True)
    imputer = IterativeImputer(estimator=outlier_imputer)
    imputed_cols = pd.DataFrame(imputer.fit_transform(X=df_filtered_1[["price", "powerCV", "kilometer"]]), columns = ["price", "powerCV", "kilometer"])

    df_final = df_filtered_1.drop(columns = df_filtered_1.filter(like="lof").columns)
    df_final[["price", "powerCV", "kilometer"]] = imputed_cols

    df_final.drop(columns=['name', 'model', 'postalCode'], inplace=True)

    df = df_final

    df["dateCreated"] = pd.to_datetime(df["dateCreated"]).apply(lambda x: x.strftime("%Y-%m"))
    df["dateCrawled"] = pd.to_datetime(df["dateCrawled"]).apply(lambda x: x.strftime("%Y-%m"))
    df["lastSeen"] = pd.to_datetime(df["lastSeen"]).apply(lambda x: x.strftime("%Y-%m"))

    print("Encoding...")
    onehot = OneHotEncoder()
    trans_one_hot = onehot.fit_transform(df[["abtest"]])
    oh_df = pd.DataFrame(trans_one_hot.toarray(), columns=onehot.get_feature_names_out())

    df = pd.concat([df.reset_index(drop=True), oh_df.reset_index(drop=True)], axis=1)

    df.drop(columns=["seller", "offerType", "abtest"], inplace=True)
    df.reset_index(drop=True, inplace=True)
    encoder = TargetEncoder(cols = df.select_dtypes("O").columns)
    df_encoded = encoder.fit_transform(X = df, y = df["price"])
    df = df_encoded
    print("Creating Model..")
    X = df.drop(columns= ["price"])
    y = df["price"]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print(X_train.dtypes)

    decision_tree = DecisionTreeRegressor()
    grid_search = GridSearchCV(estimator=decision_tree, param_grid=params_tree, cv = 5, scoring="neg_mean_squared_error")
    grid_search.fit(X_train, y_train)
    best_tree = grid_search.best_estimator_

    y_train_pred = best_tree.predict(X = X_train)
    y_test_pred = best_tree.predict(X = X_test)

    metrics = pd.DataFrame(metricas(y_train, y_train_pred, y_test, y_test_pred)).T

    display(metrics)

    plt.figure(dpi = 140, figsize = (6,4))
    sns.scatterplot(x = y_test, y = abs(y_test_pred-y_test))
    plt.ylabel("Error absoluto")
    plt.xlabel("Precio real")
    plt.show()
    return best_tree

In [22]:
df = pd.read_csv("datos/df_final.csv")

In [None]:
get_ml_model(df, params_tree=params_tree, scaler_type="rob", outlier_threshold=60)

Applying IFO


100%|██████████| 25/25 [07:40<00:00, 18.40s/it]


Applying LOF


100%|██████████| 25/25 [02:11<00:00,  5.24s/it]


Encoding...
Creating Model..
Unnamed: 0             float64
dateCrawled            float64
vehicleType            float64
yearOfRegistration     float64
gearbox                float64
powerCV                float64
kilometer              float64
monthOfRegistration    float64
fuelType               float64
brand                  float64
notRepairedDamage      float64
dateCreated            float64
lastSeen               float64
lof_15_0.010             int32
lof_15_0.058             int32
lof_15_0.105             int32
lof_15_0.153             int32
lof_15_0.200             int32
lof_22_0.010             int32
lof_22_0.058             int32
lof_22_0.105             int32
lof_22_0.153             int32
lof_22_0.200             int32
lof_30_0.010             int32
lof_30_0.058             int32
lof_30_0.105             int32
lof_30_0.153             int32
lof_30_0.200             int32
lof_37_0.010             int32
lof_37_0.058             int32
lof_37_0.105             int32
lof_37_0.1