# Machine Learning: Modelo supervisado de regresión

In [None]:
# Importar librerías

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.metrics import mean_squared_error

## Lectura de datos

In [None]:
diamonds_train = pd.read_csv('./data/diamonds_train.csv')
diamonds_train.head()

In [None]:
columnas_reordenadas = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'city', 'price']
diamonds_train = diamonds_train[columnas_reordenadas]
diamonds_train

## Análisis exploratorio

### General

In [None]:
diamonds_train.info()

In [None]:
def cat_var(df, cols):
    '''
    Summarise the distinct values taken by each of the given columns.

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every categorical variable to analyse.
          An empty list is accepted and yields an empty summary.

    Return: a Pandas dataframe object with one row per entry of cols, sorted by
    "number_of_possible_values" in descending order, with the following columns:
        - "categorical_variable" => every categorical variable included as an input parameter (string).
        - "number_of_possible_values" => the amount of unique values that a given categorical variable takes (integer).
        - "values" => the unique values of the variable, as returned by Series.unique().
    '''
    summary_columns = ["categorical_variable", "number_of_possible_values", "values"]
    rows = []
    for col in cols:
        unique_values = df[col].unique()
        rows.append({"categorical_variable": col,
                     "number_of_possible_values": len(unique_values),
                     "values": unique_values})
    # Naming the columns explicitly keeps the frame sortable when cols is empty
    # (a column-less frame would make sort_values raise KeyError).
    summary = pd.DataFrame(rows, columns=summary_columns)
    # A stable sort makes the order of equal counts deterministic.
    summary = summary.sort_values(by="number_of_possible_values", ascending=False, kind="stable")
    return summary.reset_index(drop=True)

In [None]:
col_diamonds_train = list(diamonds_train.columns)
cat_diamonds_train = cat_var(diamonds_train, col_diamonds_train)
cat_diamonds_train

In [None]:
diamonds_train.eq(0).sum()

In [None]:
diamonds_train[(diamonds_train == 0).any(axis=1)]

In [None]:
# Replace the placeholder zeros in the physical dimensions x, y and z with the
# median of the same dimension among diamonds of identical 'carat'.
for dim in ('x', 'y', 'z'):
    is_zero = diamonds_train[dim] == 0

    # Hide the zeros before aggregating so that the values being replaced
    # do not pull the group median down.
    valid_values = diamonds_train[dim].mask(is_zero)
    mediana_por_carat = valid_values.groupby(diamonds_train['carat']).transform('median')

    # Assignment is aligned on the index. A carat group with no non-zero
    # measurement has no usable median and keeps 0, as it did before.
    diamonds_train.loc[is_zero, dim] = mediana_por_carat[is_zero].fillna(0)



In [None]:
diamonds_train.loc[6465]

In [None]:
diamonds_train[(diamonds_train == 0).any(axis=1)]

In [None]:
diamonds_train.eq(0).sum()

In [None]:
# Crear un gráfico de dispersión
plt.figure(figsize=(8, 6))
plt.scatter(diamonds_train['carat'], diamonds_train['price'], alpha=1, color='blue')

# Personalizar el gráfico
plt.title('Gráfico de Dispersión entre Carat y Price')
plt.xlabel('Carat')
plt.ylabel('Price')
plt.grid(True)

# Mostrar el gráfico
plt.show()

In [None]:
# Crear un histplot
sns.histplot(data=diamonds_train, x='carat', bins=20, kde=False, color='blue', edgecolor='black')

# Personalizar el gráfico
plt.title('Histplot de la Variable Numérica')
plt.xlabel('Valor')
plt.ylabel('Frecuencia')

# Mostrar el gráfico
plt.show()


### Variables categóricas

In [None]:
diamonds_train['cut'].value_counts()
# Habría que darles una ordinalidad de mejor a peor: Ideal, Premium, Very Good, Good, Fair (5-1)

In [None]:
diamonds_train['clarity'].value_counts()
# Se podrían agrupar por tipo de claridad

In [None]:
diamonds_train['city'].value_counts()
# Se podría agrupar por continente

In [None]:
diamonds_train['color'].value_counts()

In [None]:
sns.countplot(x='color', data=diamonds_train)
plt.show()

In [None]:
diamonds_train.groupby('color')['price'].mean()

### Variables numéricas

In [None]:
diamonds_train.describe()

In [None]:
diamonds_train_num = diamonds_train.select_dtypes(include=['number'])
diamonds_train_num

In [None]:
# Distribution of the target variable 'price' (density histogram with a KDE curve).
sns.set_style("white")
sns.set_color_codes(palette='deep')
f, ax = plt.subplots(figsize=(8, 7))
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# replacement suggested by seaborn for the same histogram + KDE picture.
sns.histplot(diamonds_train['price'], color="b", kde=True, stat="density", ax=ax)
ax.xaxis.grid(False)
ax.set(ylabel="Frequency")
# The plotted column is 'price'; there is no 'SalePrice' in this dataset.
ax.set(xlabel="Price")
ax.set(title="Price distribution")
sns.despine(trim=True, left=True)
plt.show()

In [None]:
# Configurar el tamaño de la figura
plt.figure(figsize=(3, 5))

# Crear un boxplot para la variable "price"
sns.boxplot(y=diamonds_train['price'])

# Añadir etiquetas y título
plt.xlabel('Price')
plt.title('Boxplot de la Variable "Price"')

# Mostrar el gráfico
plt.show()

In [None]:
# Set the figure size
plt.figure(figsize=(3, 5))

# Boxplot of the "depth" variable.
# The frame is called diamonds_train; 'diamonds_train1' is never defined in this notebook.
sns.boxplot(y=diamonds_train['depth'])

# Add label and title
plt.xlabel('depth')
plt.title('Boxplot de la Variable "depth"')

# Show the plot
plt.show()


In [None]:

# Compute the interquartile range (IQR) of 'depth'
Q1 = diamonds_train['depth'].quantile(0.25)
Q3 = diamonds_train['depth'].quantile(0.75)
IQR = Q3 - Q1

# Only a lower limit is defined for 'depth'. The upper limit was commented out
# in the original cell, so it is not enforced here; referencing it would raise
# NameError or silently reuse a stale value from another cell.
limite_inferior = Q1 - 10 * IQR

# Keep the rows whose depth is not below the lower limit
diamonds_train = diamonds_train[diamonds_train['depth'] >= limite_inferior]

# Display the filtered DataFrame
diamonds_train

In [None]:
# Configurar el tamaño de la figura
plt.figure(figsize=(3, 5))

# Crear un boxplot para la variable "price"
sns.boxplot(y=diamonds_train['table'])

# Añadir etiquetas y título
plt.xlabel('table')
plt.title('Boxplot de la Variable "table"')

# Mostrar el gráfico
plt.show()


In [None]:

# Calcular el rango intercuartílico (IQR)
Q1 = diamonds_train['table'].quantile(0.25)
Q3 = diamonds_train['table'].quantile(0.75)
IQR = Q3 - Q1

# Definir límites para identificar outliers
limite_inferior = Q1 - 2 * IQR
limite_superior = Q3 + 4 * IQR

# Filtrar los datos para eliminar outliers
diamonds_train = diamonds_train[(diamonds_train['table'] >= limite_inferior) & (diamonds_train['table'] <= limite_superior)]

# Visualizar el DataFrame resultante sin outliers
diamonds_train

In [None]:
# Set the figure size
plt.figure(figsize=(3, 5))

# Boxplot of the "carat" variable
sns.boxplot(y=diamonds_train['carat'])

# Add label and title (the label previously said 'table', a copy-paste leftover)
plt.xlabel('carat')
plt.title('Boxplot de la Variable "carat"')

# Show the plot
plt.show()

In [None]:

# Calcular el rango intercuartílico (IQR)
#Q1 = diamonds_train['carat'].quantile(0.25)
#Q3 = diamonds_train['carat'].quantile(0.75)
#IQR = Q3 - Q1

# Definir límites para identificar outliers
#limite_inferior = Q1 - 1.75 * IQR
#limite_superior = Q3 + 1.75 * IQR

# Filtrar los datos para eliminar outliers
#diamonds_train = diamonds_train[(diamonds_train['carat'] <= limite_superior)]

# Visualizar el DataFrame resultante sin outliers
#diamonds_train

In [None]:
# Correlation matrix of the numeric columns.
# It is recomputed from the current diamonds_train because diamonds_train_num
# was built before the zero imputation and the outlier filtering above, so it
# would describe rows and values that are no longer in the data.
correlation_matrix = diamonds_train.select_dtypes(include=['number']).corr()

# Set the figure size
plt.figure(figsize=(8, 6))

# Draw the heatmap with seaborn
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)

# Add the title
plt.title('Matriz de Correlación')

# Show the plot
plt.show()

In [None]:
# x, y, z muy correlacionadas. Habría que eliminar dos variables o hacer una media de las 3.
# carat también muy correlacionada. Se puede eliminar o quizás habría que mantenerla porque está muy correlacionada con price
# Eliminar la variable depth porque apenas tiene correlacion con la variable target y hay muchos outliers (ruido)

## Transformación de variables

### Eliminación variables

In [None]:
# Probamos con hacer una media entre x, y, z:
#diamonds_train['mean'] = diamonds_train['x'] * diamonds_train['y'] * diamonds_train['z']
#diamonds_train['mean'] = diamonds_train[['x','y','z']].mean(axis=1)
#diamonds_train['mean']

In [None]:
#diamonds_train['mean_carat'] = diamonds_train['mean'] * diamonds_train['carat']

In [None]:
diamonds_train_num = diamonds_train.select_dtypes(include=['number'])
diamonds_train_num

In [None]:
# Calcular la matriz de correlación
correlation_matrix = diamonds_train_num.corr()

# Configurar el tamaño de la figura
plt.figure(figsize=(8, 6))

# Crear un mapa de calor (heatmap) con seaborn
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)

# Añadir título
plt.title('Matriz de Correlación')

# Mostrar el gráfico
plt.show()

In [None]:
diamonds_train.describe()

In [None]:
# Drop the 'city' column only; x, y, z and depth are NOT removed here.
diamonds_train = diamonds_train.drop(columns=['city'])
diamonds_train

In [None]:
# Label encoder
def ordinal_encoding(x, mapping=None):
    """Return the ordinal code assigned to the category label x.

    Args:
        x: category label to encode (for example a value of the 'cut' column).
        mapping: optional dict of label -> code. When omitted, the module-level
            `encoding` dict as bound at call time is used, so existing calls
            such as `series.apply(ordinal_encoding)` behave as before.

    Returns:
        The code stored for x, or None when x has no entry in the mapping.
    """
    if mapping is None:
        # Backward-compatible default: read the global set by the calling cell.
        mapping = encoding
    return mapping.get(x)

In [None]:
# Manual ordinal encoding of the three categorical columns.
# ordinal_encoding reads the global `encoding` dict, so the dict must be
# rebound immediately before each apply call.

# 'clarity': IF -> 8 down to I1 -> 1
encoding = {'IF': 8, 'VVS1':7, 'VVS2':6, 'VS1':5, 'VS2':4, 'SI1':3, 'SI2':2, 'I1':1}
diamonds_train['clarity'] = diamonds_train['clarity'].apply(ordinal_encoding)
# 'cut': Ideal -> 5, Premium -> 4, Very Good -> 3, Good -> 2, Fair -> 1
encoding = {'Premium': 4, 'Ideal':5, 'Very Good':3, 'Good':2, 'Fair':1}
diamonds_train['cut'] = diamonds_train['cut'].apply(ordinal_encoding)
# 'color': D -> 7 down to J -> 1
encoding = {'D': 7, 'E':6, 'F':5, 'G':4, 'H':3, 'I':2, 'J':1}
diamonds_train['color'] = diamonds_train['color'].apply(ordinal_encoding)

In [None]:

# Inicializar el LabelEncoder
#label_encoder = LabelEncoder()

# Ajustar y transformar las etiquetas categóricas

#diamonds_train['color'] = label_encoder.fit_transform(diamonds_train['color'])


In [None]:
# Calcular la matriz de correlación
correlation_matrix = diamonds_train.corr()

# Configurar el tamaño de la figura
plt.figure(figsize=(8, 6))

# Crear un mapa de calor (heatmap) con seaborn
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)

# Añadir título
plt.title('Matriz de Correlación')

# Mostrar el gráfico
plt.show()

In [None]:
# Nos quedamos con todo menos el target
diamonds_train_x = diamonds_train.drop(columns=['price'])

In [None]:
diamonds_train_y = diamonds_train['price']
diamonds_train_y

### Escalado

In [None]:
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(diamonds_train_x)
diamonds_train_x_scaled = pd.DataFrame(scaled_data)
diamonds_train_x_scaled

## Modelado

### Random Forest

In [None]:
%%time

model_random = RandomForestRegressor(random_state = 42)


scores = cross_val_score(model_random, 
                         diamonds_train_x, 
                         diamonds_train_y, 
                         scoring='neg_root_mean_squared_error', 
                         cv=5,
                         n_jobs=-1)

print(scores, '\n')
print(np.mean(-scores), '\n')

### XGBRegressor

In [None]:
model_xgb = XGBRegressor(colsample_bytree = 0.95,
                          gamma = 0.14,
                          learning_rate = 0.012,
                          max_depth = 7,
                          missing = np.inf,
                          n_estimators = 1130,
                          subsample = 0.8, 
                         objective='reg:squarederror', random_state=42)

### RMSE - Cross Validation

In [None]:
%%time


# 5-fold cross-validated RMSE of the XGBoost model configured above.
# This section previously re-scored model_random, so model_xgb was never evaluated.
scores = cross_val_score(model_xgb, 
                         diamonds_train_x, 
                         diamonds_train_y, 
                         scoring='neg_root_mean_squared_error', 
                         cv=5,
                         n_jobs=-1)

# The scorer returns negated RMSE values; negate them to report the RMSE itself.
print(scores, '\n')
print(np.mean(-scores), '\n')

## Test

In [None]:
diamonds_test_x = pd.read_csv('./data/diamonds_test.csv')
diamonds_test_x.head()

In [None]:
# Set the figure size
plt.figure(figsize=(3, 5))

# Boxplot of the "x" dimension of the test set
sns.boxplot(y=diamonds_test_x['x'])

# Add label and title. They now name the plotted column 'x'; before they said 'carat'.
plt.xlabel('x')
plt.title('Boxplot de la Variable "x"')

# Show the plot
plt.show()

In [None]:
diamonds_test_x.eq(0).sum()

In [None]:
# Apply to the test set the same column drop and ordinal encodings used for training.
# Disabled experiment: mean of x, y, z as a single feature.
#diamonds_test_x['mean'] = diamonds_test_x[['x','y','z']].mean(axis=1)
# Drop the 'id' and 'city' columns (x, y and z are kept)
diamonds_test_x = diamonds_test_x.drop(columns=['id','city'])
# Ordinal encoding of 'clarity': IF -> 8 down to I1 -> 1
# (ordinal_encoding reads the global `encoding`, rebound before each apply)
encoding = {'IF': 8, 'VVS1':7, 'VVS2':6, 'VS1':5, 'VS2':4, 'SI1':3, 'SI2':2, 'I1':1}
diamonds_test_x['clarity'] = diamonds_test_x['clarity'].apply(ordinal_encoding)
# Ordinal encoding of 'cut': Ideal -> 5, Premium -> 4, Very Good -> 3, Good -> 2, Fair -> 1
encoding = {'Premium': 4, 'Ideal':5, 'Very Good':3, 'Good':2, 'Fair':1}
diamonds_test_x['cut'] = diamonds_test_x['cut'].apply(ordinal_encoding)
# Ordinal encoding of 'color': D -> 7 down to J -> 1
encoding = {'D': 7, 'E':6, 'F':5, 'G':4, 'H':3, 'I':2, 'J':1}
diamonds_test_x['color'] = diamonds_test_x['color'].apply(ordinal_encoding)
diamonds_test_x

In [None]:
# Replace the placeholder zeros in the physical dimensions x, y and z of the
# test set with the median of the same dimension among test diamonds of
# identical 'carat'.
# All three medians are taken from diamonds_test_x. The z median used to be
# computed on diamonds_train, whose index does not correspond to the test
# rows, so the index-aligned assignment filled in unrelated or missing values.
for dim in ('x', 'y', 'z'):
    is_zero = diamonds_test_x[dim] == 0

    # Hide the zeros before aggregating so that the values being replaced
    # do not pull the group median down.
    valid_values = diamonds_test_x[dim].mask(is_zero)
    mediana_por_carat = valid_values.groupby(diamonds_test_x['carat']).transform('median')

    # A carat group with no non-zero measurement has no usable median and keeps 0.
    diamonds_test_x.loc[is_zero, dim] = mediana_por_carat[is_zero].fillna(0)


In [None]:
# Calcular la matriz de correlación
correlation_matrix = diamonds_test_x.corr()

# Configurar el tamaño de la figura
plt.figure(figsize=(8, 6))

# Crear un mapa de calor (heatmap) con seaborn
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)

# Añadir título
plt.title('Matriz de Correlación')

# Mostrar el gráfico
plt.show()

In [None]:
# Scaling
# Reuse the MinMaxScaler already fitted on the training features instead of
# fitting a new one on the test set: a model trained on the scaled training
# data needs test values mapped with the same per-column min/max.
# Columns are selected in the training order the scaler was fitted with.
scaled_data = scaler.transform(diamonds_test_x[diamonds_train_x.columns])
diamonds_test_x_scaled = pd.DataFrame(scaled_data)
diamonds_test_x_scaled

## Submission

In [None]:
model_random.fit(diamonds_train_x, diamonds_train_y)
y_pred = model_random.predict(diamonds_test_x)
y_pred

In [None]:
diamonds_test_x

In [None]:
diamonds_train_x

In [None]:
diamonds_train_y

In [None]:
y_pred_df = pd.DataFrame(y_pred)
y_pred_df.reset_index(inplace=True)
y_pred_df.columns = ['id', 'price']
y_pred_df.to_csv('./data/submisions1.csv', index=False)

## Grid Search

In [None]:
# Random Forest Regressor

param_grid = {'n_estimators': [100, 200, 300],  # Number of trees in the forest.
              'max_depth': [None, 3, 10],  # Maximum depth of the trees.
              'min_samples_split': [2, 10],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [1, 4],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None, 'sqrt', 'log2']  # Number of features to consider when looking for the best split.
              }

In [None]:
# Exhaustive search over param_grid for the random forest.
# The estimator is model_random: no variable named 'model' exists in this
# notebook, and the grid contains RandomForestRegressor parameters.
grid_search = GridSearchCV(model_random,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)

In [None]:
%%time

grid_search.fit(diamonds_train_x_scaled,diamonds_train_y)

print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')

In [None]:
# Predict with the best estimator found by the search (GridSearchCV refits it
# on the whole training set by default). 'model' is not defined in this notebook.
y_pred = grid_search.predict(diamonds_test_x_scaled)

In [None]:
# Random Forest Regressor

param_grid = {'n_estimators': [500,700, 900],  # Number of trees in the forest.
              'max_depth': [None],  # Maximum depth of the trees.
              'min_samples_split': [5, 6, 8],  # Minimum number of samples required to split an internal node.
              'min_samples_leaf': [1],  # Minimum number of samples required to be at a leaf node.
              'max_features': [None]  # Number of features to consider when looking for the best split.
             }

In [None]:
%%time

# Build a new search for the refined param_grid defined in the previous cell.
# The existing grid_search object still holds the first grid, so calling fit
# on it again would simply repeat the first search.
grid_search = GridSearchCV(model_random,
                           param_grid,
                           cv=5,
                           verbose=3,
                           scoring='neg_root_mean_squared_error',
                           n_jobs=-1)

grid_search.fit(diamonds_train_x_scaled,diamonds_train_y)

# best_score_ is a negated RMSE; negate it to report the RMSE itself.
print('\n')
print('Best hyperparameters: ', grid_search.best_params_, '\n')
print('Best score: ', -grid_search.best_score_, '\n')