In [20]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import r2_score
import pickle

df= pd.read_csv('Datos/Limpios/df_valoracion.csv')
#elimino la columna "Precio/Venta"
df.drop(columns=['Precio/Venta'], inplace=True) 
df.drop(columns=['PER'], inplace=True) 

In [21]:
lista_variables= ['growth_stage', 'startup', 'b2b_b2c', 'Nombre_sabi']
df = df.drop(lista_variables, axis=1)
X_train, X_test, y_train, y_test = train_test_split(df.drop('valuation_2022', axis=1), 
                                                    df['valuation_2022'], 
                                                    test_size=0.2, 
                                                    random_state=0)

--------------------------------

In [22]:
y_train_log = np.log(y_train)


models = {'Linear Regression': LinearRegression(), 
          'Decision Tree': DecisionTreeRegressor(random_state=42), 
          'Random Forest': RandomForestRegressor(random_state=42)}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='r2')
    print(f"{name} R2 score: {scores.mean():.2f} (+/- {scores.std():.2f})")
    print(f"{name} MAE score: {-cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error').mean():.2f} (+/- {-cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error').std():.2f})")


results = []

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='r2')
    mae_scores = -cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error')
    result = {'model': name,
              'R2 score': scores.mean().round(2),
              'MAE score': -cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error').mean().round(2)
              }
    results.append(result)    

Linear Regression R2 score: -130.30 (+/- 259.35)
Linear Regression MAE score: 2.67 (+/- -2.97)
Decision Tree R2 score: 0.55 (+/- 0.32)
Decision Tree MAE score: 0.47 (+/- -0.19)
Random Forest R2 score: 0.72 (+/- 0.11)
Random Forest MAE score: 0.47 (+/- -0.07)


In [23]:
results

[{'model': 'Linear Regression', 'R2 score': -130.3, 'MAE score': 2.67},
 {'model': 'Decision Tree', 'R2 score': 0.55, 'MAE score': 0.47},
 {'model': 'Random Forest', 'R2 score': 0.72, 'MAE score': 0.47}]

In [24]:
linear_regression_mae = results[0]['MAE score']
linear_regression_r2 = results[0]['R2 score']
decision_tree_mae = results[1]['MAE score']
decision_tree_r2 = results[1]['R2 score']
random_forest_mae = results[2]['MAE score']
random_forest_r2 = results[2]['R2 score']

In [25]:
# stacking 
level0 = [
    ('knn', KNeighborsRegressor()),
    ('cart', DecisionTreeRegressor()),
    ('svm', SVR()),
    ('lr', LinearRegression())
]

# Define the level 1 model with a transformed target regressor
level1 = TweedieRegressor(max_iter=10000, alpha=0.5, link='log', power=0)

# Define the stacked model
stacked_model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

# Fit the model using cross-validation
scores = -cross_val_score(stacked_model, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(stacked_model, X_train, y_train_log, cv=5, scoring='r2')
avg_mae2 = scores.mean()
avg_r22 = scores2.mean()

# Fit the model on all the training data
stacked_model.fit(X_train, y_train_log)

# Make predictions on the test data and transform the predictions to the original scale
y_pred = np.exp(stacked_model.predict(X_test))

# Evaluate the model on the test data
mae2 = mean_absolute_error(y_test, y_pred)
r22 = r2_score(y_test, y_pred)

# Print the results
print(f"Cross-validation MAE: {avg_mae2:.2f}")
print(f"Cross-validation R^2: {avg_r22:.2f}")
print(f"MAE: {mae2:.2f}")
print(f"R^2: {r22:.2f}")

Cross-validation MAE: 0.97
Cross-validation R^2: 0.25
MAE: 11.79
R^2: 0.24


In [26]:
#Bagging Regressor
br = BaggingRegressor(estimator=RandomForestRegressor(), n_estimators=100, random_state=0)

scores = cross_val_score(br, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(br, X_train, y_train_log, cv=5, scoring='r2')
avg_mae7 = -scores.mean()
avg_r27 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae7 = mean_absolute_error(y_test, y_pred)
r27 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae7}")
print(f'R2: {avg_r27}')
print(f"MAE: {mae7}")
print(f"R^2: {r27}")

Cross-validation MAE: 0.6010124823308549
R2: 0.6862929222943032
MAE: 11.757029662334398
R^2: 0.16755419379906378


In [27]:
# adaboost
ab = AdaBoostRegressor(random_state=0)
scores = cross_val_score(ab, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(ab, X_train, y_train_log, cv=5, scoring='r2')
avg_mae9 = -scores.mean()
avg_r29 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae9 = mean_absolute_error(y_test, y_pred)
r29 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae9}")
print(f'R2: {avg_r29}')
print(f"MAE: {mae9}")
print(f"R^2: {r29}")

Cross-validation MAE: 0.5383800327044871
R2: 0.6830428282035241
MAE: 11.757029662334398
R^2: 0.16755419379906378


In [28]:
# gradient boosting
gb = GradientBoostingRegressor(random_state=0)
scores = cross_val_score(gb, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2= cross_val_score(gb, X_train, y_train_log, cv=5, scoring='r2')
avg_mae10 = -scores.mean()
avg_r210 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae10 = mean_absolute_error(y_test, y_pred)
r210 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae10}")
print(f'R2: {avg_r210}')
print(f"MAE: {mae10}")
print(f"R^2: {r210}")

Cross-validation MAE: 0.4440514699778074
R2: 0.6173879568635016
MAE: 11.757029662334398
R^2: 0.16755419379906378


In [29]:
# xgboost
xgb = XGBRegressor(random_state=0)
scores = cross_val_score(xgb, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(xgb, X_train, y_train_log, cv=5, scoring='r2')
avg_mae11 = -scores.mean()
avg_r211 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae11 = mean_absolute_error(y_test, y_pred)
r211 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae11}")
print(f'R2: {avg_r211}')
print(f"MAE: {mae11}")
print(f"R^2: {r211}")

Cross-validation MAE: 0.4585651311236362
R2: 0.6241603927878009
MAE: 11.757029662334398
R^2: 0.16755419379906378


In [30]:
# catboost
cb = CatBoostRegressor(random_state=0)
scores = cross_val_score(cb, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(cb, X_train, y_train_log, cv=5, scoring='r2')
avg_mae12 = -scores.mean()
avg_r212 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae12 = mean_absolute_error(y_test, y_pred)
r212 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae12}")
print(f'R2: {avg_r212}')
print(f"MAE: {mae12}")
print(f"R^2: {r212}")

Learning rate set to 0.027076
0:	learn: 1.6221013	total: 1.66ms	remaining: 1.66s
1:	learn: 1.5970072	total: 3.48ms	remaining: 1.74s
2:	learn: 1.5738881	total: 4.71ms	remaining: 1.56s
3:	learn: 1.5483577	total: 6.55ms	remaining: 1.63s
4:	learn: 1.5286905	total: 8.47ms	remaining: 1.68s
5:	learn: 1.5051657	total: 9.78ms	remaining: 1.62s
6:	learn: 1.4818450	total: 11.2ms	remaining: 1.58s
7:	learn: 1.4633487	total: 12.3ms	remaining: 1.53s
8:	learn: 1.4429436	total: 13.5ms	remaining: 1.48s
9:	learn: 1.4253598	total: 14.7ms	remaining: 1.45s
10:	learn: 1.4084462	total: 15.9ms	remaining: 1.43s
11:	learn: 1.3886445	total: 17ms	remaining: 1.4s
12:	learn: 1.3668095	total: 18.2ms	remaining: 1.38s
13:	learn: 1.3481332	total: 19.4ms	remaining: 1.37s
14:	learn: 1.3313849	total: 20.6ms	remaining: 1.35s
15:	learn: 1.3129416	total: 21.6ms	remaining: 1.33s
16:	learn: 1.2960785	total: 22.7ms	remaining: 1.31s
17:	learn: 1.2798019	total: 24.1ms	remaining: 1.31s
18:	learn: 1.2638940	total: 25.2ms	remaining: 1

In [31]:
import pickle

In [35]:
# se crean modelos con todos los datos
x= df.drop(columns=['valuation_2022'])
y= df['valuation_2022']

br = BaggingRegressor(estimator=RandomForestRegressor(), n_estimators=100, random_state=0)
br.fit(x,y)

ab = AdaBoostRegressor(random_state=0)
ab.fit(x,y)

rf=RandomForestRegressor(random_state=42)
rf.fit(x,y)

xgb = XGBRegressor(random_state=0)
xgb.fit(x,y)

cb = CatBoostRegressor(random_state=0)
cb.fit(x,y)

Learning rate set to 0.029132
0:	learn: 23.0609722	total: 3.52ms	remaining: 3.51s
1:	learn: 22.8341994	total: 5.58ms	remaining: 2.78s
2:	learn: 22.6972352	total: 7.67ms	remaining: 2.55s
3:	learn: 22.5101439	total: 9.58ms	remaining: 2.38s
4:	learn: 22.3357002	total: 11.7ms	remaining: 2.33s
5:	learn: 22.1632945	total: 14.7ms	remaining: 2.44s
6:	learn: 21.9960033	total: 17.7ms	remaining: 2.51s
7:	learn: 21.8147942	total: 19.4ms	remaining: 2.4s
8:	learn: 21.6037733	total: 21.4ms	remaining: 2.35s
9:	learn: 21.5155031	total: 23.2ms	remaining: 2.3s
10:	learn: 21.2992398	total: 25.2ms	remaining: 2.26s
11:	learn: 21.0912897	total: 27ms	remaining: 2.22s
12:	learn: 20.9410612	total: 28.8ms	remaining: 2.18s
13:	learn: 20.7534862	total: 31.3ms	remaining: 2.2s
14:	learn: 20.6034460	total: 32.8ms	remaining: 2.16s
15:	learn: 20.5127563	total: 34.3ms	remaining: 2.11s
16:	learn: 20.4308443	total: 35.8ms	remaining: 2.07s
17:	learn: 20.2791445	total: 37.1ms	remaining: 2.02s
18:	learn: 20.1228540	total: 38

<catboost.core.CatBoostRegressor at 0x2b07a4c1640>

In [36]:
# se guardan los modelos que mejor resultado han dado
# se crea carpeta de modelos si no existe
if not os.path.exists('modelos'):
    os.makedirs('modelos')

# dentro de esa carpeta se crea otra carpeta para los modelos de regresion
if not os.path.exists('modelos/regresion'):
    os.makedirs('modelos/regresion')

# se guarda el modelo de bagging  en la carpeta de modelos
pickle.dump(br, open('modelos/regresion/br_model.pkl', 'wb'))

# se guarda el modelo de adaboost  en la carpeta de modelos
pickle.dump(ab, open('modelos/regresion/ad_model.pkl', 'wb'))

# se guarda el modelo de gradientboost  en la carpeta de modelos
pickle.dump(rf, open('modelos/regresion/rf_model.pkl', 'wb'))

# se guarda el modelo de xgboost  en la carpeta de modelos
pickle.dump(xgb, open('modelos/regresion/xgb_model.pkl', 'wb'))

# se guarda el modelo de catboost  en la carpeta de modelos
pickle.dump(cb, open('modelos/regresion/cb_model.pkl', 'wb'))

**TABLA DE RESULTADOS**

In [34]:
models = pd.DataFrame([
    {'Model': 'Linear Regression', 'MAE': linear_regression_mae, 'R2 score': linear_regression_r2},
    {'Model': 'Decision Tree Regressor', 'MAE': decision_tree_mae, 'R2 score': decision_tree_r2},
    {'Model': 'Random Forest Regressor', 'MAE': random_forest_mae, 'R2 score': random_forest_r2},
    {'Model': 'Stacking Regressor', 'MAE': avg_mae2, 'R2 score': avg_r22},
    {'Model': 'Bagging Regressor', 'MAE': avg_mae7, 'R2 score': avg_r27},
    {'Model': 'AdaBoost Regressor', 'MAE': avg_mae9, 'R2 score': avg_r29},
    {'Model': 'GradientBoost Regressor', 'MAE': avg_mae10, 'R2 score': avg_r210},
    {'Model': 'XGBoost', 'MAE': avg_mae11, 'R2 score': avg_r211},
    {'Model': 'CatBoost', 'MAE': avg_mae12, 'R2 score': avg_r212}
    # {'Model': 'AutoML', 'MAE': avg_mae17, 'Score': avg_r217}
])

models = models.sort_values(by='R2 score', ascending=False)
models = models.reindex(columns=['Model', 'R2 score', 'MAE'])
models

Unnamed: 0,Model,R2 score,MAE
2,Random Forest Regressor,0.72,0.47
4,Bagging Regressor,0.686293,0.601012
5,AdaBoost Regressor,0.683043,0.53838
8,CatBoost,0.675742,0.442366
7,XGBoost,0.62416,0.458565
6,GradientBoost Regressor,0.617388,0.444051
1,Decision Tree Regressor,0.55,0.47
3,Stacking Regressor,0.245221,0.969683
0,Linear Regression,-130.3,2.67
