In [56]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor


df= pd.read_csv('Datos/Limpios/df_valoracion.csv')
lista_variables= ['growth_stage', 'startup', 'b2b_b2c', 'Nombre_sabi']
df = df.drop(lista_variables, axis=1)
X_train, X_test, y_train, y_test = train_test_split(df.drop('valuation_2022', axis=1), 
                                                    df['valuation_2022'], 
                                                    test_size=0.2, 
                                                    random_state=0)

In [28]:
y_train_log = np.log(y_train)


models = {'Linear Regression': LinearRegression(), 
          'Decision Tree': DecisionTreeRegressor(random_state=42), 
          'Random Forest': RandomForestRegressor(random_state=42)}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='r2')
    print(f"{name} R2 score: {scores.mean():.2f} (+/- {scores.std():.2f})")
    print(f"{name} MAE score: {-cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error').mean():.2f} (+/- {-cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error').std():.2f})")

Linear Regression R2 score: -206.78 (+/- 413.59)
Linear Regression MAE score: 2.84 (+/- -3.71)
Decision Tree R2 score: 0.66 (+/- 0.22)
Decision Tree MAE score: 0.41 (+/- -0.12)
Random Forest R2 score: 0.78 (+/- 0.12)
Random Forest MAE score: 0.41 (+/- -0.09)


In [12]:
from sklearn.metrics import r2_score

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train_log)
y_train_pred_log = model.predict(X_train)
y_train_pred = np.exp(y_train_pred_log)
r2 = r2_score(y_train, y_train_pred)
print(f"Random Forest R2 score (train): {r2:.2f}")
mae = mean_absolute_error(y_train, y_train_pred)
print(f"Random Forest MAE (train): {mae:.2f}")

Random Forest R2 score (train): 0.67


In [16]:
# lo mismo para el test
# deshaciendo el logaritmo de las predicciones
y_test_pred_log = model.predict(X_test)
y_test_pred = np.exp(y_test_pred_log)
r2 = r2_score(y_test, y_test_pred)
print(f"Random Forest R2 score (test): {r2:.2f}")
mae = mean_absolute_error(y_test, y_test_pred)
print(f"Random Forest MAE (test): {mae:.2f}")

Random Forest R2 score (test): 0.13
Random Forest MAE (test): 11.26


In [17]:
# pasando a logaritmo los datos de test
y_test_log = np.log(y_test)

# Hacer la predicción en y_test_log
y_test_pred_log = model.predict(X_test)

# Transformar los valores predichos de y_test_log a valores normales
y_test_pred = np.exp(y_test_pred_log)

# Evaluar el modelo en el conjunto de prueba
r2 = r2_score(y_test_log, y_test_pred_log)
print(f"Random Forest R2 score (test): {r2:.2f}")

Random Forest R2 score (test): 0.42


In [51]:
# stacking 
level0 = [
    ('knn', KNeighborsRegressor()),
    ('cart', DecisionTreeRegressor()),
    ('svm', SVR()),
    ('lr', LinearRegression())
]

# Define the level 1 model with a transformed target regressor
level1 = TweedieRegressor(max_iter=10000, alpha=0.5, link='log', power=0)

# Define the stacked model
stacked_model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

# Fit the model using cross-validation
scores = -cross_val_score(stacked_model, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(stacked_model, X_train, y_train_log, cv=5, scoring='r2')
avg_mae2 = scores.mean()
avg_r22 = scores2.mean()

# Fit the model on all the training data
stacked_model.fit(X_train, y_train_log)

# Make predictions on the test data and transform the predictions to the original scale
y_pred = np.exp(stacked_model.predict(X_test))

# Evaluate the model on the test data
mae2 = mean_absolute_error(y_test, y_pred)
r22 = r2_score(y_test, y_pred)

# Print the results
print(f"Cross-validation MAE: {avg_mae2:.2f}")
print(f"Cross-validation R^2: {avg_r22:.2f}")
print(f"MAE: {mae2:.2f}")
print(f"R^2: {r22:.2f}")

Cross-validation MAE: 0.95
Cross-validation R^2: 0.37
MAE: 11.03
R^2: 0.30


In [52]:
#Bagging Regressor
model = BaggingRegressor(estimator=RandomForestRegressor(), n_estimators=100, random_state=0)
scores = cross_val_score(model, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(model, X_train, y_train_log, cv=5, scoring='r2')
avg_mae7 = -scores.mean()
avg_r27 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae7 = mean_absolute_error(y_test, y_pred)
r27 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae7}")
print(f'R2: {avg_r27}')
print(f"MAE: {mae7}")
print(f"R^2: {r27}")

Cross-validation MAE: 0.5262145576822407
R2: 0.7599954392353386
MAE: 10.834088422310233
R^2: 0.10826508861402262


In [53]:
model = AdaBoostRegressor(random_state=0)
scores = cross_val_score(model, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(model, X_train, y_train_log, cv=5, scoring='r2')
avg_mae9 = -scores.mean()
avg_r29 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae9 = mean_absolute_error(y_test, y_pred)
r29 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae9}")
print(f'R2: {avg_r29}')
print(f"MAE: {mae9}")
print(f"R^2: {r29}")

Cross-validation MAE: 0.4728705496872404
R2: 0.7421827682914215
MAE: 11.255723444870759
R^2: 0.1377643413056413


In [54]:
# gradient boosting
model = GradientBoostingRegressor(random_state=0)
scores = cross_val_score(model, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2= cross_val_score(model, X_train, y_train_log, cv=5, scoring='r2')
avg_mae10 = -scores.mean()
avg_r210 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae10 = mean_absolute_error(y_test, y_pred)
r210 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae10}")
print(f'R2: {avg_r210}')
print(f"MAE: {mae10}")
print(f"R^2: {r210}")

Cross-validation MAE: 0.32703884612077216
R2: 0.7785748022140995
MAE: 10.739996707133814
R^2: 0.09495814312307271


In [57]:
# xgboost
model = XGBRegressor(random_state=0)
scores = cross_val_score(model, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(model, X_train, y_train_log, cv=5, scoring='r2')
avg_mae11 = -scores.mean()
avg_r211 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae11 = mean_absolute_error(y_test, y_pred)
r211 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae11}")
print(f'R2: {avg_r211}')
print(f"MAE: {mae11}")
print(f"R^2: {r211}")

Cross-validation MAE: 0.31750211769910175
R2: 0.7806804171492269
MAE: 11.067302209002277
R^2: 0.1611544858176146


In [58]:
# catboost
model = CatBoostRegressor(random_state=0)
scores = cross_val_score(model, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(model, X_train, y_train_log, cv=5, scoring='r2')
avg_mae12 = -scores.mean()
avg_r212 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae12 = mean_absolute_error(y_test, y_pred)
r212 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae12}")
print(f'R2: {avg_r212}')
print(f"MAE: {mae12}")
print(f"R^2: {r212}")


Learning rate set to 0.027076
0:	learn: 1.6269403	total: 161ms	remaining: 2m 40s
1:	learn: 1.6037787	total: 161ms	remaining: 1m 20s
2:	learn: 1.5795280	total: 162ms	remaining: 53.8s
3:	learn: 1.5568459	total: 162ms	remaining: 40.4s
4:	learn: 1.5332614	total: 163ms	remaining: 32.4s
5:	learn: 1.5129629	total: 164ms	remaining: 27.1s
6:	learn: 1.4917007	total: 164ms	remaining: 23.3s
7:	learn: 1.4690150	total: 165ms	remaining: 20.4s
8:	learn: 1.4497593	total: 165ms	remaining: 18.2s
9:	learn: 1.4330707	total: 166ms	remaining: 16.4s
10:	learn: 1.4149075	total: 167ms	remaining: 15s
11:	learn: 1.3922292	total: 167ms	remaining: 13.8s
12:	learn: 1.3700429	total: 168ms	remaining: 12.8s
13:	learn: 1.3514819	total: 169ms	remaining: 11.9s
14:	learn: 1.3305616	total: 169ms	remaining: 11.1s
15:	learn: 1.3155433	total: 170ms	remaining: 10.4s
16:	learn: 1.2979775	total: 170ms	remaining: 9.85s
17:	learn: 1.2789410	total: 171ms	remaining: 9.33s
18:	learn: 1.2619085	total: 172ms	remaining: 8.86s
19:	learn: 