In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import TweedieRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import r2_score
import pickle

df= pd.read_csv('Datos/Limpios/df_valoracion.csv')


In [7]:
#elimino la columna "Precio/Venta"
df.drop(columns=['Precio/Venta'], inplace=True) 

In [8]:
lista_variables= ['growth_stage', 'startup', 'b2b_b2c', 'Nombre_sabi']
df = df.drop(lista_variables, axis=1)
X_train, X_test, y_train, y_test = train_test_split(df.drop('valuation_2022', axis=1), 
                                                    df['valuation_2022'], 
                                                    test_size=0.2, 
                                                    random_state=0)

In [9]:
y_train_log = np.log(y_train)


models = {'Linear Regression': LinearRegression(), 
          'Decision Tree': DecisionTreeRegressor(random_state=42), 
          'Random Forest': RandomForestRegressor(random_state=42)}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='r2')
    print(f"{name} R2 score: {scores.mean():.2f} (+/- {scores.std():.2f})")
    print(f"{name} MAE score: {-cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error').mean():.2f} (+/- {-cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error').std():.2f})")


results = []

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='r2')
    mae_scores = -cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error')
    result = {'model': name,
              'R2 score': scores.mean().round(2),
              'MAE score': -cross_val_score(model, X_train, y_train_log.ravel(), cv=5, scoring='neg_mean_absolute_error').mean().round(2)
              }
    results.append(result)    

Linear Regression R2 score: -230.10 (+/- 459.37)
Linear Regression MAE score: 3.12 (+/- -3.93)
Decision Tree R2 score: 0.57 (+/- 0.27)
Decision Tree MAE score: 0.45 (+/- -0.14)
Random Forest R2 score: 0.74 (+/- 0.11)
Random Forest MAE score: 0.46 (+/- -0.08)


In [10]:
results

[{'model': 'Linear Regression', 'R2 score': -230.1, 'MAE score': 3.12},
 {'model': 'Decision Tree', 'R2 score': 0.57, 'MAE score': 0.45},
 {'model': 'Random Forest', 'R2 score': 0.74, 'MAE score': 0.46}]

In [11]:
linear_regression_mae = results[0]['MAE score']
linear_regression_r2 = results[0]['R2 score']
decision_tree_mae = results[1]['MAE score']
decision_tree_r2 = results[1]['R2 score']
random_forest_mae = results[2]['MAE score']
random_forest_r2 = results[2]['R2 score']

In [12]:
# stacking 
level0 = [
    ('knn', KNeighborsRegressor()),
    ('cart', DecisionTreeRegressor()),
    ('svm', SVR()),
    ('lr', LinearRegression())
]

# Define the level 1 model with a transformed target regressor
level1 = TweedieRegressor(max_iter=10000, alpha=0.5, link='log', power=0)

# Define the stacked model
stacked_model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)

# Fit the model using cross-validation
scores = -cross_val_score(stacked_model, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(stacked_model, X_train, y_train_log, cv=5, scoring='r2')
avg_mae2 = scores.mean()
avg_r22 = scores2.mean()

# Fit the model on all the training data
stacked_model.fit(X_train, y_train_log)

# Make predictions on the test data and transform the predictions to the original scale
y_pred = np.exp(stacked_model.predict(X_test))

# Evaluate the model on the test data
mae2 = mean_absolute_error(y_test, y_pred)
r22 = r2_score(y_test, y_pred)

# Print the results
print(f"Cross-validation MAE: {avg_mae2:.2f}")
print(f"Cross-validation R^2: {avg_r22:.2f}")
print(f"MAE: {mae2:.2f}")
print(f"R^2: {r22:.2f}")

Cross-validation MAE: 0.94
Cross-validation R^2: 0.26
MAE: 11.67
R^2: 0.26


In [13]:
#Bagging Regressor
br = BaggingRegressor(estimator=RandomForestRegressor(), n_estimators=100, random_state=0)
scores = cross_val_score(br, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(br, X_train, y_train_log, cv=5, scoring='r2')
avg_mae7 = -scores.mean()
avg_r27 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae7 = mean_absolute_error(y_test, y_pred)
r27 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae7}")
print(f'R2: {avg_r27}')
print(f"MAE: {mae7}")
print(f"R^2: {r27}")

Cross-validation MAE: 0.5768082917134805
R2: 0.7184319513144224
MAE: 12.011603904128803
R^2: 0.14760518500862274


In [14]:
# adaboost
ab = AdaBoostRegressor(random_state=0)
scores = cross_val_score(ab, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(ab, X_train, y_train_log, cv=5, scoring='r2')
avg_mae9 = -scores.mean()
avg_r29 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae9 = mean_absolute_error(y_test, y_pred)
r29 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae9}")
print(f'R2: {avg_r29}')
print(f"MAE: {mae9}")
print(f"R^2: {r29}")

Cross-validation MAE: 0.534533314916719
R2: 0.7011119597896229
MAE: 12.011603904128803
R^2: 0.14760518500862274


In [15]:
# gradient boosting
gb = GradientBoostingRegressor(random_state=0)
scores = cross_val_score(gb, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2= cross_val_score(gb, X_train, y_train_log, cv=5, scoring='r2')
avg_mae10 = -scores.mean()
avg_r210 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae10 = mean_absolute_error(y_test, y_pred)
r210 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae10}")
print(f'R2: {avg_r210}')
print(f"MAE: {mae10}")
print(f"R^2: {r210}")

Cross-validation MAE: 0.38673868010568174
R2: 0.7073506215313576
MAE: 12.011603904128803
R^2: 0.14760518500862274


In [16]:
# xgboost
xgb = XGBRegressor(random_state=0)
scores = cross_val_score(xgb, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(xgb, X_train, y_train_log, cv=5, scoring='r2')
avg_mae11 = -scores.mean()
avg_r211 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae11 = mean_absolute_error(y_test, y_pred)
r211 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae11}")
print(f'R2: {avg_r211}')
print(f"MAE: {mae11}")
print(f"R^2: {r211}")

Cross-validation MAE: 0.453876275637455
R2: 0.636075954984601
MAE: 12.011603904128803
R^2: 0.14760518500862274


In [17]:
# catboost
cb = CatBoostRegressor(random_state=0)
scores = cross_val_score(cb, X_train, y_train_log, cv=5, scoring='neg_mean_absolute_error')
scores2 = cross_val_score(cb, X_train, y_train_log, cv=5, scoring='r2')
avg_mae12 = -scores.mean()
avg_r212 = scores2.mean()

# ajustar el modelo a todos los datos de entrenamiento
model.fit(X_train, y_train_log)
# hacer predicciones en los datos de prueba y desescalar las predicciones
y_pred = np.exp(model.predict(X_test))

# evaluar el modelo en los datos de prueba
mae12 = mean_absolute_error(y_test, y_pred)
r212 = r2_score(y_test, y_pred)

# imprimir los resultados
print(f"Cross-validation MAE: {avg_mae12}")
print(f'R2: {avg_r212}')
print(f"MAE: {mae12}")
print(f"R^2: {r212}")

Learning rate set to 0.027076
0:	learn: 1.6257059	total: 198ms	remaining: 3m 17s
1:	learn: 1.6022663	total: 200ms	remaining: 1m 39s
2:	learn: 1.5786191	total: 201ms	remaining: 1m 6s
3:	learn: 1.5539615	total: 202ms	remaining: 50.4s
4:	learn: 1.5332605	total: 204ms	remaining: 40.6s
5:	learn: 1.5100673	total: 206ms	remaining: 34.1s
6:	learn: 1.4863208	total: 208ms	remaining: 29.4s
7:	learn: 1.4647026	total: 209ms	remaining: 25.9s
8:	learn: 1.4452631	total: 210ms	remaining: 23.1s
9:	learn: 1.4265305	total: 211ms	remaining: 20.9s
10:	learn: 1.4067376	total: 212ms	remaining: 19.1s
11:	learn: 1.3871794	total: 213ms	remaining: 17.5s
12:	learn: 1.3658186	total: 214ms	remaining: 16.2s
13:	learn: 1.3446838	total: 215ms	remaining: 15.1s
14:	learn: 1.3244923	total: 216ms	remaining: 14.2s
15:	learn: 1.3078790	total: 217ms	remaining: 13.3s
16:	learn: 1.2915581	total: 217ms	remaining: 12.6s
17:	learn: 1.2760632	total: 218ms	remaining: 11.9s
18:	learn: 1.2588776	total: 220ms	remaining: 11.3s
19:	learn

In [18]:
import pickle

In [19]:
# se guardan los modelos que mejor resultado han dado
# se crea carpeta de modelos si no existe
if not os.path.exists('modelos'):
    os.makedirs('modelos')

# dentro de esa carpeta se crea otra carpeta para los modelos de regresion
if not os.path.exists('modelos/regresion'):
    os.makedirs('modelos/regresion')

# se guarda el modelo de bagging  en la carpeta de modelos
pickle.dump(br, open('modelos/regresion/br_model.pkl', 'wb'))

# se guarda el modelo de adaboost  en la carpeta de modelos
pickle.dump(ab, open('modelos/regresion/ad_model.pkl', 'wb'))

# se guarda el modelo de gradientboost  en la carpeta de modelos
pickle.dump(gb, open('modelos/regresion/gb_model.pkl', 'wb'))

# se guarda el modelo de xgboost  en la carpeta de modelos
pickle.dump(xgb, open('modelos/regresion/gb_model.pkl', 'wb'))

# se guarda el modelo de catboost  en la carpeta de modelos
pickle.dump(cb, open('modelos/regresion/cb_model.pkl', 'wb'))

**TABLA DE RESULTADOS**

In [20]:
models = pd.DataFrame([
    {'Model': 'Linear Regression', 'MAE': linear_regression_mae, 'R2 score': linear_regression_r2},
    {'Model': 'Decision Tree Regressor', 'MAE': decision_tree_mae, 'R2 score': decision_tree_r2},
    {'Model': 'Random Forest Regressor', 'MAE': random_forest_mae, 'R2 score': random_forest_r2},
    {'Model': 'Stacking Regressor', 'MAE': avg_mae2, 'R2 score': avg_r22},
    {'Model': 'Bagging Regressor', 'MAE': avg_mae7, 'R2 score': avg_r27},
    {'Model': 'AdaBoost Regressor', 'MAE': avg_mae9, 'R2 score': avg_r29},
    {'Model': 'GradientBoost Regressor', 'MAE': avg_mae10, 'R2 score': avg_r210},
    {'Model': 'XGBoost', 'MAE': avg_mae11, 'R2 score': avg_r211},
    {'Model': 'CatBoost', 'MAE': avg_mae12, 'R2 score': avg_r212}
    # {'Model': 'AutoML', 'MAE': avg_mae17, 'Score': avg_r217}
])

models = models.sort_values(by='R2 score', ascending=False)
models = models.reindex(columns=['Model', 'R2 score', 'MAE'])
models

Unnamed: 0,Model,R2 score,MAE
2,Random Forest Regressor,0.74,0.46
8,CatBoost,0.725286,0.416419
4,Bagging Regressor,0.718432,0.576808
6,GradientBoost Regressor,0.707351,0.386739
5,AdaBoost Regressor,0.701112,0.534533
7,XGBoost,0.636076,0.453876
1,Decision Tree Regressor,0.57,0.45
3,Stacking Regressor,0.264442,0.936655
0,Linear Regression,-230.1,3.12
