# Modelos predictivos de ML.

### Dividimos los datos en conjuntos de entrenamiento y validación.

In [30]:
# Separate the target column from the predictors (every other column).
y = train_data_v2['SalePrice']
X = train_data_v2.drop(columns='SalePrice')

# Hold out 30% of the rows for evaluation and train on the remaining 70%.
# random_state is fixed so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Linear Regression

In [31]:
# Ordinary least-squares baseline; fit() hands back the fitted estimator,
# so construction and training are chained.
model_lr = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = model_lr.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# NOTE(review): the output recorded for this cell shows an MSE around 1e25 and
# an R-squared around -8e25, which suggests a numerically degenerate fit rather
# than a usable baseline -- worth investigating the feature matrix.
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 1.3421987884171429e+25
R-squared: -7.911754376918291e+25


### Polynomial Regression

In [32]:
# Order of the polynomial expansion (tunable).
degree = 2
poly = PolynomialFeatures(degree=degree)

# The expansion is fitted on the training rows only and then reused,
# unchanged, for the test rows.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Plain linear regression on top of the expanded features.
model = LinearRegression().fit(X_train_poly, y_train)

# Predict the expanded held-out rows and score them.
y_pred = model.predict(X_test_poly)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# NOTE(review): the output recorded for this cell shows a hugely negative
# R-squared (about -1.2e13), so this model is not usable as-is.
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 2113125617703.8271
R-squared: -12456076550747.95


### Random forest regression model

In [33]:
# Random forest with 500 trees and a fixed seed; fit() returns the fitted
# estimator, so construction and training are chained.
rf_model = RandomForestRegressor(
    n_estimators=500,
    random_state=42,
).fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = rf_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.020707223358693207
R-squared: 0.877938747631287


### XGBOOST model

In [34]:
# Gradient-boosted regressor: squared-error objective, 500 boosting rounds,
# learning rate 0.1, fixed seed.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.1,
    random_state=42,
)
xgb_model.fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred = xgb_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

Mean Squared Error: 0.019698097925591535
R-squared: 0.8838871605125244


# Resultados

In [35]:
# Remove the target column from the test features before predicting.
# errors='ignore' keeps this cell safe to re-run: the frame is reassigned to
# itself, so without it a second execution raises KeyError because the column
# is already gone.
test_data_v2 = test_data_v2.drop(columns='SalePrice', errors='ignore')

In [36]:
# Score the test features with the fitted XGBoost model.
predict = xgb_model.predict(test_data_v2)

# The validation scores recorded above (MSE of roughly 0.02 with R-squared of
# roughly 0.88) imply the models were trained on a log-scale SalePrice, so the
# predictions are already logarithms. They must be mapped back with the inverse
# transform; the previous np.log(predict) applied the logarithm a second time.
# NOTE(review): assumes the target was transformed with np.log upstream --
# switch to np.expm1 if np.log1p was used. TODO confirm.
# The name `log_y_pred` is kept because the submission cell below reads it;
# it now holds predictions on the original price scale.
log_y_pred = np.exp(predict)

In [39]:
# Two-column submission frame: the Id column taken from test_data, and the
# values held in log_y_pred stored under the SalePrice header.
submission_df = pd.DataFrame(dict(Id=test_data['Id'], SalePrice=log_y_pred))

In [40]:
# Write the submission to disk; index=False leaves the DataFrame index out of the file.
submission_df.to_csv('submission.csv', index=False)