## Histogram

* The histogram is a nonparametric density estimator
    * If the grid is too fine, the bias is small but the variance is large
    * If the grid is too coarse, the variance is small but the bias is large
    

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import beta

# Simulation design: every panel draws a fresh sample of size n from Beta(2, 2)
# and overlays the true density. Each column of the grid uses one bin count
# from `breaks_list`; the three rows are independent replications.
n = 200
x_base = np.arange(0.01, 1, 0.01)
breaks_list = [4, 12, 60]
n_cols = len(breaks_list)

fig, axs = plt.subplots(3, n_cols)
fig.set_size_inches(8, 8)
fig.subplots_adjust(hspace=0.5, wspace=0.5)

# `axs.flat` walks the grid row by row, so `i % n_cols` is the column index.
for i, ax in enumerate(axs.flat):
    bb = breaks_list[i % n_cols]
    x = np.random.beta(2, 2, size=n)  # new sample for every panel
    ax.hist(x, bins=bb, density=True)
    # True Beta(2, 2) density, for comparison with the histogram estimate.
    ax.plot(x_base, beta.pdf(x_base, 2, 2), color="red")
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 3)
    ax.set_title(f"Breaks={bb}")

plt.show()

## Lasso

In [None]:
import numpy as np
from scipy.linalg import pinv
from sklearn.linear_model import LassoCV, Lasso

# Simulated design with more regressors than observations (p > n)
n, p = 40, 50

# Sparse truth: coefficient 1 on the first 10 regressors, 0 on the other p - 10
b0 = np.zeros(p)
b0[:10] = 1.0

# Gaussian regressors, then a linear outcome with standard normal noise
x = np.random.normal(size=(n, p))
y = x @ b0 + np.random.normal(size=n)


In [None]:

# OLS through the pseudo-inverse of X'X. With p > n the Gram matrix is
# singular, so `pinv` stands in for an ordinary inverse.
gram = x.T @ x
ols = pinv(gram) @ (x.T @ y)

# Lasso: 5-fold cross-validation picks the penalty, then a plain Lasso is
# fitted on the full sample with that penalty.
lasso_cv = LassoCV(cv=5)
lasso_cv.fit(x, y)
lasso_result = Lasso(alpha=lasso_cv.alpha_).fit(x, y)

lasso_result.coef_


In [None]:
import matplotlib.pyplot as plt

# Overlay the two coefficient vectors on a single set of axes.
# Requires `ols` and `lasso_result` from the estimation cell.
fig_coef, ax_coef = plt.subplots(figsize=(10, 6))
ax_coef.plot(ols, label='OLS Coefficients', marker='o')
ax_coef.plot(lasso_result.coef_, label='Lasso Coefficients', marker='x')
ax_coef.set_xlabel('Coefficient Index')
ax_coef.set_ylabel('Coefficient Value')
ax_coef.set_title('Comparison of OLS and Lasso Coefficients')
ax_coef.legend()
ax_coef.grid(True)
plt.show()

# Prediction-Oriented Methods


# Random Forest

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import numpy as np

# Load California housing dataset
california = fetch_california_housing()
X, y = california.data, california.target
feature_names = california.feature_names

# One seed drives both the subsample and the forest, so the whole cell is
# reproducible (previously only the forest was seeded).
RF_SEED = 101

# Training sample: 300 observations drawn without replacement
rng = np.random.default_rng(RF_SEED)
train_indices = rng.choice(len(y), size=300, replace=False)

# Fit Random Forest model
# document: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
# "n_estimators" is the number of trees in a forest
rf = RandomForestRegressor(n_estimators=500, random_state=RF_SEED)

rf.fit(X[train_indices], y[train_indices])


In [None]:

# Plot feature importances, largest first
feat_importances = rf.feature_importances_
indices = np.argsort(feat_importances)[::-1]

# Take the bar positions from the importance vector itself rather than from
# `X`, which a later cell rebinds to a different data set.
positions = range(len(feat_importances))

plt.figure()
plt.title("Feature importances")
plt.bar(positions, feat_importances[indices])
# The bars are reordered by importance, so label each one with its feature
# name; the default 0..k-1 ticks would not identify the features.
plt.xticks(positions, [feature_names[i] for i in indices], rotation=45, ha="right")
plt.ylabel("Importance")
plt.show()

## Real Data Example

Gradient boosting

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import make_scorer, mean_squared_error


In [None]:
# Columns used as regressors
predictors = [
    "square", "livingRoom", "drawingRoom", "kitchen", "bathRoom",
    "floor_total", "elevator", "ladderRatio",
    "age", "DOM", "followers", "fiveYearsProperty",
    "subway", "district", "Lng", "Lat", "t_trade",
    "communityAverage",
]

# Response column
target = 'price'

# Read the Lianjia CSV (decoded as GBK)
lianjia = pd.read_csv("data_example/lianjia.csv", encoding='gbk')

# Design matrix and response; note that this rebinds the `X` and `y` used by
# the random forest cells.
X = lianjia[predictors]
y = lianjia[target]

In [None]:
import time
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the gradient boosting regressor.
# 3 * 3 * 5 * 3 = 135 combinations, each evaluated with 5-fold CV (675 fits),
# so this cell is expensive to run.
param_grid = {
    'max_depth': [10, 20, 40],  # equivalent to interaction.depth in R's gbm
    'n_estimators': [1000, 5000, 9000],  # equivalent to n.trees. (number of iterations)
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1],  # equivalent to shrinkage
    'min_samples_leaf': [10, 15, 20],  # equivalent to n.minobsinnode
}

# Create the Gradient Boosting Regressor
gbm = GradientBoostingRegressor(loss='squared_error')

# Perform tuning parameter grid search (scored by negative MSE)
grid_search = GridSearchCV(estimator=gbm,
                           param_grid=param_grid,
                           scoring='neg_mean_squared_error',
                           cv=5)

# Hold out 20% of the data as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2,
                                                    random_state=0)

# Time the search so `duration` is defined when it is reported below
# (it was previously printed without ever being assigned).
start_time = time.perf_counter()
grid_search.fit(X_train, y_train)
duration = time.perf_counter() - start_time

# Print the best parameters and the corresponding score
# (`best_score_` is negative MSE, hence the sign flip)
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", -grid_search.best_score_)
print("Time taken:", duration, "seconds")



In [None]:
from sklearn.metrics import r2_score

# Hyperparameters selected by the grid search
best_params = grid_search.best_params_

# Copy them onto `gbm` and refit on the training split in one chained call
gbm.set_params(**best_params).fit(X_train, y_train)


In [None]:
# Evaluate the tuned GBM on the held-out test set
pred_boosting = gbm.predict(X_test)

# Out-of-sample R-squared of the boosting predictions
r_squared_gbm = r2_score(y_true=y_test, y_pred=pred_boosting)

print("R-squared of GBM prediction =", r_squared_gbm)