In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the CSV this notebook works on (fetched over HTTPS).
url = 'https://raw.githubusercontent.com/digipodium/Datasets/main/regression/kc_house_data.csv'
# Read it into a DataFrame, parsing the `date` column as datetimes.
df = pd.read_csv(filepath_or_buffer=url, parse_dates=['date'])
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print pandas' summary of the frame (columns, non-null counts, dtypes).
df.info()

In [106]:
# Target: the `price` column.
y = df['price']
# Features: every remaining column except `id` and `date`.
X = df.drop(columns=['id', 'date', 'price'])

In [107]:
from sklearn.feature_selection import SelectKBest, f_regression, VarianceThreshold

In [None]:
# (rows, columns) of the feature matrix before any feature selection.
X.shape

In [None]:
# Variance filter with a 0.1 cut-off: learn the per-column variances from X,
# then keep only the columns that pass.
selector_1 = VarianceThreshold(threshold=0.1)
selector_1.fit(X)
X_selected = selector_1.transform(X)
X_selected.shape

In [None]:
# names of the columns kept by the variance filter
X.columns[selector_1.get_support(indices=True)].tolist()

In [None]:
# names of the columns rejected by the variance filter (inverse of the support mask)
X.columns[np.logical_not(selector_1.get_support())].tolist()

In [None]:
# Univariate selection: score every feature against the target with
# f_regression and keep the 10 highest-scoring ones.
selector_2 = SelectKBest(score_func=f_regression, k=10)
selector_2.fit(X, y)
X_selected = selector_2.transform(X)
X_selected.shape

In [None]:
# names of the 10 columns kept by SelectKBest
X.columns[selector_2.get_support(indices=True)].tolist()

In [None]:
# names of the columns SelectKBest left out (inverse of the support mask)
X.columns[np.logical_not(selector_2.get_support())].tolist()

In [None]:
# Two decimals, no scientific notation. This changes NumPy's global print
# settings, so it also affects arrays printed by later cells.
np.set_printoptions(suppress=True, precision=2)
# One f_regression score per column of X, in column order.
selector_2.scores_

In [116]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

In [117]:
# Model 1: standardise -> keep the 10 best features by f_regression -> depth-limited tree.
model_1 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(f_regression, k=10)),
    # random_state pins the tree's internal feature shuffling, so refitting on
    # the same data yields the same model and the same score.
    ('regressor', DecisionTreeRegressor(max_depth=10, random_state=0))
])

In [125]:
# Model 2: variance filter -> standardise -> depth-limited tree.
model_2 = Pipeline(steps=[
    # The variance filter must see the raw features: once StandardScaler has run,
    # every non-constant column has variance 1, so a 0.2 threshold placed behind
    # the scaler could never drop anything.
    ('selector', VarianceThreshold(threshold=0.2)),
    ('scaler', StandardScaler()),
    # random_state pins the tree's internal feature shuffling, so refitting on
    # the same data yields the same model and the same score.
    ('regressor', DecisionTreeRegressor(max_depth=10, random_state=0))
])

In [126]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y,
    random_state=0,
    test_size=0.2,
)
# Fit both candidate pipelines on the same training rows.
model_1.fit(X=Xtrain, y=ytrain)
_ = model_2.fit(X=Xtrain, y=ytrain)

In [None]:
# Report each fitted pipeline's score on the held-out rows.
for label, fitted in (("Model 1 Score", model_1), ("Model 2 Score", model_2)):
    print(label, fitted.score(Xtest, ytest))

In [None]:
# Map each pipeline's fitted selector mask back to the column names of X.
print(
    "F regression columns: ",
    X.columns[model_1.named_steps['selector'].get_support()].to_list(),
)
print(
    "Variance Threshold columns: ",
    X.columns[model_2.named_steps['selector'].get_support()].to_list(),
)

In [None]:
# Skewness of the raw target; it should be close to 0 for a symmetric distribution.
skewness = y.skew()
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(y, kde=True, ax=ax)
# Annotation positions are given in data coordinates of the histogram axes.
ax.text(x=2 * 10**6, y=800, s=f'Skewness: {skewness:.2f}', fontsize=16)
ax.text(x=2 * 10**6, y=700, s='data is very skewed', fontsize=12)
plt.show()  # render the figure instead of echoing the last Text object's repr

In [None]:
# log1p(y) = log(1 + y): a log transform that reduces the skewness of the target.
yt = np.log1p(y)
skewness = yt.skew()

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(yt, kde=True, ax=ax)
# Annotate the plot with the new skewness value (positions in data coordinates).
ax.text(14, 800, f'Skewness: {skewness:.2f}', fontsize=16)
ax.text(14, 700, 'data is less skewed', fontsize=12)
plt.show()

In [None]:
# Training the final model on the log-transformed target `yt`.
model_3 = Pipeline(steps=[
    # The variance filter must see the raw features: once StandardScaler has run,
    # every non-constant column has variance 1, so a 0.2 threshold placed behind
    # the scaler could never drop anything.
    ('selector', VarianceThreshold(threshold=0.2)),
    ('scaler', StandardScaler()),
    # random_state pins the tree's internal feature shuffling, so refitting on
    # the same data yields the same model and the same score.
    ('regressor', DecisionTreeRegressor(max_depth=10, random_state=0))
])
# NOTE: this rebinds Xtrain/Xtest/ytrain/ytest; from here on ytrain/ytest hold
# log1p-transformed values, not raw ones.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, yt, test_size=0.2, random_state=0)
model_3.fit(Xtrain, ytrain)
# The score is computed against the log-scale target.
print("Model 3 Score", model_3.score(Xtest, ytest))
# Predictions are on the log scale too; apply np.expm1 to get back the original units.
ypred = model_3.predict(Xtest)
In [None]:
# Undo the log1p transform so actual and predicted values are back on the
# original scale of the target (expm1 is the inverse of log1p).
results = pd.DataFrame({
    'Actual': np.expm1(ytest),
    'Predicted': np.expm1(ypred),
})

# Overlay both distributions on one explicit Axes.
fig, ax = plt.subplots()
sns.histplot(results['Actual'], kde=True, color='b', label='Actual', alpha=0.5, ax=ax)
sns.histplot(results['Predicted'], kde=True, color='r', label='Predicted', alpha=0.5, ax=ax)
ax.set_xlim(0, 2000000)  # crop the x-axis at 2,000,000
ax.set_xlabel('price')   # the axis is shared by both series, so name the quantity itself
ax.legend()
plt.show()  # render the figure instead of echoing the Legend object's repr