In [21]:
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

In [22]:
# Read the house data CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer='../../datasets/kc_house_data.csv')
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [32]:
# Target: the `price` column
y = df['price']
# Features: everything except the row identifier, the raw date string and the target itself
X = df.drop(columns=['id', 'date', 'price'])

In [37]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.2)

In [None]:
# Columns that get standardised
scale_cols = [
    'sqft_living',
    'sqft_lot',
    'sqft_above',
    'sqft_basement',
    'sqft_living15',
    'sqft_lot15',
]
# Columns that get one-hot encoded
ohe_cols = [
    'waterfront',
    'view',
    'condition',
    'grade',
]

# Combine both steps into one transformer. Columns not listed above are not
# passed through (ColumnTransformer drops the remainder by default).
# handle_unknown='ignore' lets transform() accept category levels that were
# not present when the encoder was fitted.
preprocessor = ColumnTransformer([
    ('scale', StandardScaler(), scale_cols),
    ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ohe_cols),
])

In [39]:
# Fit the preprocessing on the training split only, then apply the fitted
# transformer to the test split so both are scaled/encoded the same way
X_train_prep = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# Train a linear regression on the preprocessed training data
reg_model = LinearRegression().fit(X_train_prep, y_train)

# Predict prices for the held-out test rows
y_pred = reg_model.predict(X_test_prep)

# Evaluate on the test split; score() reports the R-squared value
r_squared = reg_model.score(X_test_prep, y_test)
print('R-squared value:', r_squared)

R-squared value: 0.6417024847847104


In [40]:
# Train and evaluate an XGBoost regressor on a different feature set.
# Re-split first: this cell rebinds `X_train` to the transformed matrix below,
# so starting from the raw frame `X` keeps the cell safe to re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# preprocess the data
numerical_features = ['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built', 'lat', 'long']
categorical_features = ['waterfront', 'view', 'condition', 'grade']

# handle_unknown='ignore' (as in the linear-regression preprocessor) stops
# transform() from raising when the test split contains a category level that
# never appeared in the training split; such rows are encoded as all zeros.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
])

# NOTE: `X_train` now holds the transformed matrix, while `X_test` stays a
# DataFrame because the plotting cells index it by column name.
X_train = preprocessor.fit_transform(X_train)
X_test_prep = preprocessor.transform(X_test)

# train an XGBoost regressor
model = XGBRegressor()
model.fit(X_train, y_train)

# make predictions on test set
y_pred = model.predict(X_test_prep)

# Calculate the R-squared value
r_squared = r2_score(y_test, y_pred)
print('R-squared value:', r_squared)

R-squared value: 0.892575977833152


In [None]:
# Scatter plot of test-set prices: true prices in blue (these lie on the
# y = x diagonal because they are plotted against themselves) and predicted
# prices in orange. `y_pred` is whatever the last-run model cell produced.
fig = go.Figure()
fig.add_trace(go.Scatter(x=y_test, y=y_test, mode='markers', name='True',
                         opacity=0.5, marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', name='Predicted',
                         opacity=0.5, marker=dict(color='orange')))

# add trend lines: a degree-1 fit is a straight line, so evaluating it at the
# two ends of the x-range is enough to draw it
trend_x = np.array([y_test.min(), y_test.max()])
fig.add_trace(go.Scatter(x=trend_x, y=np.poly1d(np.polyfit(y_test, y_test, 1))(trend_x),
                         mode='lines', name='Trend (True)', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=trend_x, y=np.poly1d(np.polyfit(y_test, y_pred, 1))(trend_x),
                         mode='lines', name='Trend (Predicted)', line=dict(color='orange')))

# set axis labels and title
fig.update_layout(xaxis_title='True Price', yaxis_title='Predicted Price', title='True vs Predicted House Prices')

# show the plot
fig.show()

In [None]:
# calculate the correlation matrix over the numeric columns only: `date` is
# loaded from the CSV as text, and pandas >= 2.0 raises if corr() is asked to
# include a non-numeric column
corr = df.corr(numeric_only=True)

# create the heatmap figure using Plotly, annotating each cell with the
# correlation rounded to two decimals
fig = ff.create_annotated_heatmap(
    z=corr.values.round(2),
    x=list(corr.columns),
    y=list(corr.index),
    colorscale='RdBu',
    showscale=True,
    reversescale=True
)

# update the layout
fig.update_layout(
    title='Correlation Matrix',
    xaxis=dict(title='Features'),
    yaxis=dict(title='Features')
)

# show the figure
fig.show()

In [None]:
# Scatter plot of test-set prices against sqft_living: true prices in blue,
# predicted prices in orange. `y_pred` is whatever the last-run model cell produced.
x_vals = X_test['sqft_living']
fig = go.Figure()
fig.add_trace(go.Scatter(x=x_vals, y=y_test, mode='markers', name='True',
                         opacity=0.5, marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=x_vals, y=y_pred, mode='markers', name='Predicted',
                         opacity=0.5, marker=dict(color='orange')))

# add trend lines: a degree-1 fit is a straight line, so evaluating it at the
# two ends of the x-range is enough to draw it
trend_x = np.array([x_vals.min(), x_vals.max()])
fig.add_trace(go.Scatter(x=trend_x, y=np.poly1d(np.polyfit(x_vals, y_test, 1))(trend_x),
                         mode='lines', name='Trend (True)', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=trend_x, y=np.poly1d(np.polyfit(x_vals, y_pred, 1))(trend_x),
                         mode='lines', name='Trend (Predicted)', line=dict(color='orange')))

# set axis labels and title
fig.update_layout(xaxis_title='sqft Living', yaxis_title='Price', title='True vs Predicted House Prices')

# show the plot
fig.show()

In [None]:
# Scatter plot of test-set prices against sqft_above: true prices in blue,
# predicted prices in orange. `y_pred` is whatever the last-run model cell produced.
x_vals = X_test['sqft_above']
fig = go.Figure()
fig.add_trace(go.Scatter(x=x_vals, y=y_test, mode='markers', name='True',
                         opacity=0.5, marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=x_vals, y=y_pred, mode='markers', name='Predicted',
                         opacity=0.5, marker=dict(color='orange')))

# add trend lines: a degree-1 fit is a straight line, so evaluating it at the
# two ends of the x-range is enough to draw it
trend_x = np.array([x_vals.min(), x_vals.max()])
fig.add_trace(go.Scatter(x=trend_x, y=np.poly1d(np.polyfit(x_vals, y_test, 1))(trend_x),
                         mode='lines', name='Trend (True)', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=trend_x, y=np.poly1d(np.polyfit(x_vals, y_pred, 1))(trend_x),
                         mode='lines', name='Trend (Predicted)', line=dict(color='orange')))

# set axis labels and title
fig.update_layout(xaxis_title='sqft Above', yaxis_title='Price', title='True vs Predicted House Prices')

# show the plot
fig.show()

In [None]:
# Scatter plot of test-set prices against bathrooms: true prices in blue,
# predicted prices in orange. `y_pred` is whatever the last-run model cell produced.
x_vals = X_test['bathrooms']
fig = go.Figure()
fig.add_trace(go.Scatter(x=x_vals, y=y_test, mode='markers', name='True',
                         opacity=0.5, marker=dict(color='blue')))
fig.add_trace(go.Scatter(x=x_vals, y=y_pred, mode='markers', name='Predicted',
                         opacity=0.5, marker=dict(color='orange')))

# add trend lines: a degree-1 fit is a straight line, so evaluating it at the
# two ends of the x-range is enough to draw it
trend_x = np.array([x_vals.min(), x_vals.max()])
fig.add_trace(go.Scatter(x=trend_x, y=np.poly1d(np.polyfit(x_vals, y_test, 1))(trend_x),
                         mode='lines', name='Trend (True)', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=trend_x, y=np.poly1d(np.polyfit(x_vals, y_pred, 1))(trend_x),
                         mode='lines', name='Trend (Predicted)', line=dict(color='orange')))

# set axis labels and title
fig.update_layout(xaxis_title='Bathrooms', yaxis_title='Price', title='True vs Predicted House Prices')

# show the plot
fig.show()