<a href="https://colab.research.google.com/github/wikey/2025dataproject/blob/main/DA_johncooke_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Bar Chart: Listing Counts by Neighbourhood Group ---
# The plot is only drawn when the grouping column exists; otherwise a
# message is printed instead.
if 'neighbourhood_group' not in df.columns:
    print("Column 'neighbourhood_group' not found in DataFrame.")
else:
    # Bar order follows value_counts(), i.e. most frequent group first.
    group_order = df['neighbourhood_group'].value_counts().index

    plt.figure(figsize=(8, 5))
    sns.countplot(data=df, x='neighbourhood_group', order=group_order)
    plt.xlabel("Neighbourhood Group")
    plt.ylabel("Count")
    plt.title("Listing Counts by Neighbourhood Group")
    plt.show()


In [None]:
# --- Correlation Matrix and Heatmap ---
# Pairwise correlations (DataFrame.corr with its default settings) between
# the four numeric columns listed below.
corr_matrix = df[['price','availability_365','days_since_last_review','description_length']].corr()

# Print the label and the frame separately: handing both to a single print()
# call inserts the separator space in front of the frame's header row only,
# which shifts the column headers one character out of line with the rows.
print("Correlation Matrix:")
print(corr_matrix)

plt.figure(figsize=(6,5))
# annot=True writes each coefficient into its cell; center=0 anchors the
# diverging 'coolwarm' palette at zero correlation.
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0)
plt.title("Correlation Heatmap (Numeric Features)")
plt.show()

In [None]:
# --- Histograms: Price (raw) and log(1+price) ---
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Derived column: log(1 + price), stored on df for the right-hand panel
df['log_price'] = np.log1p(df['price'])

# One row, two panels: raw price on the left, log-transformed on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

panels = [
    ('price', "Raw Price Distribution"),
    ('log_price', "Log(1+Price) Distribution"),
]
for ax, (column, title) in zip(axes, panels):
    # Same 50-bin histogram for each column, drawn on its own axes
    sns.histplot(df[column], bins=50, ax=ax)
    ax.set_title(title)

fig.tight_layout()
plt.show()

In [None]:
# --- Numerical Summary of Price and Other Numeric Features ---
import pandas as pd

# Columns included in the descriptive-statistics table
summary_columns = ['price', 'availability_365', 'days_since_last_review', 'description_length']

# describe() builds the summary table; the bare name on the last line makes
# the notebook render it as the cell output.
numeric_summary = df[summary_columns].describe()
numeric_summary

In [None]:
# --- OLS Regression: log(1+price) with numeric + categorical predictors ---
import numpy as np
import statsmodels.formula.api as smf

# Response variable: log(1 + price), stored on df as 'log_price'
df['log_price'] = np.log1p(df['price'])

# Model specification in formula syntax: three numeric regressors plus
# room_type and neighbourhood_group wrapped in C() so they are treated as
# categorical terms.
formula = (
    "log_price ~ availability_365 + days_since_last_review + description_length"
    " + C(room_type) + C(neighbourhood_group)"
)

# Fit by OLS, requesting the HC3 covariance estimator for the standard errors
ols_model = smf.ols(
    formula=formula,
    data=df,
).fit(cov_type='HC3')

# Print the complete regression summary table
print(ols_model.summary())


In [None]:
# --- OLS Regression Predicting Log(1+Price) ---
import statsmodels.formula.api as smf

# Same specification as the preceding OLS cell: numeric predictors plus
# C()-wrapped categorical predictors. 'log_price' must already be a column
# of df, because this cell does not create it.
formula = (
    "log_price ~ "
    "availability_365 + days_since_last_review + description_length + "
    "C(room_type) + C(neighbourhood_group)"
)

# Re-fit with HC3 standard errors; this rebinds `formula` and `ols_model`
ols_model = smf.ols(data=df, formula=formula).fit(cov_type='HC3')

print(ols_model.summary())

In [None]:
# --- PCA on Scaled Numeric Features ---
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Numeric features fed into the PCA
numeric_cols = ['price','availability_365','days_since_last_review','description_length']
# Missing values are replaced with 0 before scaling.
# NOTE(review): a literal 0 may not be a neutral fill for every column
# (e.g. days_since_last_review) — confirm this imputation is intended.
X = df[numeric_cols].fillna(0)

# Standardize each feature with StandardScaler before the decomposition
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Project the scaled features onto the first two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Explained variance table: one row per retained component
explained_variance = pd.DataFrame({
    'PC': [f'PC{i+1}' for i in range(len(pca.explained_variance_ratio_))],
    'Explained Variance Ratio': pca.explained_variance_ratio_
})
# Print the label and the table separately: handing both to a single print()
# call inserts the separator space in front of the table's header row only,
# which shifts the column headers one character out of line with the rows.
print("Explained Variance Ratios:")
print(explained_variance)

# Scatter plot PC1 vs PC2 colored by neighbourhood_group
plt.figure(figsize=(8,6))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=df['neighbourhood_group'], palette='Set2', alpha=0.7)
plt.title("PCA: PC1 vs PC2 (colored by neighbourhood_group)")
plt.xlabel("PC1")
plt.ylabel("PC2")
# Re-issue the legend so it carries a human-readable title
plt.legend(title="Neighbourhood Group")
plt.show()
