# PCA + Cluster Analysis for Apartment and Room Listings
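This notebook standardizes five numeric listing features, projects them onto two principal components with PCA, and clusters the projected points with KMeans, separately for sampled apartment and room listings.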

## 1. Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

## 2. Load Datasets

In [None]:
# Read the apartment and room listing CSVs into DataFrames.
apartment_csv = '/mnt/data/df_apartment_filtered.csv'
room_csv = '/mnt/data/df_room_filtered.csv'

df_apartment = pd.read_csv(apartment_csv)
df_room = pd.read_csv(room_csv)

# Trailing expression: the notebook renders this tuple as the cell output.
(df_apartment.head(), df_room.head())

## 3. Select Variables for PCA & Clustering

In [None]:
# Columns used as input features for scaling, PCA and clustering.
cols = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'availability_365',
]

# Keep only the selected columns and drop every row that has a missing value
# in any of them.
apt = df_apartment[cols].dropna(axis=0, how='any')
room = df_room[cols].dropna(axis=0, how='any')

# Cap each dataset at 5000 rows; smaller datasets are sampled in full
# (which shuffles them). The fixed seed makes the draw repeatable.
max_sample_rows = 5000
apt_sample = apt.sample(n=min(max_sample_rows, len(apt)), random_state=42)
room_sample = room.sample(n=min(max_sample_rows, len(room)), random_state=42)

# Trailing expression: the notebook renders this tuple as the cell output.
(apt_sample.head(), room_sample.head())

## 4. Standardize Data

In [None]:
# Standardize each sample to zero mean / unit variance per column.
#
# Each dataset gets its own StandardScaler: fitting a single instance twice
# overwrites the first fit, so the apartment means/scales would no longer be
# available afterwards (e.g. for inverse_transform or for scaling new
# apartment rows consistently).
apt_scaler = StandardScaler()
room_scaler = StandardScaler()

apt_scaled = apt_scaler.fit_transform(apt_sample)
room_scaled = room_scaler.fit_transform(room_sample)

# Backward-compatible alias: previously `scaler` ended up fitted on the room
# sample, so any later cell that reads `scaler` sees the same state as before.
scaler = room_scaler

## 5. PCA (2 Components for Visualization)

In [None]:
# Project each standardized sample onto its first two principal components.
#
# A separate PCA model is kept per dataset: refitting one shared instance
# would discard the apartment components_ / explained_variance_ratio_, which
# are needed to interpret the apartment axes.
apt_pca_model = PCA(n_components=2)
room_pca_model = PCA(n_components=2)

apt_pca = apt_pca_model.fit_transform(apt_scaled)
room_pca = room_pca_model.fit_transform(room_scaled)

# Backward-compatible alias: previously `pca` ended up fitted on the room
# data, so any later cell that reads `pca` sees the same state as before.
pca = room_pca_model

# Trailing expression: the notebook renders the first five projected rows of
# each dataset as the cell output.
apt_pca[:5], room_pca[:5]

## 6. KMeans Clustering (k = 3)

In [None]:
# Cluster the 2-D PCA projections of each dataset into k groups.
k = 3

# KMeans initialisation is random; without a seed the cluster ids (and
# possibly the assignments) change on every run even though the sampling
# above is seeded. Use the same seed as the sampling step so the whole
# notebook is reproducible.
apt_kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(apt_pca)
room_kmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(room_pca)

# Per-point cluster ids, aligned row-for-row with apt_pca / room_pca.
apt_labels = apt_kmeans.labels_
room_labels = room_kmeans.labels_

# Trailing expression: the notebook renders the cluster centres (in PC1/PC2
# coordinates) as the cell output.
apt_kmeans.cluster_centers_, room_kmeans.cluster_centers_

## 7. PCA Cluster Scatter Plots

In [None]:
def _show_cluster_scatter(pc_points, cluster_ids, plot_title):
    """Draw one PC1-vs-PC2 scatter plot, coloured by cluster id, and show it."""
    plt.figure(figsize=(6, 5))
    plt.scatter(pc_points[:, 0], pc_points[:, 1], c=cluster_ids)
    plt.title(plot_title)
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.tight_layout()
    plt.show()


# One figure per dataset: apartments first, then rooms.
_show_cluster_scatter(
    apt_pca, apt_labels, "Apartment Listings – PCA Clusters (Sampled)"
)
_show_cluster_scatter(
    room_pca, room_labels, "Room Listings – PCA Clusters (Sampled)"
)