In [None]:
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Raw listings used by every cell below.
df = pd.read_csv("../data/guland_hanoi_listings_arcgis2.csv")

# Plotting cells build their output paths as f"{figdir}<name>.png", so the
# trailing slash is significant. savefig does not create missing parent
# directories, so make sure the folder exists before any figure is written.
figdir = "../data/figures/"
Path(figdir).mkdir(parents=True, exist_ok=True)

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Numeric columns plotted by the distribution cell below.
num_cols_name = [
    'Price',
    'Area',
    'Latitude',
    'Longitude',
    'Price per m2',
]

In [None]:
# One histogram (with KDE overlay) per numeric column, laid out on a 2x3 grid.
fig = plt.figure(figsize=(15, 10))

for slot, column in enumerate(num_cols_name, start=1):
    panel = fig.add_subplot(2, 3, slot)
    sns.histplot(x=df[column], kde=True, ax=panel)
    # The first line on the axes is the KDE curve; recolour it so it stands
    # out against the bars.
    kde_curve = panel.get_lines()[0]
    kde_curve.set_color('red')
    panel.set_title(f'Distribution of {column}')

fig.savefig(f"{figdir}num_cols_distribution.png", dpi=300)

In [None]:
# Pairwise correlation between all numeric columns of the dataset.
correlation_matrix = df.select_dtypes(include=[np.number]).corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix of Hanoi Real Estate Dataset')

# Save BEFORE showing: once plt.show() returns the current figure has been
# released, so a later plt.savefig() would write an empty canvas.
fig.savefig(f"{figdir}correlation_matrix.png", dpi=300)
plt.show()

In [None]:
# Columns to summarise as box plots (note: this rebinds num_cols_name with
# Longitude listed before Latitude).
num_cols_name = ['Price', 'Area', 'Longitude', 'Latitude', 'Price per m2']

fig = plt.figure(figsize=(15, 10))
for slot, column in enumerate(num_cols_name, start=1):
    panel = fig.add_subplot(2, 3, slot)
    sns.boxplot(x=df[column], ax=panel)
    panel.set_title(f'Distribution of {column}')

fig.savefig(f"{figdir}num_cols_boxplot.png", dpi=300)

In [None]:
# Summary statistics of the listing coordinates.
# Naming convention used below: *_x refers to Latitude, *_y to Longitude.
latitude = df['Latitude']
longitude = df['Longitude']

min_x = latitude.min()
max_x = latitude.max()
min_y = longitude.min()
max_y = longitude.max()

median_x = latitude.median()
median_y = longitude.median()
mean_x = latitude.mean()
mean_y = longitude.mean()

# Number of bins along each coordinate axis.
resolution = 100

# x_range = ((df['Latitude'] - min_x) / (max_x - min_x) * (resolution - 1)).astype(int)
# y_range = ((df['Longitude'] - min_y) / (max_y - min_y) * (resolution - 1)).astype(int)

# heatmap_count = np.zeros((resolution, resolution))
# for x, y in zip(x_range, y_range):
#     heatmap_count[x, y] += 1

def classify_bin(max, min, resolution, value):
    """Map ``value`` from the range ``[min, max]`` onto an integer bin index.

    The range is scaled linearly so that ``min`` falls in bin 0 and ``max``
    falls in bin ``resolution - 1``; the fractional part is truncated.

    Args:
        max: Upper end of the value range. (The name shadows the builtin but
            is kept so existing callers are unaffected.)
        min: Lower end of the value range (same remark as ``max``).
        resolution: Total number of bins.
        value: The value to classify.

    Returns:
        The bin index as a Python ``int``. When ``max == min`` the range has
        no width, so every value is assigned to bin 0 instead of dividing by
        zero.
    """
    span = max - min
    if span == 0:
        # Degenerate range: all observations are identical, only one bin makes sense.
        return 0
    return int((value - min) / span * (resolution - 1))

# Wrap the scalar helper once so it can be applied element-wise to a column,
# then store each listing's latitude / longitude bin index on the frame.
bin_elementwise = np.vectorize(classify_bin)

df['bin_x'] = bin_elementwise(max_x, min_x, resolution, df['Latitude'])
df['bin_y'] = bin_elementwise(max_y, min_y, resolution, df['Longitude'])


In [None]:
## Test vectorize and apply
# Compares the average time needed to rebuild the bin columns with
# Series.apply versus np.vectorize. `time` is imported in the notebook's
# import cell.

test_count = 1000


def _bin_with_apply():
    """Recompute both bin columns with Series.apply and a per-row lambda."""
    df['bin_x'] = df['Latitude'].apply(lambda v: classify_bin(max_x, min_x, resolution, v))
    df['bin_y'] = df['Longitude'].apply(lambda v: classify_bin(max_y, min_y, resolution, v))


def _bin_with_vectorize():
    """Recompute both bin columns with np.vectorize."""
    df['bin_x'] = np.vectorize(classify_bin)(max_x, min_x, resolution, df['Latitude'])
    df['bin_y'] = np.vectorize(classify_bin)(max_y, min_y, resolution, df['Longitude'])


def _total_runtime(task, repeats):
    """Return the summed duration, in seconds, of ``repeats`` calls to ``task``.

    Uses time.perf_counter(), the clock intended for measuring short
    intervals; time.time() follows the system wall clock, which may be
    adjusted mid-run and can have coarse resolution.
    """
    elapsed = 0.0
    for _ in range(repeats):
        started = time.perf_counter()
        task()
        elapsed += time.perf_counter() - started
    return elapsed


# The vectorize variant runs last, so the bin columns left on df afterwards
# are the ones it produced.
apply_time = _total_runtime(_bin_with_apply, test_count)
vectorize_time = _total_runtime(_bin_with_vectorize, test_count)

print(f"Apply time: {apply_time / test_count:.6f} seconds")
print(f"Vectorize time: {vectorize_time / test_count:.6f} seconds")

In [None]:
# Count listings per grid cell: first index = latitude bin, second = longitude bin.
heatmap_count = np.zeros((resolution, resolution))
for x, y in zip(df['bin_x'], df['bin_y']):
    heatmap_count[x, y] += 1

# Plot the heatmap
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(heatmap_count, ax=ax, cbar_kws={'label': 'Number of Listings'})

# seaborn draws array rows on the y axis and columns on the x axis, so
# latitude bins run along y and longitude bins along x. The axes are measured
# in bin indices, therefore the mean / median coordinates must be converted to
# bins as well; + 0.5 puts the marker in the middle of its cell.
centre_markers = [
    ('Median location', 'green', median_x, median_y),
    ('Mean location', 'cyan', mean_x, mean_y),
]
for marker_label, marker_colour, marker_lat, marker_lon in centre_markers:
    sns.scatterplot(
        x=[classify_bin(max_y, min_y, resolution, marker_lon) + 0.5],
        y=[classify_bin(max_x, min_x, resolution, marker_lat) + 0.5],
        color=marker_colour,
        s=200,
        marker='X',
        label=marker_label,
        ax=ax,
    )

ax.set_title("Heatmap of Property Listings")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.legend()

# Save BEFORE showing: once plt.show() returns the current figure has been
# released, so a later plt.savefig() would write an empty canvas.
fig.savefig(f"{figdir}heatmap_property_listings.png", dpi=300)
plt.show()

In [None]:
# Wrap the count grid in a DataFrame, take the largest count found in each
# row (first bin index), and print the position of the row holding the
# overall maximum.
heatmap_count = pd.DataFrame(heatmap_count)
row_peaks = heatmap_count.max(axis=1)
print(np.argmax(row_peaks))

In [None]:
# Evenly spaced coordinate values, one per bin index, spanning the observed
# latitude and longitude ranges.
lat_x = np.linspace(start=min_x, stop=max_x, num=resolution)
lon_y = np.linspace(start=min_y, stop=max_y, num=resolution)

print(lat_x)

In [None]:
# Print the latitude for bin index 57 and the longitude for bin index 53.
# NOTE(review): both indices are hard-coded; presumably they identify the
# densest heatmap cell inspected above -- confirm, and consider deriving them
# from heatmap_count so the cell stays correct when the data changes.
print(lat_x[57], lon_y[53])

In [None]:
# Largest value in the 'Price per m2' column; the next cell uses it to look
# up the matching listing(s).
max_prm2 = df['Price per m2'].max()

In [None]:
# Index labels of every listing whose price per m2 equals the maximum,
# followed by the coordinates of those listings (displayed as cell output).
is_max_prm2 = df['Price per m2'].eq(max_prm2)
max_prm2_index = df.index[is_max_prm2]
df.loc[max_prm2_index, ['Latitude', 'Longitude']]


In [None]:
# Scatter plot: Price vs Location
# Each listing is drawn at its (longitude, latitude) and coloured by price.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    df['Longitude'],
    df['Latitude'],
    c=df['Price'],
    cmap='viridis',
    s=30,
    alpha=0.6,
)
fig.colorbar(scatter, ax=ax, label='Price (VND)')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Property Prices by Geographic Location')
fig.savefig(f"{figdir}price_by_location.png", dpi=300)
plt.show()

In [None]:
# Two side-by-side scatter plots against area, drawn from every fifth listing.
fig, (price_ax, unit_price_ax) = plt.subplots(1, 2, figsize=(20, 10))

every_fifth = df.iloc[::5]

price_ax.scatter(every_fifth['Area'], every_fifth['Price'], alpha=0.5)
price_ax.set_xlabel('Area (m²)')
price_ax.set_ylabel('Price (1M VND)')
price_ax.set_title('Price vs Area Relationship')

unit_price_ax.scatter(every_fifth['Area'], every_fifth['Price per m2'], alpha=0.5)
unit_price_ax.set_xlabel('Area (m²)')
unit_price_ax.set_ylabel('Price per m2 (1M VND)')
unit_price_ax.set_title('Price per m2 vs Area Relationship')

fig.savefig(f"{figdir}price_per_m2_vs_area.png", dpi=300)

In [None]:
# Price per m2 distribution by area ranges
# The last edge is open-ended so the '>500m²' label really covers every
# listing above 500 m²; with a finite upper edge of 1000, larger listings
# were assigned NaN and silently dropped from the plot and the saved frame.
# Intervals are right-inclusive (pd.cut default), e.g. exactly 50 m² falls
# in the first range.
area_bin_edges = [0, 50, 100, 200, 500, np.inf]
area_bin_labels = ['<50m²', '50-100m²', '100-200m²', '200-500m²', '>500m²']
df['Area_Range'] = pd.cut(df['Area'], bins=area_bin_edges, labels=area_bin_labels)

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Area_Range', y='Price per m2', ax=ax)
ax.set_title('Price per m² by Property Size')
plt.xticks(rotation=45)
fig.savefig(f"{figdir}price_per_m2_by_size.png", dpi=300)
plt.show()

In [None]:
# Persist the frame, including the columns added in the cells above,
# without writing the index.
output_csv = "../data/guland_hanoi_listings_arcgis3.csv"
df.to_csv(output_csv, index=False)