In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

In [None]:
# Load the listings dataset; every later cell reads from (and adds columns to) this DataFrame.
df = pd.read_csv("../data/guland_hanoi_listings_arcgis2.csv")
# Directory prefix for saved figures. It is concatenated directly with file names
# in the savefig calls below, so the trailing slash is required.
figdir = "../data/figures/"

In [None]:
# Summarise the loaded columns (names, dtypes, non-null counts) as a sanity check.
df.info()

In [None]:
# Earlier version kept for reference: plain Euclidean distance measured in degrees.
# def distance_from_point(lat1, lon1, lat2, lon2):
#     return np.sqrt((lat1 - lat2)**2 + (lon1 - lon2)**2)
# Reference point stored as [latitude, longitude]; it is used below as the centre
# for the distance_from_center column.
# NOTE(review): presumably a location in Ba Dinh district, Hanoi - confirm the source of these coordinates.
badinh = [21.0372556, 105.8353794]

def distance_from_point(lat1, lon1, lat2, lon2):
    """Approximate distance in kilometres between two lat/lon points.

    Uses a flat-earth approximation: one degree of latitude is taken as 111 km,
    and one degree of longitude as 111 km scaled by the cosine of the mean
    latitude of the two points.

    Parameters
    ----------
    lat1, lon1 : coordinates of the first point, in degrees.
    lat2, lon2 : coordinates of the second point, in degrees.

    Returns
    -------
    The distance in kilometres. Only NumPy arithmetic is used, so scalars and
    arrays are both accepted.
    """
    km_per_degree = 111
    midpoint_lat_rad = np.deg2rad((lat1 + lat2) / 2)
    north_south_km = (lat1 - lat2) * km_per_degree
    # East-west degrees shrink with latitude, hence the cosine factor.
    east_west_km = (lon1 - lon2) * km_per_degree * np.cos(midpoint_lat_rad)
    return np.sqrt(north_south_km**2 + east_west_km**2)

# Bounding box of all listings (x = longitude, y = latitude). Later cells use it
# to bin coordinates and to set the heatmap extents.
min_x, max_x = df['Longitude'].min(), df['Longitude'].max()
min_y, max_y = df['Latitude'].min(), df['Latitude'].max()
# Median and mean listing coordinates, kept for inspection.
median_x, median_y = df['Longitude'].median(), df['Latitude'].median()
mean_x, mean_y = df['Longitude'].mean(), df['Latitude'].mean()

# Number of bins per axis used by the gridded heatmaps.
resolution = 100

lat_centre = badinh[0]
lon_centre = badinh[1]
# distance_from_point is built purely from NumPy arithmetic, so it broadcasts over
# whole columns in one call. Wrapping it in np.vectorize only adds a per-row
# Python loop without changing the result.
df['distance_from_center'] = distance_from_point(
    df['Latitude'].to_numpy(), df['Longitude'].to_numpy(), lat_centre, lon_centre
)

In [None]:
# Show the median listing location in (latitude, longitude) order.
print(median_y, median_x)

In [None]:
# Numeric columns iterated over by the distribution, correlation, box-plot and pair-plot cells.
num_cols_name = ['Price', 'Area', 'Latitude', 'Longitude', 'Price per m2', 'distance_from_center']

In [None]:
# One histogram with a KDE overlay per numeric column, laid out on a 2x3 grid.
fig_hist = plt.figure(figsize=(15, 10))

for position, col in enumerate(num_cols_name, start=1):
    ax = fig_hist.add_subplot(2, 3, position)
    sns.histplot(x=df[col], kde=True, ax=ax)
    # The first line on the axes is the KDE curve; recolour it so it stands out from the bars.
    ax.get_lines()[0].set_color('red')
    ax.set_title(f'Distribution of {col}')

# Save a raster and a vector copy before displaying.
fig_hist.savefig(f"{figdir}num_cols_distribution.png", dpi=300)
fig_hist.savefig(f"{figdir}num_cols_distribution.svg")
plt.show()

In [None]:
# x_range = ((df['Longitude'] - min_x) / (max_x - min_x) * (resolution - 1)).astype(int)
# y_range = ((df['Latitude'] - min_y) / (max_y - min_y) * (resolution - 1)).astype(int)

# heatmap_count = np.zeros((resolution, resolution))
# for x, y in zip(x_range, y_range):
#     heatmap_count[x, y] += 1

def classify_bin(max, min, resolution, value):
    """Map ``value`` to an integer bin index along one axis.

    The range ``[min, max]`` is scaled onto ``[0, resolution - 1]`` and the
    result is truncated to an int, so ``min`` falls in bin 0 and ``max`` falls
    in bin ``resolution - 1``.

    Parameters
    ----------
    max, min : upper and lower bounds of the axis. The names shadow the
        builtins but are kept so existing callers keep working.
    resolution : number of bins along the axis.
    value : the coordinate to classify.

    Returns
    -------
    int
        The bin index. When ``max == min`` there is only one possible
        position, so 0 is returned instead of dividing by zero.
    """
    span = max - min
    if span == 0:
        # Degenerate axis: all values are identical, so everything belongs to bin 0.
        return 0
    return int((value - min) / span * (resolution - 1))

# Wrap the scalar helper once so it can be applied element-wise to a whole column.
to_bin = np.vectorize(classify_bin)

# bin_x indexes longitude and bin_y indexes latitude on the resolution x resolution grid.
df['bin_x'] = to_bin(max_x, min_x, resolution, df['Longitude'])
df['bin_y'] = to_bin(max_y, min_y, resolution, df['Latitude'])

In [None]:
# Pairwise correlations between the numeric columns.
correlation_matrix = df[num_cols_name].corr()

fig_corr = plt.figure(figsize=(10, 8))
# Annotate each cell with its coefficient rounded to two decimals.
heatmap_style = dict(annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
ax_corr = sns.heatmap(correlation_matrix, **heatmap_style)
ax_corr.set_title('Correlation Matrix of Hanoi Real Estate Dataset')
fig_corr.savefig(f"{figdir}correlation_matrix.png", dpi=300)
plt.show()

In [None]:
# One horizontal box plot per numeric column on a 2x3 grid, to expose outliers.
fig_box = plt.figure(figsize=(15, 10))
for position, col in enumerate(num_cols_name, start=1):
    ax_box = fig_box.add_subplot(2, 3, position)
    sns.boxplot(x=df[col], ax=ax_box)
    ax_box.set_title(f'Distribution of {col}')

fig_box.savefig(f"{figdir}num_cols_boxplot.png", dpi=300)
plt.show()

In [None]:
# Show the bounding box: longitude min/max followed by latitude min/max.
print(min_x, max_x, min_y, max_y)

In [None]:
# Count listings per grid cell. Rows index the latitude bin (bin_y) and columns
# the longitude bin (bin_x), so row 0 is the southernmost band.
heatmap_count = np.zeros((resolution, resolution))
for lon_bin, lat_bin in zip(df['bin_x'], df['bin_y']):
    heatmap_count[lat_bin, lon_bin] += 1

# Plot the heatmap
plt.figure(figsize = (20,20))
# origin='lower' draws row 0 at the bottom of the axes, so the image agrees with
# the south-to-north extent. The default origin='upper' would render the map
# upside-down relative to the latitude axis.
plt.imshow(heatmap_count, extent=[min_x, max_x, min_y, max_y], aspect='auto', cmap='inferno', interpolation='nearest', origin='lower')
plt.title("Heatmap of Property Listings")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.colorbar(label='Number of Listings')
plt.savefig(f"{figdir}heatmap_property_listings.png", dpi=300)
plt.show()

In [None]:
# Re-wrap the count grid as a DataFrame, take the largest count in each row,
# and print the position of the row that holds the overall busiest cell.
heatmap_count = pd.DataFrame(heatmap_count)
row_peaks = heatmap_count.max(axis=1)
print(np.argmax(row_peaks))

In [None]:
# Evenly spaced coordinates spanning the bounding box, one per bin. classify_bin
# maps a value to int((value - min) / (max - min) * (resolution - 1)), so element k
# of each array is the lower edge of bin k.
lon_x = np.linspace(min_x, max_x, resolution)
lat_y = np.linspace(min_y, max_y, resolution)

In [None]:
# Load the base-map image. cv2.imread returns None (it does not raise) when the file cannot be read.
# NOTE(review): later cells load '../data/figures/hanoi_map.png' instead - confirm which path is correct.
hanoi_map = cv2.imread('../data/hanoi_map.png')

In [None]:
# Mean price per m2 for every grid cell; cells without listings stay at 0.
pricem2_heatmap = np.zeros((resolution, resolution))

# A single groupby pass yields the mean of every non-empty bin. This replaces
# building resolution**2 boolean masks over the whole DataFrame.
mean_price_by_bin = df.groupby(['bin_x', 'bin_y'])['Price per m2'].mean()
for (i, j), mean_price in mean_price_by_bin.items():
    # Flip the latitude bin so row 0 is the northernmost band, which is where
    # imshow draws row 0 with its default origin='upper'.
    pricem2_heatmap[resolution-j-1, i] = mean_price

# Plot the heatmap
plt.figure(figsize = (20,20))
# sns.heatmap(pricem2_heatmap)
# plt.imshow(hanoi_map, extent=[105.66, 105.96, 20.9, 21.13], aspect='auto')
plt.imshow(pricem2_heatmap, extent=[min_x, max_x, min_y, max_y], aspect='auto', cmap='inferno', interpolation='nearest')
plt.title("Heatmap of Price per m2")
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.colorbar(label='Price per m2')

plt.savefig(f"{figdir}heatmap_price_per_m2.png", dpi=300)
plt.show()


In [None]:
print(pricem2_heatmap.shape)

# Locate the grid cell with the highest mean price. Rows of pricem2_heatmap are
# stored north-to-south, so the latitude bin is resolution - row - 1.
max_idx = np.argmax(pricem2_heatmap)
row, col = np.unravel_index(max_idx, pricem2_heatmap.shape)
print(row, col)

# lon_x / lat_y hold the lower edge of each bin. The upper edge is derived from
# the bin width rather than the next array element, because lon_x[col + 1] and
# lat_y[resolution - row] run past the end of the arrays when the cell sits in
# the last column or the top row.
lon_step = lon_x[1] - lon_x[0]
lat_step = lat_y[1] - lat_y[0]
lon_lo = lon_x[col]
lat_lo = lat_y[resolution - row - 1]
lon_hi = lon_lo + lon_step
lat_hi = lat_lo + lat_step

print('Coordinates of highest average Price per m2:')
print(lon_lo, lat_lo)
print(lon_hi, lat_hi)

print("The centre is")
print((lon_lo + lon_hi) / 2, (lat_lo + lat_hi) / 2)

In [None]:
# Filter cells with a minimum number of data points
min_points = 1  # Adjust this threshold as needed
filtered_heatmap = pricem2_heatmap.copy()

# Count listings per cell with one groupby instead of masking the whole
# DataFrame once per cell. The grid uses the same layout as pricem2_heatmap
# (row = flipped latitude bin, column = longitude bin).
cell_counts = np.zeros((resolution, resolution), dtype=int)
for (i, j), n_listings in df.groupby(['bin_x', 'bin_y']).size().items():
    cell_counts[resolution - j - 1, i] = n_listings

# Blank out cells that do not have enough listings to trust their mean.
filtered_heatmap[cell_counts < min_points] = 0

max_idx = np.argmax(filtered_heatmap)
row, col = np.unravel_index(max_idx, filtered_heatmap.shape)

# The cell centre is the lower edge plus half a bin width. Indexing the next
# array element (lon_x[col + 1], lat_y[resolution - row]) would raise IndexError
# for cells in the last column or the top row.
lat_centre_of_cell = lat_y[resolution - row - 1] + (lat_y[1] - lat_y[0]) / 2
lon_centre_of_cell = lon_x[col] + (lon_x[1] - lon_x[0]) / 2

# Now print information about this cell
print("Highest average Price per m2:", filtered_heatmap[row, col])
print("Number of data points in this cell:", cell_counts[row, col])
print("Center coordinates:", lat_centre_of_cell, lon_centre_of_cell)

In [None]:
# 1. Translate the heatmap cell found above back into bin indices
bin_x = col  # heatmap column is the longitude bin
bin_y = resolution - row - 1  # heatmap rows are flipped relative to the latitude bin

# 2. Select every listing that falls inside that bin
in_target_bin = (df['bin_x'] == bin_x) & (df['bin_y'] == bin_y)
high_price_listing = df[in_target_bin]

# 3. Print the link of the first listing, then display the key columns of all of them
first_link = high_price_listing['Link'].iloc[0]
print(first_link)
high_price_listing[['Longitude', 'Latitude', 'Price per m2', 'Link', 'Address']]

In [None]:
# Print the coordinates and price of the single most expensive listing per m2.
location_price_cols = ['Longitude','Latitude','Price per m2']
by_price_desc = df.sort_values(by='Price per m2', ascending=False)
print(by_price_desc[location_price_cols].head(1))

In [None]:
# plt.figure(figsize=(11,10))
# plt.scatter(df['Longitude'], df['Latitude'], s=df['Price per m2']/6, alpha=0.3, label = "Trieu", c=df['Price per m2'], cmap="jet")

plt.figure(figsize=(11,10))
# Marker size scales with price per m2; colour encodes distance from the centre point.
marker_sizes = df['Price per m2']/6
plt.scatter(
    df['Longitude'],
    df['Latitude'],
    s=marker_sizes,
    alpha=0.3,
    label="Trieu",
    c=df['distance_from_center'],
    cmap="inferno",
)



In [None]:
map_path = '../data/figures/hanoi_map.png'
hanoi_map = cv2.imread(map_path)
# cv2.imread signals an unreadable file by returning None rather than raising,
# so fail here with a clear message instead of deep inside imshow.
if hanoi_map is None:
    raise FileNotFoundError(f"Could not read base map image: {map_path}")
# OpenCV returns channels in BGR order while matplotlib expects RGB. Convert a
# copy for display, otherwise the red and blue channels of the map are swapped.
hanoi_map_rgb = cv2.cvtColor(hanoi_map, cv2.COLOR_BGR2RGB)

# df = df.sort_values(by='Price per m2', ascending=True)

plt.figure(figsize=(11,10), dpi=300)
# The extent pins the image to the longitude/latitude box it depicts so the
# scatter points land on the right places.
plt.imshow(hanoi_map_rgb, extent=[105.66, 105.96, 20.9, 21.13], aspect='auto')
plt.scatter(df['Longitude'], df['Latitude'], s=df['Price per m2']/6, alpha=0.3, label = "Trieu", c=df['Price per m2'], cmap="jet")
plt.title("Scatter plot of Price per m2 on Hanoi map")
plt.savefig(f"{figdir}scatter_price_per_m2_on_map.png", dpi=300)
plt.show()

In [None]:
map_path = '../data/figures/hanoi_map.png'
hanoi_map = cv2.imread(map_path)
# cv2.imread returns None (no exception) when the file cannot be read.
if hanoi_map is None:
    raise FileNotFoundError(f"Could not read base map image: {map_path}")
# OpenCV loads BGR; matplotlib displays RGB. Convert a copy for plotting.
hanoi_map_rgb = cv2.cvtColor(hanoi_map, cv2.COLOR_BGR2RGB)

# For each distinct coordinate pair keep only its most expensive listing, then
# take the 100 priciest of those. idxmax returns the row label of each group's
# maximum, which avoids a Python-level apply over every group.
top_row_labels = df.groupby(['Latitude', 'Longitude'])['Price per m2'].idxmax()
df_top100_pm2 = df.loc[top_row_labels].sort_values(by='Price per m2', ascending=False).head(100)

plt.figure(figsize=(11,10), dpi=300)
plt.imshow(hanoi_map_rgb, extent=[105.66, 105.96, 20.9, 21.13], aspect='auto')
plt.scatter(df_top100_pm2['Longitude'], df_top100_pm2['Latitude'], s=df_top100_pm2['Price per m2']/6, alpha=0.9, label = "Trieu", c=df_top100_pm2['Price per m2'], cmap="jet")
plt.title("Scatter plot of 100 highest Price per m2 on Hanoi map")
# Use a dedicated file name: the all-listings scatter cell already saves to
# scatter_price_per_m2_on_map.png, and reusing that name overwrote it.
plt.savefig(f"{figdir}scatter_top100_price_per_m2_on_map.png", dpi=300)
plt.show()

In [None]:
# No plt.clf() here: with no figure open it would create (and display) an
# empty figure before the real one is built.
plt.figure(figsize=(11,10), dpi=300)
# hanoi_map was loaded with cv2.imread (BGR channel order); matplotlib expects
# RGB, so convert a copy for display.
hanoi_map_rgb = cv2.cvtColor(hanoi_map, cv2.COLOR_BGR2RGB)
plt.imshow(hanoi_map_rgb, extent=[105.66, 105.96, 20.9, 21.13], aspect='auto')
# Each hexagon shows the mean price per m2 of the listings that fall inside it.
plt.hexbin(df['Longitude'], df['Latitude'], C=df['Price per m2'], gridsize=50, cmap='inferno', alpha=0.8, reduce_C_function=np.mean)
plt.title("Hexbin of Price per m2 on Hanoi map")
plt.savefig(f"{figdir}hexbin_price_per_m2_on_map.png", dpi=300)
plt.show()

In [None]:
# Count how many listings share each exact (Longitude, Latitude) pair and
# order the pairs from most to least frequent.
location_counts = (
    df.groupby(['Longitude', 'Latitude'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

# Top 10 most frequent locations
print(location_counts.head(10))

# Number of coordinate pairs shared by more than one listing
n_repeated = (location_counts['count'] > 1).sum()
print(f"Points that appear multiple times: {n_repeated}")

In [None]:
# Look up the listing links recorded at the most frequent coordinate pair.
most_frequent_location = location_counts.iloc[0]
lat = most_frequent_location['Latitude']
lon = most_frequent_location['Longitude']
at_location = (df['Latitude'] == lat) & (df['Longitude'] == lon)
# Show at most the first ten links as a plain array.
df.loc[at_location, 'Link'].head(10).values

In [None]:
# Price per m2 distribution by area ranges
# The last edge is open-ended (np.inf) so listings larger than 1000 m² land in
# the '>500m²' bucket. With a hard upper edge of 1000 they were assigned NaN
# and silently dropped from the plot. Intervals are right-closed, e.g. (0, 50].
df['Area_Range'] = pd.cut(df['Area'], bins=[0, 50, 100, 200, 500, np.inf], labels=['<50m²', '50-100m²', '100-200m²', '200-500m²', '>500m²'])
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='Area_Range', y='Price per m2')
plt.title('Price per m² by Property Size')
plt.xticks(rotation=45)
plt.savefig(f"{figdir}price_per_m2_by_size.png", dpi=300)
plt.show()

In [None]:
# Scatter-matrix of every numeric column against every other one.
pair_columns = df[num_cols_name]
scatter_style = {'s': 10, 'alpha': 0.3}  # smaller, more transparent points
diagonal_style = {'fill': True}
sns.pairplot(pair_columns, plot_kws=scatter_style, diag_kws=diagonal_style)

# pairplot leaves its own figure as the current one; enlarge it before laying out.
pair_fig = plt.gcf()
pair_fig.set_size_inches(18, 18)  # even larger figure
plt.tight_layout()
plt.show()

In [None]:
# Persist the DataFrame without its index. By this point it also carries the columns
# added in this notebook (distance_from_center, bin_x, bin_y, Area_Range).
df.to_csv("../data/guland_hanoi_listings_arcgis3.csv", index=False)