# Exploratory Data Analysis

## Init: Prepare Packages and Configuration

In [None]:
# Optional one-time setup: uncomment the line below to install the
# spatial-analysis dependencies used in section 1 (geopandas, libpysal, esda).
# ! pip install libpysal
# ! pip install geopandas libpysal esda matplotlib



Collecting geopandas
  Downloading geopandas-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting libpysal
  Downloading libpysal-4.13.0-py3-none-any.whl.metadata (4.8 kB)
Collecting esda
  Downloading esda-2.7.0-py3-none-any.whl.metadata (2.0 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting numpy>=1.24 (from geopandas)
  Using cached numpy-2.2.6-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.11.0-cp311-cp311-win_amd64.whl.metadata (5.4 kB)
Collecting pandas>=2.0.0 (from geopandas)
  Using cached pandas-2.2.3-cp311-cp311-win_amd64.whl.metadata (19 kB)
Collecting pyproj>=3.5.0 (from geopandas)
  Downloading pyproj-3.7.1-cp311-cp311-win_amd64.whl.metadata (31 kB)
Collecting shapely>=2.0.0 (from geopandas)
  Downloading shapely-2.1.1-cp311-cp311-win_amd64.whl.metadata (7.0 kB)
Collecting beautifulsoup4>=4.10 (from libpysal)
  Downloading beautifulsoup4-4.13.4-py3-no

In [None]:
pip install notebook ipykernel
python -m ipykernel install --user --name=spatial_env --display-name "Python (spatial_env)"


In [2]:
# Cancel the comment to install all the packages and libraries needed.
! pip install rasterio matplotlib rasterstats ipynbname imageio tqdm rasterstats
! pip install numpy==1.24.4CURR_PATH

# Configuration: resolve the project directories and make the custom modules importable
from pathlib import Path
import sys

# NOTE: Path().resolve() is the kernel's current working directory, not the
# notebook file itself; the layout below assumes the kernel was started in a
# direct sub-folder of the repository -- TODO confirm when running from elsewhere.
CURR_PATH = Path().resolve()            # current working directory
REPO_PATH = CURR_PATH.parent            # repository root (parent of the working directory)
DATA_PATH = REPO_PATH / "data"          # path for saving the data
DEMO_PATH = DATA_PATH / "demo-data"     # path for demo purpose

SRC_PATH = REPO_PATH / "src"    # path for other sources
# Add src to the module search path only once, so re-running this cell does
# not keep appending duplicate entries to sys.path.
if str(SRC_PATH) not in sys.path:
    sys.path.append(str(SRC_PATH))

# Import customised scripts (wildcard imports: all public names of these
# modules are placed directly in the notebook namespace)
from aggregation import*
from missingvalue import*
from visualization import*

# print(REPO_PATH)

Collecting rasterio
  Using cached rasterio-1.4.3-cp311-cp311-win_amd64.whl.metadata (9.4 kB)
Collecting rasterstats
  Downloading rasterstats-0.20.0-py3-none-any.whl.metadata (4.2 kB)
Collecting ipynbname
  Downloading ipynbname-2024.1.0.0-py3-none-any.whl.metadata (1.9 kB)
Collecting imageio
  Using cached imageio-2.37.0-py3-none-any.whl.metadata (5.2 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting affine (from rasterio)
  Using cached affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting attrs (from rasterio)
  Downloading attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting click>=4.0 (from rasterio)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting cligj>=0.5 (from rasterio)
  Using cached cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Using cached click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Collecting fiona (from rasterstats)
  Downloading fiona-1.

ERROR: Invalid requirement: 'numpy==1.24.4CURR_PATH': Expected end or semicolon (after version specifier)
    numpy==1.24.4CURR_PATH
         ~~~~~~~~~^


## 1 Univariate Data Analysis

### 1.1 NO2 Spatial Analysis

In [7]:
import geopandas as gpd
import pandas as pd
import glob
import matplotlib.pyplot as plt
from libpysal.weights import Queen
from esda.moran import Moran, Moran_Local

# ---------- Parameters ----------
addis_gpkg_dir = DATA_PATH / "addis-mesh-data"  # folder containing the GPKG files
no2_column = "no2_mean"        # name of the NO2 column
simplify_geometry = False      # whether to simplify geometries (optional)

# ---------- Step 1: merge all GPKG files ----------
print("正在加载 GPKG 文件...")
# Sort the matches: glob yields files in an unspecified order, and a sorted
# list keeps the row order of the merged frame reproducible (and reusable,
# unlike a one-shot generator).
files = sorted(addis_gpkg_dir.glob("*.gpkg"))
if not files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No .gpkg files found in {addis_gpkg_dir}")
gdfs = [gpd.read_file(f) for f in files]
merged_gdf = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))


SystemError: initialization of _internal failed without raising an exception

In [None]:

# ---------- Step 2: extract the NO2 column and drop missing values ----------
print("清洗数据...")
no2_gdf = (
    merged_gdf[['geometry', no2_column]]
    .copy()
    .dropna(subset=[no2_column])
    .reset_index(drop=True)
)

# Optional: simplify geometries (speeds up building the adjacency structure)
if simplify_geometry:
    no2_gdf['geometry'] = no2_gdf['geometry'].simplify(0.001)

# ---------- Step 3: build the Queen contiguity weights ----------
print("构建空间邻接矩阵（Queen 邻接）...")
# use_index=False states the current default explicitly: observations are
# identified by position, which silences the FutureWarning about the default
# changing. The frame's index was reset to 0..n-1 in the cleaning step, so
# positional ids and index labels coincide.
w = Queen.from_dataframe(no2_gdf, use_index=False)
w.transform = 'r'  # row-standardized weights

# ---------- Step 4: global Moran's I ----------
print("计算 Moran's I...")
moran = Moran(no2_gdf[no2_column], w)
print("Moran’s I: {:.4f}".format(moran.I))
print("p-value (normal approximation): {:.4f}".format(moran.p_norm))

# ---------- Step 5: local Moran's I (LISA) ----------
print("计算局部自相关 (Moran_Local)...")
moran_local = Moran_Local(no2_gdf[no2_column], w)
# Kept unchanged for backward compatibility: despite its name, this column
# holds the raw local Moran statistic, not a boolean hotspot flag.
no2_gdf['Is_Hotspot'] = moran_local.Is
# Local Moran's I of each polygon. The statistic is stored as-is: multiplying
# it by the quadrant label would rescale values by an arbitrary factor of 1-4
# and yield neither the statistic nor the quadrant code.
no2_gdf['Local_I'] = moran_local.Is
# Quadrant code of each polygon (1=HH, 2=LH, 3=LL, 4=HL)
no2_gdf['Quadrant'] = moran_local.q

# ---------- Step 6: draw the hotspot map ----------
print("绘制热点图...")
fig, ax = plt.subplots(figsize=(10, 8))  # a single axes is the default grid
no2_gdf.plot(ax=ax, column='Local_I', cmap='coolwarm', legend=True)
ax.set_title("局部空间自相关 (LISA): no2", fontsize=14)
ax.set_axis_off()  # hide the axes frame, ticks and labels
fig.tight_layout()
plt.show()


### 1.2 NO2 Temporal Analysis

## 2 Multivariate Data Analysis

### 2.1 Correlation Matrix