# Exploratory Data Analysis (EDA)
This notebook performs basic EDA on the `dfcountries.csv` dataset located in the `data` folder.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the dataset: a relative path pointing into the data folder.
data_path = "../data/dfcountries.csv"

# Step 1: load the dataset and preview its first rows.
# NOTE: when the file is missing, only an error message is printed and
# `data` is left undefined for the cells that follow.
try:
    data = pd.read_csv(data_path)
except FileNotFoundError:
    print(f"Error: File not found at {data_path}. Please check the file path and name.")
else:
    # Success path only: confirm the load and show the head of the frame.
    print("Dataset loaded successfully!")
    print("\nFirst 5 Rows of the Dataset:")
    print(data.head())

In [None]:
# Step 2: show the dataset's basic structure (columns, dtypes, non-null counts).
print("\nDataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
data.info()

# Step 3: count the missing values in each column.
print("\nMissing Values:")
print(data.isnull().sum())

In [None]:
# Step 4: basic summary statistics, as computed by DataFrame.describe().
# A single print call emits the heading and the table on separate lines.
print("\nSummary Statistics (for numeric columns):", data.describe(), sep="\n")

In [None]:
# Step 5: data visualisation

# 5.1 / 5.2 Distribution of each coordinate column: longitude first, then
# latitude. A column that is absent from the dataset is simply skipped.
for column, label in (("longitude", "Longitude"), ("latitude", "Latitude")):
    if column not in data.columns:
        continue
    plt.figure(figsize=(8, 5))
    # Missing values are dropped before plotting; kde=True overlays a
    # kernel density estimate on the histogram.
    sns.histplot(data[column].dropna(), kde=True)
    plt.title(f"Distribution of {label}")
    plt.xlabel(label)
    plt.ylabel("Frequency")
    plt.show()

# 5.3 Income-level counts per region (incomeLevel.value), drawn as stacked bars.
if 'region_value' in data.columns and 'incomeLevel.value' in data.columns:
    # Rows are regions, columns are income levels. fill_value=0 makes
    # region/income pairs that never occur count as 0 instead of NaN.
    region_income_counts = (
        data.groupby(['region_value', 'incomeLevel.value'])
        .size()
        .unstack(fill_value=0)
    )
    # DataFrame.plot opens a new figure of its own unless it is handed an
    # Axes, so a separate plt.figure() call would leave an empty figure
    # behind. Create the figure once and draw into its Axes instead.
    fig, ax = plt.subplots(figsize=(12, 6))
    region_income_counts.plot(kind="bar", stacked=True, ax=ax)
    plt.title("Income Levels by Region")
    plt.xlabel("Region")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.legend(title="Income Level", loc="upper right")
    plt.show()

# Step 6: correlation analysis
# Select every numeric dtype ("number") rather than only float64/int64, so
# columns stored as e.g. int32, float32 or nullable integers are not
# silently left out of the correlation matrix.
numeric_data = data.select_dtypes(include="number")
if not numeric_data.empty:
    plt.figure(figsize=(10, 8))
    # annot=True writes each coefficient into its cell, formatted to 2 decimals.
    sns.heatmap(numeric_data.corr(), annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap (Numeric Data)")
    plt.show()
else:
    print("No numeric columns found for correlation analysis.")