In [3]:
import pandas as pd


# --- Load data ---
# Single editable root for the raw weather files, so the notebook can be
# pointed at another copy of the data without touching every read call.
DATA_ROOT = "/Users/user/Desktop"

# The HCM files are Excel workbooks; the Hanoi (HN) files are CSV.
hcm_hourly = pd.read_excel(f"{DATA_ROOT}/HCM/HCMWeatherHourly.xlsx")
hcm_daily = pd.read_excel(f"{DATA_ROOT}/HCM/HCMWeatherDaily.xlsx")
hn_daily = pd.read_csv(f"{DATA_ROOT}/HN/Hanoi Daily.csv")
hn_hourly = pd.read_csv(f"{DATA_ROOT}/HN/Hanoi Hourly.csv")

In [4]:


# --- Cleaning and normalization function ---
def clean_dataset(df):
    """Return a cleaned copy of ``df``; the frame passed in is not modified.

    Two normalizations are applied:

    * If a ``datetime`` column is present, it is parsed with
      ``pd.to_datetime`` (values that cannot be parsed become ``NaT``)
      and then moved into the index.
    * Each of the known label columns that is present is cast to the
      ``category`` dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame to clean.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    cleaned = df.copy()

    # Step 1: parse timestamps and use them as the row index.
    if 'datetime' in cleaned.columns:
        # errors='coerce' yields NaT for unparseable values instead of raising.
        parsed = pd.to_datetime(cleaned['datetime'], errors='coerce')
        cleaned = cleaned.assign(datetime=parsed).set_index('datetime')

    # Step 2: cast whichever label columns exist to the category dtype.
    label_columns = ('name', 'conditions', 'icon', 'preciptype', 'stations', 'description')
    present_labels = [column for column in cleaned.columns if column in label_columns]
    for column in present_labels:
        cleaned[column] = cleaned[column].astype('category')

    return cleaned

In [5]:
# --- Apply cleaning ---
# Run each raw frame through clean_dataset; the order of the targets on the
# left matches the order of the raw frames on the right.
hcm_hourly_clean, hcm_daily_clean, hn_daily_clean, hn_hourly_clean = [
    clean_dataset(raw_frame)
    for raw_frame in (hcm_hourly, hcm_daily, hn_daily, hn_hourly)
]


In [6]:
# --- Function to summarize data types ---
def _dtype_category(dtype):
    """Map one column dtype to "Categorical", "Datetime", "Numerical" or "Other"."""
    if isinstance(dtype, pd.CategoricalDtype):
        return "Categorical"
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return "Datetime"
    # Use dtype predicates rather than substring matching on the dtype name:
    # nullable "Int64"/"Float64" are capitalised, so a lowercase "int"/"float"
    # substring test would label them "Other".
    if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
        return "Numerical"
    return "Other"


def summarize_types(df, name):
    """Build a per-column dtype summary of ``df``.

    Only the columns of ``df`` are summarized; its index is not included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    name : str
        Label written into the ``Dataset`` column of every row.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Column``,
        ``DataType``, ``Non-Null Count``, ``Category`` and ``Dataset``.
    """
    summary = pd.DataFrame(df.dtypes, columns=["DataType"])
    summary["Non-Null Count"] = df.notnull().sum()
    summary["Category"] = [_dtype_category(dtype) for dtype in df.dtypes]
    summary["Dataset"] = name
    # The column labels live in the index until here; expose them as "Column".
    return summary.reset_index().rename(columns={"index": "Column"})

In [7]:
# --- Combine all summaries ---
# One summary table per cleaned frame, stacked in this order under a fresh
# 0..n-1 row index.
summaries = pd.concat(
    [
        summarize_types(frame, label)
        for frame, label in (
            (hcm_hourly_clean, "HCM Hourly"),
            (hcm_daily_clean, "HCM Daily"),
            (hn_daily_clean, "Hanoi Daily"),
            (hn_hourly_clean, "Hanoi Hourly"),
        )
    ],
    ignore_index=True,
)

In [8]:
# --- Print summary ---
# Banner line followed by the first 30 rows of the combined summary table.
print("=== SUMMARY OF DATA TYPES AFTER CLEANING ===", summaries.head(30), sep="\n")

=== SUMMARY OF DATA TYPES AFTER CLEANING ===
              Column  DataType  Non-Null Count     Category     Dataset
0               name  category           94434  Categorical  HCM Hourly
1               temp   float64           94434    Numerical  HCM Hourly
2          feelslike   float64           94434    Numerical  HCM Hourly
3                dew   float64           94434    Numerical  HCM Hourly
4           humidity   float64           94434    Numerical  HCM Hourly
5             precip   float64           94395    Numerical  HCM Hourly
6         precipprob     int64           94434    Numerical  HCM Hourly
7         preciptype  category           12667  Categorical  HCM Hourly
8               snow   float64           94392    Numerical  HCM Hourly
9          snowdepth   float64           94392    Numerical  HCM Hourly
10          windgust   float64           94385    Numerical  HCM Hourly
11         windspeed   float64           94434    Numerical  HCM Hourly
12           winddi

In [9]:
# --- Data quality checking ---
def check_data_quality(df, name):
    """Print a short data-quality report for ``df`` under the heading ``name``.

    The report contains the missing-value count per column, the number of
    duplicated rows, and the full set of descriptive statistics for the
    first five variables that ``df.describe()`` reports.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str
        Heading printed at the top of the report.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print(f"\n===== {name} =====")
    print("Missing values per column:")
    print(df.isna().sum())
    # NOTE: DataFrame.duplicated compares column values only; index labels
    # are not part of the comparison.
    print(f"\nNumber of duplicated rows: {df.duplicated().sum()}")
    print("\nDescriptive statistics (first 5 variables):")
    # Slice columns, not rows: .head() on the describe() table would keep the
    # first five statistic rows for every variable and drop 50%/75%/max.
    print(df.describe().iloc[:, :5])

In [10]:
# --- Run data quality checks ---
# Each cleaned frame is paired with its report heading; reports are printed
# in the order listed.
for dataset, name in zip(
    (hcm_hourly_clean, hcm_daily_clean, hn_daily_clean, hn_hourly_clean),
    ("HCM Hourly", "HCM Daily", "Hanoi Daily", "Hanoi Hourly"),
):
    check_data_quality(dataset, name)

print("\nData diagnosis and cleaning completed!")


===== HCM Hourly =====
Missing values per column:
name                    0
temp                    0
feelslike               0
dew                     0
humidity                0
precip                 39
precipprob              0
preciptype          81767
snow                   42
snowdepth              42
windgust               49
windspeed               0
winddir                10
sealevelpressure        0
cloudcover              0
visibility            198
solarradiation         36
solarenergy            36
uvindex                36
severerisk          61603
conditions              0
icon                    0
stations                0
dtype: int64

Number of duplicated rows: 79

Descriptive statistics (first 5 variables):
               temp     feelslike           dew      humidity        precip  \
count  94434.000000  94434.000000  94434.000000  94434.000000  94395.000000   
mean      28.448569     31.648035     23.494331     76.545719      0.217408   
std        2.967828      