In [1]:
import pandas as pd


# --- Load data ---
# Single place to change when the CSV exports live somewhere else; both files
# are read from this directory.
DATA_DIR = "/Users/user/Desktop/HN"

hn_daily = pd.read_csv(f"{DATA_DIR}/Hanoi Daily.csv")
hn_hourly = pd.read_csv(f"{DATA_DIR}/Hanoi Hourly.csv")

In [2]:


# --- Cleaning and normalization function ---
def clean_dataset(
    df,
    datetime_col='datetime',
    categorical_cols=(
        'name', 'conditions', 'icon', 'preciptype', 'stations', 'description'
    ),
):
    """Return a cleaned copy of ``df``; the input frame is never modified.

    Two normalization steps are applied:

    1. If ``datetime_col`` is a column, it is parsed with ``pd.to_datetime``
       and becomes the index. Values that cannot be parsed are coerced to
       ``NaT`` rather than raising.
    2. Every column listed in ``categorical_cols`` that is present in the
       frame is converted to the ``category`` dtype. Listed columns that are
       absent are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset to clean.
    datetime_col : str, default 'datetime'
        Name of the timestamp column to parse and promote to the index.
    categorical_cols : str or iterable of str
        Column names to convert to ``category``. A single name may be passed
        as a plain string.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.
    """
    cleaned = df.copy()

    # 1. Convert datetime and set as index
    if datetime_col in cleaned.columns:
        cleaned[datetime_col] = pd.to_datetime(cleaned[datetime_col], errors='coerce')
        cleaned = cleaned.set_index(datetime_col)

    # 2. Convert likely categorical columns to 'category' dtype
    # A bare string would otherwise be matched by substring in the filter below.
    if isinstance(categorical_cols, str):
        wanted = [categorical_cols]
    else:
        wanted = list(categorical_cols)
    for col in [c for c in cleaned.columns if c in wanted]:
        cleaned[col] = cleaned[col].astype('category')

    return cleaned

In [3]:
# --- Apply cleaning ---
# Each raw frame is passed through clean_dataset; the raw frames stay
# available under their original names.
hn_daily_clean = hn_daily.pipe(clean_dataset)
hn_hourly_clean = hn_hourly.pipe(clean_dataset)


In [4]:
# --- Function to summarize data types ---
def summarize_types(df, name):
    """Build a one-row-per-column overview of the dtypes in ``df``.

    Only the frame's columns are summarized; the index is not included.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.
    name : str
        Label written into the ``Dataset`` column of every row.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``Column`` (column name), ``DataType`` (dtype),
        ``Non-Null Count``, ``Category`` (one of "Categorical", "Datetime",
        "Numerical", "Other") and ``Dataset``.
    """
    def _classify(dtype):
        # Use dtype predicates rather than matching on the dtype's string
        # name: nullable "Int64"/"Float64" are capitalised and would be
        # missed, while "interval[...]" contains "int" and would be
        # mislabelled as numerical.
        if isinstance(dtype, pd.CategoricalDtype):
            return "Categorical"
        if pd.api.types.is_datetime64_any_dtype(dtype):
            return "Datetime"
        if pd.api.types.is_integer_dtype(dtype) or pd.api.types.is_float_dtype(dtype):
            return "Numerical"
        return "Other"

    summary = df.dtypes.to_frame(name="DataType")
    summary["Non-Null Count"] = df.notnull().sum()
    summary["Category"] = summary["DataType"].apply(_classify)
    summary["Dataset"] = name
    # rename_axis guarantees the "Column" header even when the frame's column
    # axis already carries a name (reset_index would otherwise reuse that name).
    return summary.rename_axis("Column").reset_index()

In [5]:
# --- Combine all summaries ---
# One summary table per cleaned dataset, stacked under a fresh 0..n-1 index
# (daily rows first, then hourly).
summaries = pd.concat(
    [
        summarize_types(frame, label)
        for frame, label in (
            (hn_daily_clean, "Hanoi Daily"),
            (hn_hourly_clean, "Hanoi Hourly"),
        )
    ],
    ignore_index=True,
)

In [6]:
# --- Print summary ---
# Header line followed by the first 30 rows of the combined dtype summary.
print(
    "=== SUMMARY OF DATA TYPES AFTER CLEANING ===",
    summaries.head(30),
    sep="\n",
)

=== SUMMARY OF DATA TYPES AFTER CLEANING ===
              Column  DataType  Non-Null Count     Category      Dataset
0               name  category            3927  Categorical  Hanoi Daily
1            tempmax   float64            3927    Numerical  Hanoi Daily
2            tempmin   float64            3927    Numerical  Hanoi Daily
3               temp   float64            3927    Numerical  Hanoi Daily
4       feelslikemax   float64            3927    Numerical  Hanoi Daily
5       feelslikemin   float64            3927    Numerical  Hanoi Daily
6          feelslike   float64            3927    Numerical  Hanoi Daily
7                dew   float64            3927    Numerical  Hanoi Daily
8           humidity   float64            3927    Numerical  Hanoi Daily
9             precip   float64            3927    Numerical  Hanoi Daily
10        precipprob     int64            3927    Numerical  Hanoi Daily
11       precipcover   float64            3927    Numerical  Hanoi Daily
12    

In [7]:
# --- Data quality checking ---
def check_data_quality(df, name):
    """Print a short data-quality report for ``df`` to standard output.

    The report contains, in order: a header with ``name``, the number of
    missing values per column, the number of fully duplicated rows, the
    number of repeated index labels, and descriptive statistics for the
    first five columns that ``DataFrame.describe()`` reports.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect.
    name : str
        Label shown in the report header.

    Returns
    -------
    None
    """
    print(f"\n===== {name} =====")
    print("Missing values per column:")
    print(df.isna().sum())
    print(f"\nNumber of duplicated rows: {df.duplicated().sum()}")
    # DataFrame.duplicated() compares column values only, so repeated index
    # labels (e.g. the same timestamp appearing twice) are counted separately.
    print(f"Number of duplicated index labels: {df.index.duplicated().sum()}")
    print("\nDescriptive statistics (first 5 variables):")
    # Slice columns, not rows: .head() would keep every variable and instead
    # cut the statistics off after the 25% row (dropping 50%, 75% and max).
    print(df.describe().iloc[:, :5])

In [8]:
# --- Run data quality checks ---
# Pair each cleaned frame with its display label and report on them in turn.
for dataset, name in zip(
    (hn_daily_clean, hn_hourly_clean),
    ("Hanoi Daily", "Hanoi Hourly"),
):
    check_data_quality(dataset, name)

print("\nData diagnosis and cleaning completed!")


===== Hanoi Daily =====
Missing values per column:
name                   0
tempmax                0
tempmin                0
temp                   0
feelslikemax           0
feelslikemin           0
feelslike              0
dew                    0
humidity               0
precip                 0
precipprob             0
precipcover            0
preciptype          1042
snow                   0
snowdepth              0
windgust               0
windspeed              0
winddir                0
sealevelpressure       0
cloudcover             0
visibility             0
solarradiation         0
solarenergy            0
uvindex                0
severerisk          2566
sunrise                0
sunset                 0
moonphase              0
conditions             0
description            0
icon                   0
stations               0
dtype: int64

Number of duplicated rows: 0

Descriptive statistics (first 5 variables):
           tempmax      tempmin         temp  feelslikemax  