In [1]:
import pandas as pd
import numpy as np

In [3]:
# 1. Read the dataset from disk into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; consider moving it
# to a configurable data directory so the notebook can run on other machines.
df = pd.read_csv(r"C:\Users\SunnyVanderwall\Documents\time_series_summer_ml\merged_weater_features_eng")

# 2. Report the frame's dimensions, then list every column name.
print("\n--- SHAPE ---")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")
print("\n--- COLUMNS ---")
column_names = list(df.columns)
print(column_names)


--- SHAPE ---
Rows: 48192, Columns: 16

--- COLUMNS ---
['Date', 'Time (UTC)', 'Air Temperature', 'Air Temperature Quality', 'Precipitation Amount', 'Precipitation Amount Quality', 'Wind Direction', 'Wind Direction Quality', 'Wind Speed', 'Wind Speed Quality', 'Relative Humidity', 'Relative Humidity Quality', 'Global Irradiance (Swedish stations) W/m²', 'Global Irradiance Quality', 'Sunshine Duration (s)', 'Sunshine Duration Quality']


In [4]:
# 3. Peek at the first five rows to spot anything odd at the start of the data.
first_rows = df.head(5)
print("\n--- FIRST 5 ROWS ---")
print(first_rows)


--- FIRST 5 ROWS ---
         Date Time (UTC)  Air Temperature Air Temperature Quality  \
0  2020-01-01   00:00:00              2.0                       G   
1  2020-01-01   01:00:00              2.2                       G   
2  2020-01-01   02:00:00              2.2                       G   
3  2020-01-01   03:00:00              1.6                       G   
4  2020-01-01   04:00:00              1.5                       G   

   Precipitation Amount Precipitation Amount Quality  Wind Direction  \
0                   0.0                            G           250.0   
1                   0.0                            G           250.0   
2                   0.0                            G           240.0   
3                   0.0                            G           240.0   
4                   0.0                            G           230.0   

  Wind Direction Quality  Wind Speed Wind Speed Quality  Relative Humidity  \
0                      G         3.0                

In [5]:
# Peek at the last five rows to spot anything odd at the end of the data.
last_rows = df.tail(5)
print("\n--- LAST 5 ROWS ---")
print(last_rows)


--- LAST 5 ROWS ---
             Date Time (UTC)  Air Temperature Air Temperature Quality  \
48187  2025-06-30   19:00:00             20.9                       G   
48188  2025-06-30   20:00:00             18.7                       G   
48189  2025-06-30   21:00:00             14.8                       G   
48190  2025-06-30   22:00:00             14.3                       G   
48191  2025-06-30   23:00:00             12.7                       G   

       Precipitation Amount Precipitation Amount Quality  Wind Direction  \
48187                   0.0                            G           290.0   
48188                   0.0                            G            60.0   
48189                   0.0                            G           120.0   
48190                   0.0                            G           120.0   
48191                   0.0                            G           130.0   

      Wind Direction Quality  Wind Speed Wind Speed Quality  \
48187               

In [6]:
# 4. Count rows whose (Date, Time) key repeats an earlier row.
key_cols = ["Date", "Time (UTC)"]
is_repeat = df.duplicated(subset=key_cols)
dupes = is_repeat.sum()
print(f"\n--- DUPLICATE ROWS on ['Date', 'Time (UTC)']: {dupes}")


--- DUPLICATE ROWS on ['Date', 'Time (UTC)']: 0


In [None]:
# 5. Build a single "Datetime" column for further analysis.
# The Date and Time strings are joined with a space and then parsed.
# errors="coerce" turns unparseable values into missing values (NaT) instead of
# raising, so parse failures can be counted afterwards.
timestamp_text = df["Date"] + " " + df["Time (UTC)"]
df["Datetime"] = pd.to_datetime(timestamp_text, errors="coerce")

# Number of rows whose timestamp could not be parsed.
missing_dt = df["Datetime"].isna().sum()
print(f"Datetime parse errors: {missing_dt}")

Datetime parse errors: 0


In [18]:
# 6. Check time step coverage

# Put the rows in chronological order.
df = df.sort_values("Datetime")

# Every hourly timestamp expected between the first and the last observation.
# (freq="h" is used because the uppercase "H" alias is deprecated in pandas.)
first_ts = df["Datetime"].min()
last_ts = df["Datetime"].max()
expected_hours = pd.date_range(first_ts, last_ts, freq="h")

# Distinct timestamps actually present in the data (missing values excluded).
actual_hours = pd.to_datetime(df["Datetime"].dropna().unique())

# Expected hours that never appear in the data.
missing_times = set(expected_hours).difference(actual_hours)

print(f"Expected time steps: {len(expected_hours)}, Actual: {len(actual_hours)}, Missing: {len(missing_times)}")
# Only list the gaps when there are few enough to read at a glance.
if len(missing_times) < 30:
    print("Missing times (if few):", sorted(missing_times))



Expected time steps: 48192, Actual: 48192, Missing: 0
Missing times (if few): []


In [None]:
# 7. Share of missing values per column, as a percentage, largest first.
print("\n--- NaN Percentage by Column ---")
nan_share = df.isna().mean().sort_values(ascending=False)
nan_pct = nan_share.round(4).mul(100)
print(nan_pct)


In [None]:
# 8. Quick plausible range checks for main columns (adjust as needed!)
# Each entry is (column name, lowest plausible value, highest plausible value).
# The column names must match the loaded frame exactly. The Swedish names used
# previously matched none of this dataset's English column labels, so every
# check was silently skipped.
checks = [
    ("Air Temperature", -50, 50),
    ("Precipitation Amount", 0, 200),
    ("Wind Direction", 0, 360),
    ("Wind Speed", 0, 60),
    ("Relative Humidity", 0, 100),
    ("Global Irradiance (Swedish stations) W/m²", 0, 1500),
    ("Sunshine Duration (s)", 0, 3600),
]
for col, vmin, vmax in checks:
    if col not in df.columns:
        # Report the gap instead of skipping quietly, so a renamed or missing
        # column cannot make the whole check a no-op without anyone noticing.
        print(f"\n{col}: column not found, range check skipped")
        continue
    v = df[col]
    # Observed min/max plus the number of values below/above the plausible bounds.
    print(f"\n{col}: min={v.min()}, max={v.max()}, <{vmin}={(v<vmin).sum()}, >{vmax}={(v>vmax).sum()}")



In [None]:
# 9. Show the distinct quality codes recorded for each metric.
# The quality columns in this dataset are labelled "... Quality"; the previous
# filter looked only for the Swedish "Kvalitet" and therefore matched nothing.
# "Kvalitet" is still accepted so Swedish-labelled frames keep working.
quality_tags = ("Quality", "Kvalitet")
print("\n--- Unique Kvalitet codes ---")
for c in df.columns:
    if any(tag in c for tag in quality_tags):
        print(f"{c}: {df[c].unique()}")

# 10. Show a random sample of 5 rows (fixed seed so the sample is repeatable).
print("\n--- RANDOM SAMPLE ---")
print(df.sample(5, random_state=42))