# Load TOBS 2025 Data with Dask

This notebook demonstrates how to load and work with the TOBS 2025 wide-format parquet file using Dask.


In [1]:
# Import libraries
# dask.dataframe supplies the lazy, partitioned DataFrame used in the first
# half of the notebook; pandas and numpy are used once data is in memory.
import dask.dataframe as dd
import pandas as pd
import numpy as np

print('✓ Libraries imported')


✓ Libraries imported


## Method 1: Basic Loading with Dask


In [2]:
# Open the wide-format parquet file as a Dask DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df_dask = dd.read_parquet('tobs_2025_wide.parquet')

# Column names at both ends of the (wide) schema, for a compact summary line
leading_cols = list(df_dask.columns[:5])
trailing_cols = list(df_dask.columns[-3:])

# Dask operations are lazy - the data itself has not been loaded at this point
print('Dask DataFrame Info:')
print(f'  Type: {type(df_dask)}')
print(f'  Columns: {len(df_dask.columns)} ({leading_cols} ... {trailing_cols})')
print(f'  Number of partitions: {df_dask.npartitions}')
print(f'  Partitions: {df_dask.to_delayed()}')

# Printing the Dask DataFrame shows its structure without computing anything
print('\nDataframe structure (lazy):')
print(df_dask)


FileNotFoundError: An error occurred while calling the read_parquet method registered to the pandas backend.
Original Message: /home/yfreund/dask-CSE255/weather/tobs_2025_wide.parquet

## Viewing Data (Triggers Computation)


In [3]:
# Preview the first rows; head() only loads the partitions it needs
first_10_cols = list(df_dask.columns[:10])
print('First 5 rows, first 10 columns:')
print(df_dask[first_10_cols].head())

# Both figures below require a computation over the data
print('\n\nBasic statistics:')
row_count = len(df_dask)
print(f'Number of rows (computed): {row_count}')
station_count = df_dask["station_id"].nunique().compute()
print(f'Unique stations: {station_count}')


First 5 rows, first 10 columns:
    station_id  year  day_1  day_2  day_3  day_4  day_5  day_6  day_7  day_8
0  CQC00914801  2025  250.0  244.0  250.0  250.0  256.0  250.0  256.0  244.0
1  FMC00914325  2025    NaN  294.0  283.0  283.0  256.0  294.0  294.0  272.0
2  FMC00914395  2025  300.0  278.0  306.0  294.0  283.0  311.0  300.0  306.0
3  FMC00914590  2025  211.0  250.0  300.0  228.0  289.0  294.0  278.0  239.0
4  FMC00914720  2025    NaN  283.0  289.0  278.0  289.0  289.0  283.0  289.0


Basic statistics:
Number of rows (computed): 4706
Unique stations: 4706


## Dask Operations (Lazy Evaluation)


In [4]:
# Example: mean temperature for each day across all stations.
# Building the expression is lazy: it creates the computation graph but does
# not execute it until .compute() is called.

day_cols = [col for col in df_dask.columns if col.startswith('day_')]
print(f'Number of day columns: {len(day_cols)}')

# Mean of the first 10 day columns (still lazy)
means_lazy = df_dask[day_cols[:10]].mean()
print(f'\nMean computation (lazy): {means_lazy}')

# Execute the graph; the result is iterated below as (column, value) pairs
means = means_lazy.compute()
print('\nMean temperatures for first 10 days (in tenths of °C):')
for col, val in means.items():
    # Stored values are tenths of a degree, so divide by 10 for degrees C
    print(f'  {col}: {val:.1f} (= {val/10:.1f}°C)')


Number of day columns: 365

Mean computation (lazy): Dask Series Structure:
npartitions=1
day_1    float64
day_9        ...
dtype: float64
Dask Name: dataframe-mean, 7 graph layers

Mean temperatures for first 10 days (in tenths of °C):
  day_1: -11.4 (= -1.1°C)
  day_2: -16.6 (= -1.7°C)
  day_3: -22.9 (= -2.3°C)
  day_4: -42.4 (= -4.2°C)
  day_5: -40.4 (= -4.0°C)
  day_6: -50.5 (= -5.1°C)
  day_7: -62.8 (= -6.3°C)
  day_8: -62.0 (= -6.2°C)
  day_9: -59.0 (= -5.9°C)
  day_10: -37.5 (= -3.8°C)


## Working with Specific Stations


In [5]:
# Bring the station_id column into memory so the IDs can be inspected
all_stations = df_dask['station_id'].compute()
first_ten_ids = all_stations.head(10).tolist()
print(f'Total stations: {len(all_stations)}')
print(f'First 10 stations: {first_ten_ids}')

# Build a lazy row filter that matches the first station ID
first_station = all_stations.iloc[0]
station_filter = df_dask['station_id'] == first_station
single_station = df_dask[station_filter]

# Run the filter and collect the matching rows
station_data = single_station.compute()
print(f'\nData for station {first_station}:')
# The computed result is a pandas dataframe, so iloc slicing is available
print(station_data.iloc[:, :10])


Total stations: 4706
First 10 stations: ['CQC00914801', 'FMC00914325', 'FMC00914395', 'FMC00914590', 'FMC00914720', 'GQW00041406', 'RQC00660053', 'RQC00660061', 'RQC00660152', 'RQC00660158']

Data for station CQC00914801:
    station_id  year  day_1  day_2  day_3  day_4  day_5  day_6  day_7  day_8
0  CQC00914801  2025  250.0  244.0  250.0  250.0  256.0  250.0  256.0  244.0


## Converting to Pandas (if data fits in memory)


In [6]:
# The dataset is small enough to load entirely into pandas.
# NOTE(review): the original note cites ~1.7 MB, presumably the on-disk parquet
# size - the in-memory usage printed below can be noticeably larger; confirm.
df_pandas = df_dask.compute()

memory_mb = df_pandas.memory_usage(deep=True).sum() / (1024**2)
print('Converted to Pandas DataFrame:')
print(f'  Type: {type(df_pandas)}')
print(f'  Shape: {df_pandas.shape}')
print(f'  Memory usage: {memory_mb:.2f} MB')

print('\nFirst 3 rows, first 8 columns:')
# Positional slicing with iloc works because this is a pandas DataFrame
print(df_pandas.iloc[:3, :8])


Converted to Pandas DataFrame:
  Type: <class 'pandas.core.frame.DataFrame'>
  Shape: (4706, 367)
  Memory usage: 13.45 MB

First 3 rows, first 8 columns:
    station_id  year  day_1  day_2  day_3  day_4  day_5  day_6
0  CQC00914801  2025  250.0  244.0  250.0  250.0  256.0  250.0
1  FMC00914325  2025    NaN  294.0  283.0  283.0  256.0  294.0
2  FMC00914395  2025  300.0  278.0  306.0  294.0  283.0  311.0


## Summary

### Key Differences: Dask vs Pandas

| Operation | Pandas | Dask |
|-----------|--------|------|
| **Reading** | `pd.read_parquet()` | `dd.read_parquet()` |
| **Execution** | Immediate (eager) | Lazy (builds computation graph) |
| **Compute** | Automatic | Call `.compute()` to execute |
| **Memory** | Loads entire dataset | Can work with data larger than memory |
| **Partitions** | Single dataframe | Multiple partitions |

### Common Dask Operations

```python
# Reading
df = dd.read_parquet('file.parquet')

# Viewing (triggers computation)
df.head()           # First few rows
df.tail()           # Last few rows
df.compute()        # Convert to pandas (full dataset)

# Lazy operations (no computation yet)
df.mean()           # Calculate means
df[df['col'] > 10]  # Filter data
df.groupby('col').mean()  # Group and aggregate

# Execute computation
result = df.mean().compute()  # Now it actually runs
```


## Plotting TOBS Data for Multiple Stations


In [None]:
# Import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide plot defaults: seaborn "whitegrid" style and a wide default
# figure size (individual cells may still pass their own figsize).
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (14, 8)

print('✓ Plotting libraries imported')


In [None]:
# Rank stations by data coverage: count the non-missing day_* values per row.
day_cols = [name for name in df_pandas.columns if name.startswith('day_')]
has_reading = df_pandas[day_cols].notna()

# Stored as a column on df_pandas itself; later cells read 'completeness' from it
df_pandas['completeness'] = has_reading.sum(axis=1)

# Keep the five rows with the highest coverage
top_stations = df_pandas.nlargest(5, 'completeness')
print('Top 5 stations by data completeness:')
print(top_stations[['station_id', 'completeness']])
print('\nPlotting data for these stations...')


In [None]:
# Plot TOBS for the top 5 stations
fig, ax = plt.subplots(figsize=(14, 8))

days = list(range(1, 366))  # Days 1-365
plot_cols = [f'day_{d}' for d in days]

# Convert every selected station at once (tenths of a degree -> degrees C).
# Missing readings stay NaN through the division, so no per-value check is needed.
temps_by_station = top_stations[plot_cols].to_numpy(dtype=float, na_value=np.nan) / 10

for station_id, temps in zip(top_stations['station_id'], temps_by_station):
    # NaN values leave gaps in the line where data is missing
    ax.plot(days, temps, marker='o', markersize=2, linewidth=1.5, label=station_id, alpha=0.8)

ax.set_xlabel('Day of Year (2025)', fontsize=12)
ax.set_ylabel('Temperature (°C)', fontsize=12)
ax.set_title('TOBS (Temperature at Observation Time) for Top 5 Stations in 2025', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=10)
ax.grid(True, alpha=0.3)
ax.set_xlim(1, 365)

plt.tight_layout()
# Writes the figure into the current working directory
plt.savefig('tobs_top5_stations.png', dpi=150, bbox_inches='tight')
print('✓ Plot saved as tobs_top5_stations.png')
plt.show()


### Plot for Specific Regions

Let's also look at stations from different geographic regions to see variation.


In [None]:
# Group stations by the two-letter prefix of their station ID:
# US, CA (Canada), GM (Germany), JA (Japan), AS (Australia).
region_prefixes = ('US', 'CA', 'GM', 'JA', 'AS')
regions = {
    prefix: df_pandas[df_pandas['station_id'].str.startswith(prefix)]
    for prefix in region_prefixes
}

print('Stations by region:')
selected_stations = []
for region, stations_df in regions.items():
    # Regions with no stations in the data are skipped silently
    if len(stations_df) == 0:
        continue

    # Best-covered station of the region, using the 'completeness' column
    # that must already exist on df_pandas
    best = stations_df.nlargest(1, 'completeness')
    if len(best) == 0:
        continue

    top_row = best.iloc[0]
    station_id = top_row['station_id']
    completeness = top_row['completeness']
    selected_stations.append(station_id)
    print(f'  {region}: {station_id} ({completeness:.0f} days)')

print(f'\nTotal selected: {len(selected_stations)} stations')


In [None]:
# Plot TOBS for stations from different regions
fig, ax = plt.subplots(figsize=(14, 8))

days = list(range(1, 366))
plot_cols = [f'day_{d}' for d in days]
# One tab10 colour per selected station
colors = plt.cm.tab10(range(len(selected_stations)))

for i, station_id in enumerate(selected_stations):
    # First row matching this station, restricted to the day columns
    station_row = df_pandas.loc[df_pandas['station_id'] == station_id, plot_cols].iloc[0]

    # Tenths of a degree -> degrees C; missing readings remain NaN
    temps = station_row.to_numpy(dtype=float, na_value=np.nan) / 10

    ax.plot(days, temps, marker='o', markersize=2, linewidth=1.5,
            label=f'{station_id}', alpha=0.8, color=colors[i])

ax.set_xlabel('Day of Year (2025)', fontsize=12)
ax.set_ylabel('Temperature (°C)', fontsize=12)
ax.set_title('TOBS Comparison: Stations from Different Regions', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=9)
ax.grid(True, alpha=0.3)
ax.set_xlim(1, 365)

# Add month labels: tick at the day-of-year on which each month starts
# (non-leap year)
month_starts = [1, 32, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335]
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
ax.set_xticks(month_starts)
ax.set_xticklabels(month_names)

plt.tight_layout()
# Writes the figure into the current working directory
plt.savefig('tobs_regional_comparison.png', dpi=150, bbox_inches='tight')
print('✓ Plot saved as tobs_regional_comparison.png')
plt.show()


### Individual Station Plots

Create separate subplots for better visibility of individual patterns.


In [None]:
# Create a grid of subplots
n_rows, n_cols = 3, 2
fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 12))
axes = axes.flatten()

# One panel per station, capped at the number of panels in the grid
n_stations = min(len(axes), len(selected_stations))

days = list(range(1, 366))
plot_cols = [f'day_{d}' for d in days]
month_starts = [1, 32, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335]
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Iterate over every panel (not just the first n_stations) so that panels
# without a station are actually reached and hidden.
for i, panel in enumerate(axes):
    if i >= n_stations:
        # Hide unused subplots
        panel.set_visible(False)
        continue

    station_id = selected_stations[i]
    # First row matching this station, restricted to the day columns
    station_row = df_pandas.loc[df_pandas['station_id'] == station_id, plot_cols].iloc[0]

    # Tenths of a degree -> degrees C; missing readings remain NaN
    temps = station_row.to_numpy(dtype=float, na_value=np.nan) / 10

    # Calculate completeness as the share of days with a reading
    valid_temps = temps[~np.isnan(temps)]
    completeness = len(valid_temps) / len(days) * 100

    # Plot
    panel.plot(days, temps, color='steelblue', linewidth=1.5, alpha=0.8)
    panel.fill_between(days, temps, alpha=0.3, color='steelblue')
    panel.set_title(f'{station_id}\n({len(valid_temps)}/{len(days)} days, {completeness:.1f}% complete)',
                    fontsize=11, fontweight='bold')
    panel.set_xlabel('Day of Year')
    panel.set_ylabel('Temperature (°C)')
    panel.grid(True, alpha=0.3)
    panel.set_xlim(1, 365)
    panel.set_xticks(month_starts)
    panel.set_xticklabels(month_names, rotation=45)

plt.suptitle('TOBS by Station - 2025', fontsize=16, fontweight='bold', y=0.995)
plt.tight_layout()
# Writes the figure into the current working directory
plt.savefig('tobs_individual_stations.png', dpi=150, bbox_inches='tight')
print('✓ Plot saved as tobs_individual_stations.png')
plt.show()


### Summary Statistics

Show overall temperature patterns across all stations.


In [None]:
# Calculate daily statistics across all stations
days = list(range(1, 366))
day_cols = [f'day_{d}' for d in days]
# describe() yields one column per day; transpose so each row is a day
daily_stats = df_pandas[day_cols].describe().T

# Convert to degrees C (stored values are tenths of a degree).
# Note this also scales the 'count' column, which is not used below.
daily_stats_celsius = daily_stats / 10

# Extract statistics as plain arrays aligned with `days`
mean_temps = daily_stats_celsius['mean'].values
min_temps = daily_stats_celsius['min'].values
max_temps = daily_stats_celsius['max'].values
q25_temps = daily_stats_celsius['25%'].values
q75_temps = daily_stats_celsius['75%'].values

# Create plot
fig, ax = plt.subplots(figsize=(14, 8))

# Plot ranges and means
ax.fill_between(days, min_temps, max_temps, alpha=0.15, color='gray', label='Min-Max Range')
ax.fill_between(days, q25_temps, q75_temps, alpha=0.3, color='steelblue', label='25th-75th Percentile')
ax.plot(days, mean_temps, color='darkred', linewidth=2.5, label='Mean', alpha=0.9)

ax.set_xlabel('Day of Year (2025)', fontsize=12)
ax.set_ylabel('Temperature (°C)', fontsize=12)
ax.set_title('Daily Temperature Statistics Across All Stations - 2025', fontsize=14, fontweight='bold')
ax.legend(loc='best', fontsize=11)
ax.grid(True, alpha=0.3)
ax.set_xlim(1, 365)

# Add month labels: tick at the day-of-year on which each month starts
# (non-leap year)
month_starts = [1, 32, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335]
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
ax.set_xticks(month_starts)
ax.set_xticklabels(month_names)

plt.tight_layout()
# Writes the figure into the current working directory
plt.savefig('tobs_daily_statistics.png', dpi=150, bbox_inches='tight')
print('✓ Plot saved as tobs_daily_statistics.png')
plt.show()

print('\n📊 Overall Statistics:')
# Unweighted mean of the per-day means (days with no data are ignored); this
# is not the same as the mean over every individual reading.
print(f'   Mean temperature: {np.nanmean(mean_temps):.1f}°C')
print(f'   Overall min: {np.nanmin(min_temps):.1f}°C')
print(f'   Overall max: {np.nanmax(max_temps):.1f}°C')
