In [2]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read by path.
# This relies on the google.colab package imported above.
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd

# Path of the weather CSV on the mounted Google Drive
file_path = '/content/drive/MyDrive/BootCamp@PureLogics/hourly_weather_10_days.csv'
df = pd.read_csv(file_path)

# Display first 10 rows
df.head(10)

Unnamed: 0,timestamp,temperature_C,humidity_%,wind_speed_kmph,pressure_hPa,visibility_km
0,2023-03-01 00:00:00,16.6,74.4,5.7,1012.5,9.5
1,2023-03-01 01:00:00,16.2,78.5,5.0,1012.1,10.3
2,2023-03-01 02:00:00,15.3,73.3,4.7,,11.1
3,2023-03-01 03:00:00,15.8,72.4,1.3,1005.0,8.9
4,2023-03-01 04:00:00,20.9,70.6,6.8,1016.3,9.8
5,2023-03-01 05:00:00,20.8,63.7,,1017.9,10.8
6,2023-03-01 06:00:00,22.8,67.0,11.3,1002.3,9.0
7,2023-03-01 07:00:00,22.5,63.7,12.3,1018.6,10.9
8,2023-03-01 08:00:00,21.2,65.3,13.8,1017.3,10.7
9,2023-03-01 09:00:00,28.2,64.5,10.8,1009.6,11.0


In [4]:
# Column dtypes and non-null counts.
# df.info() writes its report itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

# Summary statistics of the numeric columns
print(df.describe())

# Number of missing values per column
print(df.isna().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   timestamp        240 non-null    object 
 1   temperature_C    228 non-null    float64
 2   humidity_%       224 non-null    float64
 3   wind_speed_kmph  226 non-null    float64
 4   pressure_hPa     223 non-null    float64
 5   visibility_km    228 non-null    float64
dtypes: float64(5), object(1)
memory usage: 11.4+ KB
None
       temperature_C  humidity_%  wind_speed_kmph  pressure_hPa  visibility_km
count     228.000000  224.000000       226.000000    223.000000     228.000000
mean       21.315789   66.795982        10.105310   1011.884753       9.989474
std         3.421237    8.190300         3.940668      5.187080       1.022166
min        11.500000   47.800000         1.300000    998.100000       6.800000
25%        18.700000   61.075000         6.625000   1008.900000       9.275

In [5]:
# Impute the gaps of every measurement column with that column's own mean
for column in ('temperature_C', 'humidity_%', 'wind_speed_kmph', 'pressure_hPa', 'visibility_km'):
    column_mean = df[column].mean()
    df[column] = df[column].fillna(column_mean)

# Sanity check: every count printed below should be zero
print(df.isnull().sum())

timestamp          0
temperature_C      0
humidity_%         0
wind_speed_kmph    0
pressure_hPa       0
visibility_km      0
dtype: int64


In [6]:
# Parse the timestamp column into real datetimes once and reuse the parsed series
parsed_ts = pd.to_datetime(df['timestamp'])
df['timestamp'] = parsed_ts

# Calendar date (grouping key for daily averages)
df['date'] = parsed_ts.dt.date

# Hour of day (grouping key for hourly analysis)
df['hour'] = parsed_ts.dt.hour

print("Added date/hour columns!")

Added date/hour columns!


In [7]:
# Mean temperature per calendar date (result is indexed by 'date')
daily_avg_temp = df['temperature_C'].groupby(df['date']).mean()

print("Daily Average Temperatures:")
print(daily_avg_temp.head(5))  # first 5 days only

Daily Average Temperatures:
date
2023-03-01    21.263158
2023-03-02    21.258991
2023-03-03    21.304825
2023-03-04    21.425658
2023-03-05    21.529825
Name: temperature_C, dtype: float64


In [8]:
# Max, min and mean of every measurement column, keyed by column name
stats = {
    column: {
        'max': df[column].max(),
        'min': df[column].min(),
        'mean': df[column].mean(),
    }
    for column in ('temperature_C', 'humidity_%', 'wind_speed_kmph', 'pressure_hPa', 'visibility_km')
}

print("Max/Min/Mean Stats:")
print(stats)

Max/Min/Mean Stats:
{'temperature_C': {'max': 28.7, 'min': 11.5, 'mean': np.float64(21.31578947368421)}, 'humidity_%': {'max': 88.1, 'min': 47.8, 'mean': np.float64(66.79598214285714)}, 'wind_speed_kmph': {'max': 17.8, 'min': 1.3, 'mean': np.float64(10.105309734513273)}, 'pressure_hPa': {'max': 1027.0, 'min': 998.1, 'mean': np.float64(1011.8847533632287)}, 'visibility_km': {'max': 12.6, 'min': 6.8, 'mean': np.float64(9.989473684210529)}}


In [9]:
# Mean humidity for each value of the 'hour' column
avg_humidity_by_hour = df['humidity_%'].groupby(df['hour']).mean()

# The index label of the largest mean is the hour; the largest mean is its value
most_humid_value = avg_humidity_by_hour.max()
most_humid_hour = avg_humidity_by_hour.idxmax()

print(f"Most humid hour: {most_humid_hour}:00")
print(f"Average humidity at that hour: {most_humid_value:.1f}%")

Most humid hour: 1:00
Average humidity at that hour: 78.4%


In [10]:
import numpy as np

# Pull the temperature and wind speed columns out of the DataFrame as NumPy arrays
temp, wind = (df[name].to_numpy() for name in ('temperature_C', 'wind_speed_kmph'))

# Peek at the first five values of each array
print("Temperature array:", temp[:5])
print("Wind speed array:", wind[:5])

Temperature array: [16.6 16.2 15.3 15.8 20.9]
Wind speed array: [5.7 5.  4.7 1.3 6.8]


In [11]:
# 1. Element-wise sum of the two arrays
combined = np.add(temp, wind)
print("Temperature + Wind:", combined[:5])

# 2. Multiply every wind speed by the scalar 2
doubled_wind = 2 * wind
print("Doubled wind speeds:", doubled_wind[:5])

# 3. Dot product: the sum of the element-wise products temp[i] * wind[i]
dot_product = temp.dot(wind)
print("Dot product:", dot_product)


Temperature + Wind: [22.3 21.2 20.  17.1 27.7]
Doubled wind speeds: [11.4 10.   9.4  2.6 13.6]
Dot product: 53718.145947834186


In [12]:
# Add a trailing axis so each array becomes a single column of shape (n, 1)
temp_col = temp[:, np.newaxis]
wind_col = wind[:, np.newaxis]

# Join the columns side by side: column 0 is temperature, column 1 is wind speed
feature_matrix = np.concatenate((temp_col, wind_col), axis=1)
print("Feature matrix shape:", feature_matrix.shape)
print("First 5 rows:\n", feature_matrix[:5])

Feature matrix shape: (240, 2)
First 5 rows:
 [[16.6  5.7]
 [16.2  5. ]
 [15.3  4.7]
 [15.8  1.3]
 [20.9  6.8]]


In [13]:
import numpy as np

# Sample data: 240 synthetic hourly temperatures drawn from a normal
# distribution with mean 20°C and standard deviation 5°C.
# A seeded generator makes the draw repeatable after a kernel restart, and the
# dedicated name keeps the real `temp` array taken from the DataFrame intact.
sample_rng = np.random.default_rng(42)
sample_temp = sample_rng.normal(20, 5, 240)

# Reshape into 10 days × 24 hours (one row per day)
temp_matrix = sample_temp.reshape(10, 24)

print("First 2 days (first 2 rows):")
print(temp_matrix[:2])

First 2 days (first 2 rows):
[[23.36068435 20.38237619 12.10123311 18.05196772 13.34834682 17.02839326
  25.28104097 16.94868037 13.29384748 17.95151869 29.78319988 17.92340727
  21.17151667 15.48962189 23.12736652 28.92845857 14.71405653 15.2136193
  17.31208857 31.44044675 25.18258627 21.87694195 11.59270395 22.2011885 ]
 [12.5306269  16.65034316 18.48329799 28.51690406 17.20065524 28.51371568
  19.72591449 22.49592584 13.28070531 21.36428153 13.96603259 16.17270549
  18.51168785 20.61381148 21.74235672 21.33831645 26.8729728  26.33256506
  13.20423341 16.87224736 21.52880327 16.63687441 21.38129116 21.63659986]]


In [14]:
# Reduce along axis 1 (across the columns of each row) to get one value per day
daily_min = temp_matrix.min(axis=1)    # coldest hour each day
daily_max = temp_matrix.max(axis=1)    # hottest hour each day
daily_mean = temp_matrix.mean(axis=1)  # average over the day

print("\nDaily minimums:", daily_min)
print("Daily maximums:", daily_max)
print("Daily averages:", daily_mean)


Daily minimums: [11.59270395 12.5306269  12.11773004  9.68825803 12.88662186 14.22433776
 12.65660481 13.18284566  8.74049662 10.15968488]
Daily maximums: [31.44044675 28.51690406 31.87968954 26.59072872 30.24669263 35.57157699
 29.17372496 30.26082387 36.70134446 27.71767739]
Daily averages: [19.73772048 19.81553617 21.64796698 19.56955544 19.83690922 20.41543655
 21.40595238 18.54298055 19.46535849 20.08203455]


In [15]:
# Wind speed (km/h) above which a reading counts as a high wind event.
# The same constant drives the filter and the report line, so the two
# cannot drift apart when the threshold is tuned.
HIGH_WIND_THRESHOLD_KMPH = 15

# 1. Extract wind speed values as a NumPy array
wind_speeds = df['wind_speed_kmph'].to_numpy()

# 2. Boolean mask: True where the reading is strictly above the threshold
high_wind_mask = wind_speeds > HIGH_WIND_THRESHOLD_KMPH

# 3. Apply mask to get only high wind instances
high_wind_speeds = wind_speeds[high_wind_mask]

# 4. Timestamps of the same rows (the mask has one entry per row of df)
high_wind_timestamps = df.loc[high_wind_mask, 'timestamp']

# 5. Display results
print(f"Found {len(high_wind_speeds)} high wind events (>{HIGH_WIND_THRESHOLD_KMPH} km/h):")
print("Wind speeds:", high_wind_speeds)
print("Occurred at:", high_wind_timestamps.values)

Found 30 high wind events (>15 km/h):
Wind speeds: [17.6 16.  16.5 16.3 16.7 15.8 17.8 15.1 16.3 15.2 17.  15.9 15.6 15.8
 15.4 15.6 16.3 15.3 16.2 16.9 15.3 15.2 15.5 17.4 17.4 15.4 15.4 16.5
 17.  15.7]
Occurred at: ['2023-03-01T11:00:00.000000000' '2023-03-01T12:00:00.000000000'
 '2023-03-01T13:00:00.000000000' '2023-03-02T10:00:00.000000000'
 '2023-03-03T09:00:00.000000000' '2023-03-03T10:00:00.000000000'
 '2023-03-03T11:00:00.000000000' '2023-03-03T12:00:00.000000000'
 '2023-03-03T15:00:00.000000000' '2023-03-03T16:00:00.000000000'
 '2023-03-04T11:00:00.000000000' '2023-03-04T12:00:00.000000000'
 '2023-03-05T09:00:00.000000000' '2023-03-05T11:00:00.000000000'
 '2023-03-05T12:00:00.000000000' '2023-03-05T14:00:00.000000000'
 '2023-03-06T13:00:00.000000000' '2023-03-07T10:00:00.000000000'
 '2023-03-07T11:00:00.000000000' '2023-03-08T10:00:00.000000000'
 '2023-03-08T12:00:00.000000000' '2023-03-08T13:00:00.000000000'
 '2023-03-09T09:00:00.000000000' '2023-03-09T10:00:00.000000000'
 '

In [16]:
import numpy as np

def daily_summary(matrix):
    """Generate per-day statistics for a matrix of temperature readings.

    Args:
        matrix: 2-D array-like of shape (n_days, n_readings) where each row
            holds one day's readings, e.g. (10, 24) for 10 days of hourly
            temperatures. Nested lists are accepted and converted with
            ``np.asarray``.

    Returns:
        List with one dictionary per row, in row order, with the keys
        ``'day'`` (1-based row number), ``'min'``, ``'max'`` and ``'mean'``.
        An input with zero rows yields an empty list.

    Raises:
        ValueError: If ``matrix`` is not two-dimensional.
    """
    # Accept any array-like (lists of lists included), not only ndarrays
    matrix = np.asarray(matrix)
    if matrix.ndim != 2:
        raise ValueError(
            f"matrix must be 2-D (days x readings), got {matrix.ndim}-D input"
        )

    # Iterating a 2-D array yields its rows, i.e. one day's readings at a time
    return [
        {
            'day': day_number,
            'min': np.min(daily_data),
            'max': np.max(daily_data),
            'mean': np.mean(daily_data),
        }
        for day_number, daily_data in enumerate(matrix, start=1)
    ]

# Test with sample data: 10 days × 24 hourly values from a seeded generator,
# so the printed summaries are the same on every run. The sample gets its own
# name so the `temp_matrix` analysed earlier in the notebook is not overwritten.
sample_matrix = np.random.default_rng(42).normal(20, 5, (10, 24))
summaries = daily_summary(sample_matrix)

# Print first 3 days' summaries
for summary in summaries[:3]:
    print(f"Day {summary['day']}: Min={summary['min']:.1f}°C, Max={summary['max']:.1f}°C, Mean={summary['mean']:.1f}°C")

Day 1: Min=12.2°C, Max=26.8°C, Mean=19.2°C
Day 2: Min=10.6°C, Max=25.1°C, Mean=19.0°C
Day 3: Min=10.6°C, Max=29.4°C, Mean=19.7°C
