<a href="https://colab.research.google.com/github/suptykarmokarcse/weather-d/blob/main/Weather_Dataset_Mileston_02_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Part A — Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display

# --- Configuration ---
# Everything a reader might want to tune lives here; later cells read these names.
FILE_PATH = '/content/weatherHistory (3).csv'   # CSV location (Colab upload path)
ANALYSIS_COL = 'Apparent Temperature (C)'       # numeric column whose mean is estimated
STRATA_COL = 'Precip Type'                      # categorical column used for stratification
SAMPLE_SIZE = 50                                # target number of rows per sample

# Read the CSV into the population frame and preview its first five rows.
df = pd.read_csv(FILE_PATH)
print(f"Loaded file: {FILE_PATH}")
preview = df.head(5)
display(preview)

# Report the population dimensions as a (rows, columns) tuple.
dataset_shape = df.shape
print("\nDataset size (rows, columns):", dataset_shape)

# Population mean of the analysis column: the benchmark every sampling
# method below is compared against.
analysis_values = df[ANALYSIS_COL]
population_mean = analysis_values.mean()
print(f"Population mean of '{ANALYSIS_COL}': {population_mean:.4f}")

Loaded file: /content/weatherHistory (3).csv


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.



Dataset size (rows, columns): (96453, 12)
Population mean of 'Apparent Temperature (C)': 10.8550


## Part B — Simple Random Sampling

In [None]:
# Simple random sampling: draw SAMPLE_SIZE rows without replacement.
# The fixed random_state makes the draw identical on every run.
srs = df.sample(n=SAMPLE_SIZE, random_state=42)

# Sample estimate of the analysis column's mean, to be compared with the
# population mean computed in the setup cell.
srs_values = srs.loc[:, ANALYSIS_COL]
srs_mean = srs_values.mean()

# Show a short preview of the sample, then the estimate.
print(f"--- Simple Random Sample (n={SAMPLE_SIZE}) ---")
display(srs.head(3))
print(f"SRS Sample mean of '{ANALYSIS_COL}': {srs_mean:.4f}")

--- Simple Random Sample (n=50) ---


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
87206,2015-09-19 17:00:00.000 +0200,Mostly Cloudy,rain,28.122222,28.827778,0.53,13.3469,304.0,15.5526,0.0,1014.03,Partly cloudy throughout the day.
33421,2009-11-29 14:00:00.000 +0100,Mostly Cloudy,rain,10.0,7.988889,0.83,14.49,150.0,9.982,0.0,1017.9,Foggy starting overnight continuing until morn...
6807,2006-11-18 16:00:00.000 +0100,Mostly Cloudy,rain,14.4,14.4,0.65,11.0446,144.0,9.6278,0.0,1022.0,Partly cloudy starting in the morning.


SRS Sample mean of 'Apparent Temperature (C)': 8.8608


## Part C — Systematic Sampling

In [None]:
# Systematic sampling: pick a random starting row, then take every k-th row.
n = SAMPLE_SIZE
N = len(df)
k = N // n  # Sampling interval: floor(N / n)
if k < 1:
    # Happens when SAMPLE_SIZE exceeds the number of rows (or df is empty);
    # fail with a clear message instead of an opaque RNG range error.
    raise ValueError(
        f"SAMPLE_SIZE ({n}) must be between 1 and the number of rows ({N})."
    )

# Use a seeded generator (same seed as the other sampling cells) so that
# Restart & Run All reproduces exactly the same systematic sample.
rng = np.random.default_rng(42)
start = int(rng.integers(0, k))  # Random starting position in [0, k)

# Stepping by k can yield n + 1 rows when N is not a multiple of n and the
# start is small, so cap the result at n rows.
sys_sample = df.iloc[start::k].iloc[:n]

print(f"--- Systematic Sample (k={k}, n={n}) ---")
display(sys_sample.head(3))

sys_mean = sys_sample[ANALYSIS_COL].mean()
print(f"Systematic Sample mean of '{ANALYSIS_COL}': {sys_mean:.4f}")

--- Systematic Sample (k=1929, n=50) ---


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
187,2006-04-16 19:00:00.000 +0200,Mostly Cloudy,rain,14.933333,14.933333,0.73,17.2753,280.0,11.2056,0.0,1011.14,Mostly cloudy throughout the day.
2116,2006-12-06 04:00:00.000 +0100,Foggy,rain,4.827778,1.838889,1.0,13.2503,146.0,0.2093,0.0,1018.32,Foggy starting overnight continuing until morn...
4045,2006-07-25 13:00:00.000 +0200,Partly Cloudy,rain,32.172222,32.905556,0.42,4.3953,275.0,10.0464,0.0,1017.07,Partly cloudy starting in the morning continui...


Systematic Sample mean of 'Apparent Temperature (C)': 11.0458


## Part D — Stratified Sampling

In [None]:
print(f"--- Stratified Sample (Stratifying by '{STRATA_COL}') ---")
# Check the number of records per stratum in the population.
# NOTE: value_counts() and groupby() both skip rows whose stratum label is
# missing (pandas defaults), so these counts can sum to less than len(df)
# and such rows can never be drawn into the stratified sample.
print("Records per stratum in population:")
population_strata_counts = df[STRATA_COL].value_counts()
print(population_strata_counts)

# Sampling fraction applied to every stratum (proportional allocation).
# Computed from len(df) directly so this cell does not depend on a variable
# defined in the systematic-sampling cell.
frac = SAMPLE_SIZE / len(df)

# Sample proportionally from each group. The per-stratum row count is
# derived from frac * group size, so the total may differ slightly from
# SAMPLE_SIZE because of rounding.
stratified_sample = df.groupby(STRATA_COL, group_keys=False).sample(frac=frac, random_state=42)

print(f"Stratified sample size: {len(stratified_sample)}")
display(stratified_sample.head(3))

strat_mean = stratified_sample[ANALYSIS_COL].mean()
print(f"Stratified Sample mean of '{ANALYSIS_COL}': {strat_mean:.4f}")

# Compare population vs sample distributions for the stratification column.
print("\nRecords per stratum in stratified sample:")
sample_strata_counts = stratified_sample[STRATA_COL].value_counts()
print(sample_strata_counts)



--- Stratified Sample (Stratifying by 'Precip Type') ---
Records per stratum in population:
Precip Type
rain    85224
snow    10712
Name: count, dtype: int64
Stratified sample size: 50


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
74550,2014-06-10 09:00:00.000 +0200,Clear,rain,27.733333,27.661111,0.43,7.7602,59.0,16.1,0.0,1018.51,Partly cloudy until night.
67042,2013-03-30 10:00:00.000 +0100,Partly Cloudy,rain,8.5,7.022222,0.93,9.1126,154.0,10.143,0.0,1005.99,Mostly cloudy throughout the day.
43064,2010-10-07 08:00:00.000 +0200,Partly Cloudy,rain,9.827778,7.894444,0.84,13.6367,180.0,11.4471,0.0,1026.79,Mostly cloudy throughout the day.


Stratified Sample mean of 'Apparent Temperature (C)': 10.2939

Records per stratum in stratified sample:
Precip Type
rain    44
snow     6
Name: count, dtype: int64


## Part E — Cluster Sampling

In [None]:
# Cluster sampling: split the rows into contiguous blocks (clusters), pick a
# few clusters at random, and keep every row of the chosen clusters.
NUM_CLUSTERS = 10           # Number of arbitrary, position-based clusters
NUM_SELECTED_CLUSTERS = 2   # How many clusters make up the sample

# Assign cluster ids by row position so that exactly NUM_CLUSTERS clusters of
# near-equal size are produced. Dividing by floor(N / NUM_CLUSTERS) instead
# would leave the remainder rows in an extra, tiny cluster whenever the row
# count is not a multiple of NUM_CLUSTERS, and using index labels would break
# for a non-default index.
n_rows = len(df)
cluster_ids = np.arange(n_rows) * NUM_CLUSTERS // n_rows

# Work on a copy carrying the ids; the population frame `df` is left untouched,
# so no clean-up step is needed and re-running the cell is safe.
clustered = df.assign(cluster_id=cluster_ids)

# Randomly select clusters (seeded, like the other sampling cells, so the
# result is reproducible) and include all rows from them.
rng = np.random.default_rng(42)
selected_clusters = rng.choice(
    np.unique(cluster_ids),
    size=NUM_SELECTED_CLUSTERS,
    replace=False
)
cluster_sample = clustered[clustered['cluster_id'].isin(selected_clusters)].copy()
cluster_sample_size = len(cluster_sample)

print(f"--- Cluster Sample (Selected {NUM_SELECTED_CLUSTERS} clusters) ---")
print("Selected clusters:", selected_clusters)
print(f"Cluster sample size: {cluster_sample_size}")
display(cluster_sample.head(3))

cluster_mean = cluster_sample[ANALYSIS_COL].mean()
print(f"Cluster Sample mean of '{ANALYSIS_COL}': {cluster_mean:.4f}")

--- Cluster Sample (Selected 2 clusters) ---
Selected clusters: [3 8]
Cluster sample size: 19290


Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary,cluster_id
28935,2009-02-24 15:00:00.000 +0100,Partly Cloudy,rain,3.888889,0.761111,0.7,12.88,70.0,11.27,0.0,1020.2,Partly cloudy throughout the day.,3
28936,2009-02-24 16:00:00.000 +0100,Partly Cloudy,rain,2.583333,-1.116667,0.68,14.4578,60.0,9.4185,0.0,1020.78,Partly cloudy throughout the day.,3
28937,2009-02-24 17:00:00.000 +0100,Partly Cloudy,snow,0.0,-4.3,0.75,14.49,70.0,8.05,0.0,1021.5,Partly cloudy throughout the day.,3


Cluster Sample mean of 'Apparent Temperature (C)': 10.2268


## Part F — Comparison & Reflection (10 points)

In [None]:
# Gather the sample mean produced by each sampling method, in the order the
# methods were run above.
method_means = {
    'Simple Random': srs_mean,
    'Systematic': sys_mean,
    'Stratified': strat_mean,
    'Cluster': cluster_mean,
}
pop_col = f'Population Mean ({ANALYSIS_COL})'

# One row per method: its estimate, the population benchmark, and how far the
# estimate landed from that benchmark.
comparison = pd.DataFrame({
    'Method': list(method_means.keys()),
    'Sample Mean': list(method_means.values()),
})
comparison[pop_col] = population_mean
comparison['Absolute Difference'] = (comparison['Sample Mean'] - comparison[pop_col]).abs()

# Smallest difference first, so the closest method appears at the top.
comparison = comparison.sort_values(by='Absolute Difference')

# Render with two decimals; method names left-aligned, numbers right-aligned.
numeric_cols = ['Sample Mean', pop_col, 'Absolute Difference']
number_formats = {col: "{:.2f}" for col in numeric_cols}
styled = comparison.reset_index(drop=True).style.format(number_formats)
styled = styled.set_properties(subset=['Method'], **{'text-align': 'left'})
styled = styled.set_properties(subset=numeric_cols, **{'text-align': 'right'})

print("--- Sample Mean Comparison ---")
display(styled)

--- Sample Mean Comparison ---


Unnamed: 0,Method,Sample Mean,Population Mean (Apparent Temperature (C)),Absolute Difference
0,Systematic,11.05,10.86,0.19
1,Stratified,10.29,10.86,0.56
2,Cluster,10.23,10.86,0.63
3,Simple Random,8.86,10.86,1.99
