# -*- coding: utf-8 -*-

#%% Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

#%% Importing Data
# Read the flight records (path is relative to the working directory) and
# preview the first ten rows; the preview is only shown in an interactive cell.
flights_data = pd.read_csv(filepath_or_buffer='flights.csv')
flights_data.head(n=10)
# Read the weather records and also expose them as a NumPy array, which the
# NumPy questions further down operate on.
weather_data_pd = pd.read_csv(filepath_or_buffer='weather.csv')
weather_data_np = weather_data_pd.to_numpy()
#%% Pandas Data Filtering/Sorting Question Answering
#use flights_data
# NOTE: every q_N below is an unfilled answer slot. A bare name is evaluated as
# an expression, so running this template as-is raises NameError until the
# corresponding q_N has been assigned.

#Question 1 How many flights were there from JFK to SLC? Int
q_1 

#Question 2 How many airlines fly to SLC? Should be int
q_2 

#Question 3 What is the average arrival delay for flights to RDU? float
q_3

#Question 4 What proportion of flights to SEA come from the two NYC airports (LGA and JFK)?  float
q_4 

#Question 5 Which date has the largest average departure delay? Pd slice with date and float
#please make date a column. Preferred format is 2013/1/1 (y/m/d)
q_5 

#Question 6 Which date has the largest average arrival delay? pd slice with date and float
q_6 

#Question 7 Which flight departing LGA or JFK in 2013 flew the fastest? pd slice with tailnumber and speed
#speed = distance/airtime
q_7

#Question 8 Replace all nans in the weather pd dataframe with 0s. Pd with no nans
q_8 
#%% Numpy Data Filtering/Sorting Question Answering
#Use weather_data_np

#Question 9 How many observations were made in February? Int
q_9 

#Question 10 What was the mean for humidity in February? Float
q_10

#Question 11 What was the std for humidity in February? Float
q_11

# In [4]:
# -*- coding: utf-8 -*-

#%% Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

#%% Importing Data
# Folder holding both CSV files; expanduser turns the leading "~" into the
# current user's home directory.
data_folder = os.path.expanduser('~/Downloads/Data Files')
flights_csv = os.path.join(data_folder, 'flights.csv')
weather_csv = os.path.join(data_folder, 'weather.csv')

# Read both tables; the weather table is additionally kept as a NumPy array
# for the NumPy questions.
flights_data = pd.read_csv(flights_csv)
weather_data_pd = pd.read_csv(weather_csv)
weather_data_np = weather_data_pd.to_numpy()

# Echo each table's (rows, columns) shape as a quick load check.
print(f"✓ Successfully loaded flights.csv: {flights_data.shape}")
print(f"✓ Successfully loaded weather.csv: {weather_data_pd.shape}")

#%% Pandas Data Filtering/Sorting Question Answering
# Use flights_data

# Question 1: How many flights were there from JFK to SLC? Int
# One boolean mask per condition; a row counts only when both masks are True.
departs_jfk = flights_data['origin'] == 'JFK'
arrives_slc = flights_data['dest'] == 'SLC'
q_1 = len(flights_data[departs_jfk & arrives_slc])

# Question 2: How many airlines fly to SLC? Should be int
# Reuse the SLC mask and count the distinct carrier codes among those rows.
q_2 = flights_data.loc[arrives_slc, 'carrier'].nunique()

# Question 3: What is the average arrival delay for flights to RDU? float
# Select arr_delay for RDU-bound rows; mean() skips missing delays by default.
arrives_rdu = flights_data['dest'] == 'RDU'
q_3 = flights_data.loc[arrives_rdu, 'arr_delay'].mean()

# Question 4: What proportion of flights to SEA come from the two NYC airports (LGA and JFK)? float
# Ratio of SEA-bound rows whose origin is LGA or JFK to all SEA-bound rows.
sea_origins = flights_data.loc[flights_data['dest'] == 'SEA', 'origin']
sea_from_nyc = sea_origins[sea_origins.isin(['LGA', 'JFK'])]
q_4 = len(sea_from_nyc) / len(sea_origins)

# Question 5: Which date has the largest average departure delay? Pd slice with date and float
# Please make date a column. Preferred format is 2013/1/1 (y/m/d)
# The date column is a y/m/d string assembled from the year, month and day columns.
year_text = flights_data['year'].astype(str)
month_text = flights_data['month'].astype(str)
day_text = flights_data['day'].astype(str)
flights_data['date'] = year_text + '/' + month_text + '/' + day_text

# Mean departure delay per date, then the date on which that mean peaks.
daily_dep_delay = flights_data.groupby('date')['dep_delay'].mean()
worst_dep_date = daily_dep_delay.idxmax()
q_5 = pd.DataFrame({
    'date': [worst_dep_date],
    'avg_dep_delay': [daily_dep_delay.loc[worst_dep_date]],
})

# Question 6: Which date has the largest average arrival delay? pd slice with date and float
# Same approach as Question 5, applied to the arrival delay column.
daily_arr_delay = flights_data.groupby('date')['arr_delay'].mean()
worst_arr_date = daily_arr_delay.idxmax()
q_6 = pd.DataFrame({
    'date': [worst_arr_date],
    'avg_arr_delay': [daily_arr_delay.loc[worst_arr_date]],
})

# Question 7: Which flight departing LGA or JFK in 2013 flew the fastest? pd slice with tailnumber and speed
# speed = distance/airtime
# Candidate rows: origin is LGA or JFK, the flight is in 2013 (as the question
# asks; assumes the year column is numeric), and air_time is a positive number.
# The "> 0" test also drops missing air_time (NaN > 0 is False) and keeps a
# zero air_time from producing an infinite speed that would win idxmax.
nyc_departures = flights_data['origin'].isin(['LGA', 'JFK'])
in_2013 = flights_data['year'] == 2013
has_air_time = flights_data['air_time'] > 0
candidates = flights_data[nyc_departures & in_2013 & has_air_time]

# assign() returns a new DataFrame, so the speed column is never written onto
# a filtered slice of flights_data (avoids the chained-assignment warning).
lga_jfk_flights = candidates.assign(
    speed=candidates['distance'] / candidates['air_time']
)

# Row with the highest speed; idxmax returns its index label.
fastest_idx = lga_jfk_flights['speed'].idxmax()
fastest_flight = lga_jfk_flights.loc[fastest_idx]
q_7 = pd.DataFrame({'tailnum': [fastest_flight['tailnum']], 'speed': [fastest_flight['speed']]})

# Question 8: Replace all nans in the weather pd dataframe with 0s. Pd with no nans
# fillna returns a new DataFrame here (no inplace flag), so weather_data_pd
# itself is left untouched.
q_8 = weather_data_pd.fillna(value=0)

#%% Numpy Data Filtering/Sorting Question Answering
# Use weather_data_np
# weather_data_np is a plain array without column labels, so columns are
# addressed by position. Positions follow the original note (month: 3,
# humidity: 8) — verify against weather_data_pd.columns if the CSV layout changes.
MONTH_COL = 3
HUMIDITY_COL = 8
FEBRUARY = 2

# Question 9: How many observations were made in February? Int
# Boolean mask over the month column; count_nonzero counts the True entries and
# int() guarantees a built-in int rather than a NumPy integer.
feb_mask = weather_data_np[:, MONTH_COL] == FEBRUARY
q_9 = int(np.count_nonzero(feb_mask))

# Question 10: What was the mean for humidity in February? Float
# Cast the February humidity values to float so the nan-aware reductions work,
# then average while ignoring NaN entries.
feb_humidity = weather_data_np[feb_mask, HUMIDITY_COL]
feb_humidity_clean = feb_humidity.astype(float)
q_10 = float(np.nanmean(feb_humidity_clean))

# Question 11: What was the std for humidity in February? Float
# np.nanstd uses ddof=0 by default, i.e. the population standard deviation.
q_11 = float(np.nanstd(feb_humidity_clean))

#%% Print Results Summary
# Build the whole report as a list of lines, then emit one print call per line.
rule = "=" * 60
report_lines = [
    "\n" + rule,
    "ASSIGNMENT ANSWERS",
    rule,
    f"Q1: Flights from JFK to SLC: {q_1}",
    f"Q2: Airlines flying to SLC: {q_2}",
    f"Q3: Average arrival delay to RDU: {q_3:.2f} minutes",
    f"Q4: Proportion of SEA flights from NYC: {q_4:.4f}",
    f"Q5: Date with largest avg dep delay:\n{q_5}",
    f"Q6: Date with largest avg arr delay:\n{q_6}",
    f"Q7: Fastest flight from LGA/JFK:\n{q_7}",
    # Q8 is summarised by its shape and the total number of remaining NaNs.
    f"Q8: Weather data shape: {q_8.shape}, NaN count: {q_8.isna().sum().sum()}",
    f"Q9: February observations: {q_9}",
    f"Q10: Mean humidity in February: {q_10:.2f}%",
    f"Q11: Std humidity in February: {q_11:.2f}%",
    rule,
]
for report_line in report_lines:
    print(report_line)

# ✓ Successfully loaded flights.csv: (336776, 17)
# ✓ Successfully loaded weather.csv: (8719, 15)
#
# ASSIGNMENT ANSWERS
# Q1: Flights from JFK to SLC: 2113
# Q2: Airlines flying to SLC: 2
# Q3: Average arrival delay to RDU: 10.05 minutes
# Q4: Proportion of SEA flights from NYC: 0.5333
# Q5: Date with largest avg dep delay:
#        date  avg_dep_delay
# 0  2013/3/8      83.536921
# Q6: Date with largest avg arr delay:
#        date  avg_arr_delay
# 0  2013/3/8      85.862155
# Q7: Fastest flight from LGA/JFK:
#   tailnum      speed
# 0  N666DN  11.723077
# Q8: Weather data shape: (8719, 15), NaN count: 0
# Q9: February observations: 671
# Q10: Mean humidity in February: 62.92%
# Q11: Std humidity in February: 20.34%
