[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/zjelveh/zjelveh.github.io/blob/master/files/cfc/class_demo_notebook_complete.ipynb)

# Final Project Example: Yankees Games & Noise Complaints

**Research Question:** Do noise complaints increase on Yankees game days?

This notebook demonstrates all 5 comparisons shown in the presentation slides:
1. Game days vs non-game days (main effect)
2. Day of week analysis (weekday vs weekend)
3. Borough comparison (Bronx vs Brooklyn)
4. Wins vs losses
5. Wins/losses by borough (combined analysis)

## 1. Load the Data

In [1]:
import pandas as pd
import seaborn as sns

# Both CSVs are hosted in the same GitHub directory, so each URL is built
# from one shared prefix.
base_url = 'https://raw.githubusercontent.com/zjelveh/zjelveh.github.io/master/files/cfc/'
complaints_url = base_url + 'nyc_311_noise_sample.csv'
games_url = base_url + 'yankees_home_games_2023.csv'

complaints = pd.read_csv(complaints_url)
yankees_games = pd.read_csv(games_url)

# Headline numbers for Slide 3: sample size and the win/loss split of the
# home games ('result' holds 'W' or 'L').
is_win = yankees_games['result'] == 'W'
is_loss = yankees_games['result'] == 'L'

print(f"Dataset size: {len(complaints):,} complaints")
print(f"Yankees games: {len(yankees_games)} home games")
print(f"Wins: {is_win.sum()}")
print(f"Losses: {is_loss.sum()}")

Dataset size: 265,638 complaints
Yankees games: 83 home games
Wins: 44
Losses: 39


## 2. Prepare Data

In [2]:
# Parse each timestamp column once, then reuse the parsed values for every
# derived column.
complaint_ts = pd.to_datetime(complaints['created_date'])
game_ts = pd.to_datetime(yankees_games['game_date'])

# Store the parsed timestamps plus a plain calendar date on both tables;
# the shared 'date' column is what lets complaints be matched to games.
complaints['created_date'] = complaint_ts
complaints['date'] = complaint_ts.dt.date
yankees_games['game_date'] = game_ts
yankees_games['date'] = game_ts.dt.date

# Day name (e.g. 'Monday') and a True/False weekend flag for each complaint
weekend_names = ['Saturday', 'Sunday']
complaints['day_of_week'] = complaint_ts.dt.day_name()
complaints['is_weekend'] = complaints['day_of_week'].isin(weekend_names)

print("Data prepared with date and day of week columns")

Data prepared with date and day of week columns


## COMPARISON 1: Game Days vs Non-Game Days (Slides 5-6)

In [None]:
# THE KEY LINE (comparison for Slide 4): a complaint counts as a "game day"
# complaint when its date matches the date of any Yankees home game.
game_dates = list(yankees_games['date'])
complaints['is_game_day'] = complaints['date'].isin(game_dates)

# How many complaints fall in each group?
print("Group sizes:")
print(complaints['is_game_day'].value_counts())

# Total complaints per group (index: False = non-game day, True = game day)
totals = complaints.groupby('is_game_day').size()

# Number of distinct calendar days per group. Only days that have at least
# one complaint row can appear here, because the count is taken from the
# complaints table itself.
unique_dates = complaints[['date', 'is_game_day']].drop_duplicates()
days_count = unique_dates.groupby('is_game_day').size()
game_days, non_game_days = days_count[True], days_count[False]

print("\nDays in dataset:")
print(f"Game days: {game_days}")
print(f"Non-game days: {non_game_days}")

# Average complaints per day: totals divided by number of days, so groups
# with different numbers of days are comparable.
avg_game = totals[True] / game_days
avg_non_game = totals[False] / non_game_days

# Relative difference of the game-day average against the non-game baseline
pct_increase = (avg_game - avg_non_game) / avg_non_game * 100

print("\nMAIN FINDING:")
print(f"Average on game days: {avg_game:.1f}")
print(f"Average on non-game days: {avg_non_game:.1f}")
print(f"GAME DAY EFFECT: {pct_increase:.1f}% increase")

In [None]:
# Main comparison chart (Slide 6): average complaints per day on game days
# vs non-game days.
# `days_count` (distinct days per group) was computed in the Comparison 1
# cell; dividing the two Series aligns them on their shared True/False index.
avg_by_game_day = complaints.groupby('is_game_day').size() / days_count
avg_by_game_day = avg_by_game_day.reset_index(name='avg_complaints')
avg_by_game_day['day_type'] = avg_by_game_day['is_game_day'].map({False: 'Non-Game Days', True: 'Game Days'})

# Derive the headline percentage from the values being plotted instead of
# hard-coding it, so the title cannot drift out of sync with the data.
on_game_day = avg_by_game_day['is_game_day']
plotted_game_avg = avg_by_game_day.loc[on_game_day, 'avg_complaints'].iloc[0]
plotted_non_game_avg = avg_by_game_day.loc[~on_game_day, 'avg_complaints'].iloc[0]
plotted_pct_increase = (plotted_game_avg - plotted_non_game_avg) / plotted_non_game_avg * 100

# Create chart using seaborn only
chart = sns.barplot(data=avg_by_game_day, x='day_type', y='avg_complaints',
                    palette=['steelblue', 'coral'])
chart.set_title(f'Yankees Games Increase Noise Complaints by {plotted_pct_increase:.1f}%')
chart.set_ylabel('Average Complaints per Day')
chart.set_xlabel('')

## COMPARISON 2: Day of Week Analysis (Slides 7-8)

In [None]:
# The two flags that define the four groups in this comparison
group_keys = ['is_weekend', 'is_game_day']

# Total complaints in each (weekend?, game day?) group
by_weekend = complaints.groupby(group_keys).size().reset_index(name='total_complaints')

# Number of distinct calendar days in each group
unique_dates = complaints[['date', 'is_game_day', 'is_weekend']].drop_duplicates()
weekend_days_count = unique_dates.groupby(group_keys).size().reset_index(name='num_days')

# Average complaints per day = group total / number of days in that group
by_weekend_avg = by_weekend.merge(weekend_days_count, on=group_keys)
by_weekend_avg['avg_complaints'] = by_weekend_avg['total_complaints'] / by_weekend_avg['num_days']


def _avg_for(is_weekend, is_game_day):
    """Return the average complaints per day for one (weekend?, game day?) group."""
    in_group = ((by_weekend_avg['is_weekend'] == is_weekend) &
                (by_weekend_avg['is_game_day'] == is_game_day))
    return by_weekend_avg[in_group]['avg_complaints'].values[0]


weekday_non_game = _avg_for(False, False)
weekday_game = _avg_for(False, True)
weekend_non_game = _avg_for(True, False)
weekend_game = _avg_for(True, True)

# Game-day effect measured separately within weekdays and within weekends,
# each against its own non-game baseline
weekday_pct = (weekday_game - weekday_non_game) / weekday_non_game * 100
weekend_pct = (weekend_game - weekend_non_game) / weekend_non_game * 100

print("WEEKDAY VS WEEKEND:")
print(f"Weekday game day effect: {weekday_pct:.1f}% increase")
print(f"Weekend game day effect: {weekend_pct:.1f}% increase")

In [None]:
# Weekday/weekend comparison chart (Slide 8).
# Translate the True/False flags into readable labels for the legend and x-axis.
game_day_labels = {True: 'Game Day', False: 'Non-Game Day'}
week_period_labels = {True: 'Weekend', False: 'Weekday'}
by_weekend_avg['day_type'] = by_weekend_avg['is_game_day'].map(game_day_labels)
by_weekend_avg['week_period'] = by_weekend_avg['is_weekend'].map(week_period_labels)

# One pair of bars (game vs non-game) for each week period
chart = sns.barplot(x='week_period', y='avg_complaints', hue='day_type', data=by_weekend_avg)
chart.set_title('Weekend Games Show Larger Effects')
chart.set_xlabel('Week Period')
chart.set_ylabel('Average Complaints per Day')

## COMPARISON 3: Borough Analysis (Slides 9-12)

In [None]:
# Restrict the analysis to the two boroughs being compared
target_boroughs = ['BRONX', 'BROOKLYN']
bronx_brooklyn = complaints[complaints['borough'].isin(target_boroughs)]

# Complaint totals per borough (rows) and game-day flag (columns).
# unstack() orders the columns False, True, which the labels below rely on.
borough_analysis = bronx_brooklyn.groupby(['borough', 'is_game_day']).size().unstack(fill_value=0)
borough_analysis.columns = ['Non-Game Days', 'Game Days']

# Percent increase on game days, computed once per borough and kept for reuse.
# Totals are divided by the day counts from Comparison 1 to get per-day averages.
borough_pct = {}
for name in target_boroughs:
    per_game_day = borough_analysis.loc[name, 'Game Days'] / game_days
    per_non_game_day = borough_analysis.loc[name, 'Non-Game Days'] / non_game_days
    borough_pct[name] = ((per_game_day - per_non_game_day) / per_non_game_day) * 100
    print(f"{name}: {borough_pct[name]:.1f}% increase on game days")

# Store for later use
bronx_pct = borough_pct['BRONX']
brooklyn_pct = borough_pct['BROOKLYN']

print("\nBOROUGH COMPARISON:")
print(f"Bronx shows STRONGER effect ({bronx_pct:.1f}%) - stadium location!")

In [None]:
# Borough comparison chart (Slide 12).
# Reshape the borough x day-type table into one row per (borough, day type).
borough_long = borough_analysis.stack().reset_index()
borough_long.columns = ['borough', 'day_type', 'total_complaints']

# Convert totals to per-day averages using the day counts from Comparison 1
borough_long['num_days'] = borough_long['day_type'].map({'Non-Game Days': non_game_days, 'Game Days': game_days})
borough_long['avg_complaints'] = borough_long['total_complaints'] / borough_long['num_days']
borough_long['Borough'] = borough_long['borough'].str.title()


def _plotted_pct(borough_name):
    """Return the game-day percent increase for one borough, from the plotted averages."""
    rows = borough_long[borough_long['borough'] == borough_name]
    game_avg = rows.loc[rows['day_type'] == 'Game Days', 'avg_complaints'].iloc[0]
    non_game_avg = rows.loc[rows['day_type'] == 'Non-Game Days', 'avg_complaints'].iloc[0]
    return (game_avg - non_game_avg) / non_game_avg * 100


# Build the title from the plotted data rather than hard-coded numbers, so
# both the percentages and the "stronger" claim always match the chart.
plotted_bronx_pct = _plotted_pct('BRONX')
plotted_brooklyn_pct = _plotted_pct('BROOKLYN')
if plotted_bronx_pct >= plotted_brooklyn_pct:
    borough_title = f'Bronx Shows Stronger Effect ({plotted_bronx_pct:.0f}% vs {plotted_brooklyn_pct:.0f}%)'
else:
    borough_title = f'Brooklyn Shows Stronger Effect ({plotted_brooklyn_pct:.0f}% vs {plotted_bronx_pct:.0f}%)'

chart = sns.barplot(data=borough_long, x='Borough', y='avg_complaints', hue='day_type')
chart.set_title(borough_title)
chart.set_ylabel('Average Complaints per Day')

## COMPARISON 4: Wins vs Losses (Slides 13-14)

In [None]:
# Attach each game's result to the complaints filed on that date.
# A left merge repeats a complaint row once per matching game row, so a date
# listed more than once in the schedule (e.g. a doubleheader) would inflate
# that day's complaint count and could put the same day in both 'Win' and
# 'Loss'. Keep one row per date; the last listed game decides the outcome.
# NOTE(review): assumes schedule rows are in chronological order - confirm
# against the CSV.
results_by_date = yankees_games[['date', 'result']].drop_duplicates(subset='date', keep='last')
complaints_merged = complaints.merge(results_by_date, on='date', how='left',
                                     validate='many_to_one')

# Game outcome category: 'Win' / 'Loss' on game days; every other row
# (no game, or a result other than W/L) is labelled 'Non-Game Day'.
complaints_merged['game_outcome'] = (
    complaints_merged['result'].map({'W': 'Win', 'L': 'Loss'}).fillna('Non-Game Day')
)

# Count complaints by outcome
by_outcome = complaints_merged.groupby('game_outcome').size().reset_index(name='total_complaints')

# Count distinct days for each outcome
unique_dates_outcome = complaints_merged[['date', 'game_outcome']].drop_duplicates()
days_count_outcome = unique_dates_outcome.groupby('game_outcome').size().reset_index(name='num_days')

# Average complaints per day for each outcome
by_outcome_avg = by_outcome.merge(days_count_outcome, on='game_outcome')
by_outcome_avg['avg_complaints'] = by_outcome_avg['total_complaints'] / by_outcome_avg['num_days']

# Percent increase of win days and loss days over the non-game-day baseline
non_game = by_outcome_avg[by_outcome_avg['game_outcome'] == 'Non-Game Day']['avg_complaints'].values[0]
win = by_outcome_avg[by_outcome_avg['game_outcome'] == 'Win']['avg_complaints'].values[0]
loss = by_outcome_avg[by_outcome_avg['game_outcome'] == 'Loss']['avg_complaints'].values[0]

win_pct = ((win - non_game) / non_game) * 100
loss_pct = ((loss - non_game) / non_game) * 100

print(f"WINS VS LOSSES:")
print(f"Win days: {win_pct:+.1f}% increase")
print(f"Loss days: {loss_pct:+.1f}% increase")
# Only state the conclusion the numbers actually support
if win_pct > loss_pct:
    print(f"\nWins produce MORE noise (celebration effect)")
else:
    print(f"\nLosses produce at least as much noise as wins")

In [None]:
# Wins/losses chart (Slide 14).
# An ordered categorical fixes the bar order: baseline first, then Loss, then Win.
outcome_order = ['Non-Game Day', 'Loss', 'Win']
by_outcome_avg['game_outcome'] = pd.Categorical(by_outcome_avg['game_outcome'],
                                                 categories=outcome_order, ordered=True)
by_outcome_avg = by_outcome_avg.sort_values('game_outcome')


def _plotted_outcome_avg(label):
    """Return the plotted average complaints per day for one game outcome."""
    return by_outcome_avg.loc[by_outcome_avg['game_outcome'] == label, 'avg_complaints'].iloc[0]


# Compute the title percentages from the plotted frame rather than
# hard-coding them or reading win_pct/loss_pct, which a later cell reassigns
# per borough.
plotted_baseline = _plotted_outcome_avg('Non-Game Day')
plotted_win_pct = (_plotted_outcome_avg('Win') - plotted_baseline) / plotted_baseline * 100
plotted_loss_pct = (_plotted_outcome_avg('Loss') - plotted_baseline) / plotted_baseline * 100
if plotted_win_pct > plotted_loss_pct:
    outcome_title = f'Wins Produce More Noise ({plotted_win_pct:+.1f}% vs {plotted_loss_pct:+.1f}%)'
else:
    outcome_title = f'Losses Produce At Least As Much Noise ({plotted_loss_pct:+.1f}% vs {plotted_win_pct:+.1f}%)'

chart = sns.barplot(data=by_outcome_avg, x='game_outcome', y='avg_complaints')
chart.set_title(outcome_title)
chart.set_xlabel('Game Outcome')
chart.set_ylabel('Average Complaints per Day')

## COMPARISON 5: Wins/Losses by Borough (Slides 15-16)

In [None]:
# Total complaints for every (borough, game outcome) pair
by_borough_outcome = complaints_merged.groupby(['borough', 'game_outcome']).size().reset_index(name='total_complaints')

# Attach the number of days per outcome (computed city-wide in Comparison 4)
# and convert totals to per-day averages
by_borough_outcome_avg = by_borough_outcome.merge(days_count_outcome, on='game_outcome')
by_borough_outcome_avg['avg_complaints'] = by_borough_outcome_avg['total_complaints'] / by_borough_outcome_avg['num_days']

# Percent increase on win days and loss days, separately for each borough.
# Borough-specific names are used so the city-wide win_pct / loss_pct from
# Comparison 4 are not overwritten by this loop.
for borough in ['BRONX', 'BROOKLYN']:
    borough_data = by_borough_outcome_avg[by_borough_outcome_avg['borough'] == borough]
    borough_non_game = borough_data[borough_data['game_outcome'] == 'Non-Game Day']['avg_complaints'].values[0]
    borough_win = borough_data[borough_data['game_outcome'] == 'Win']['avg_complaints'].values[0]
    borough_loss = borough_data[borough_data['game_outcome'] == 'Loss']['avg_complaints'].values[0]

    borough_win_pct = ((borough_win - borough_non_game) / borough_non_game) * 100
    borough_loss_pct = ((borough_loss - borough_non_game) / borough_non_game) * 100

    print(f"{borough}:")
    print(f"  Win: {borough_win_pct:+.1f}%")
    print(f"  Loss: {borough_loss_pct:+.1f}%")

    # The interpretation line is printed only when the numbers support it.
    # (The arrow was mojibake in the original string literal; restored to U+2192.)
    if borough == 'BRONX':
        if borough_win_pct > borough_loss_pct:
            print(f"  → Wins worse (celebration at stadium)")
    else:
        if borough_loss_pct > borough_win_pct:
            print(f"  → Losses worse (frustration mechanism)")

In [None]:
# Combined borough/outcome chart (Slide 16).
# Keep only the two boroughs being compared; .copy() so the columns added
# below do not touch by_borough_outcome_avg.
borough_outcome_plot = by_borough_outcome_avg[by_borough_outcome_avg['borough'].isin(['BRONX', 'BROOKLYN'])].copy()

# An ordered categorical gives both panels the same bar order
# (outcome_order is defined in the wins/losses chart cell).
borough_outcome_plot['game_outcome'] = pd.Categorical(borough_outcome_plot['game_outcome'],
                                                       categories=outcome_order, ordered=True)
borough_outcome_plot = borough_outcome_plot.sort_values(['borough', 'game_outcome'])
borough_outcome_plot['Borough'] = borough_outcome_plot['borough'].str.title()

# Side-by-side comparison: one panel per borough
g = sns.FacetGrid(borough_outcome_plot, col="Borough", height=5, aspect=1.2)
g.map_dataframe(sns.barplot, x="game_outcome", y="avg_complaints",
                palette=['steelblue', 'coral', 'lightgreen'])
g.set_axis_labels("Game Outcome", "Average Complaints per Day")
g.set_titles("{col_name}")
# `figure` replaces the deprecated `fig` alias on FacetGrid
g.figure.suptitle('TWO DIFFERENT MECHANISMS REVEALED!', y=1.02)

# Narrative summary for the slide. These lines are fixed text, not computed;
# NOTE(review): confirm they match the per-borough percentages printed by the
# Comparison 5 cell. (The magnifying-glass emoji was mojibake in the original
# string literal; restored to U+1F50D.)
print("\n🔍 KEY INSIGHT:")
print("Bronx: Wins produce more noise (celebration at stadium)")
print("Brooklyn: Losses produce more noise (different mechanism)")

## Summary: All 5 Comparisons

This analysis demonstrates how multiple comparisons build a complete story:

1. **Main Effect**: 31.5% increase on game days
2. **When**: Weekend games show larger effects
3. **Where**: Bronx shows 38% increase (stadium location)
4. **Why**: Wins produce more noise than losses (celebration)
5. **Complexity**: Different mechanisms in different boroughs

Each comparison adds a layer of understanding to answer the research question thoroughly.