# Year-over-Year DX Applications Analysis by Top 100 Cities
## CURSOR GENERATED
Reviewed by Ax:

This notebook runs the YoY analysis SQL query (`yoy.sql`) to calculate lifetime DX applications for the top 100 cities, with time-based breakdowns and growth rates.


In [None]:
import sys
import os
import pandas as pd
from datetime import datetime

# Add utils to path so the project-local snowflake_connection module can be imported.
# The path is relative, so it is resolved against the kernel's working directory.
# The membership check keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path each time.
UTILS_DIR = '../../utils'
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)
from snowflake_connection import SnowflakeHook

print(f"Analysis started at: {datetime.now()}")


In [None]:
# Read the SQL script that the later cells split and execute.
# 'yoy.sql' is a relative path, resolved against the kernel's working directory.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale (assumes the file is saved as UTF-8 — TODO confirm).
with open('yoy.sql', 'r', encoding='utf-8') as sql_file:
    sql_query = sql_file.read()

print("SQL Query loaded successfully")
print(f"Query length: {len(sql_query)} characters")


In [None]:
# Initialize Snowflake connection
# SnowflakeHook is imported from the project-local snowflake_connection module.
# It is constructed with no arguments, so credentials/configuration are presumably
# resolved inside the hook itself — TODO confirm.
# NOTE(review): whether a connection is actually opened here or lazily on the
# first query is not visible from this notebook.
snowhook = SnowflakeHook()
print("Snowflake connection initialized")


In [None]:
# Split the SQL script into individual statements, dropping empty fragments
# (e.g. the one produced by a trailing ';').
# NOTE: this is a plain split on ';', so a semicolon inside a string literal or
# a comment in the script would be treated as a statement boundary.
query_parts = [part.strip() for part in sql_query.split(';') if part.strip()]

# The steps below address statements by position (CREATE TABLE, GRANT, SELECT).
# Validate the count up front so a malformed script fails with a clear message
# BEFORE anything is executed, instead of raising IndexError after the table
# has already been created.
EXPECTED_STATEMENTS = 3
if len(query_parts) < EXPECTED_STATEMENTS:
    raise ValueError(
        f"Expected at least {EXPECTED_STATEMENTS} SQL statements "
        f"(CREATE TABLE, GRANT, SELECT) but found {len(query_parts)}"
    )
if len(query_parts) > EXPECTED_STATEMENTS:
    # Extra statements were always ignored; make that visible to the reader.
    print(
        f"Warning: found {len(query_parts)} statements; "
        f"only the first {EXPECTED_STATEMENTS} will be executed"
    )

# The first two statements are sent with their terminating ';' restored;
# the SELECT is sent without one.
create_table_query = query_parts[0] + ';'
grant_query = query_parts[1] + ';'
select_query = query_parts[2]

print("Executing SQL query in parts...")
print("This may take a few minutes due to the large dataset and aggregations...")

try:
    # Execute the CREATE TABLE statement
    print("Step 1: Creating table...")
    snowhook.query_snowflake(create_table_query, method='cursor')
    print("Table created successfully!")

    # Execute the GRANT statement
    print("Step 2: Granting permissions...")
    snowhook.query_snowflake(grant_query, method='cursor')
    print("Permissions granted successfully!")

    # Execute the SELECT statement; method='pandas' is used so the result is
    # bound to `df`, which the following cells treat as a DataFrame.
    print("Step 3: Fetching results...")
    df = snowhook.query_snowflake(select_query, method='pandas')
    print("Query executed successfully!")
    print(f"Results shape: {df.shape}")

except Exception as e:
    # Surface a short message in the cell output, then re-raise so the
    # notebook run stops and the full traceback is preserved.
    print(f"Error executing query: {str(e)}")
    raise


In [None]:
# Quick overview of the result set: row count, column names, and a preview.
# The banner reports the row count as "cities"; this presumes one row per city.
row_count = len(df)
column_names = list(df.columns)

print("=== QUERY RESULTS SUMMARY ===")
print(f"Total cities analyzed: {row_count}")
print(f"Columns: {column_names}")
print("\nFirst 10 rows:")

# Bare last expression so the preview renders as the cell's rich output.
df.head(10)


In [None]:
# Print the section banner and sub-heading, then let describe() render as the
# cell's output (bare last expression).
print(
    "=== SUMMARY STATISTICS ===",
    "\nDescriptive statistics for numeric columns:",
    sep="\n",
)
df.describe()


In [None]:
# List the 20 rows with the largest YOY values, restricted to the columns
# relevant for the growth comparison.
top_yoy_columns = [
    'CITY_STATE',
    'APPS_18_PLUS_AS_OF_H1_2024',
    'APPS_18_PLUS_AS_OF_H2_2025',
    'YOY',
]

print("=== TOP 20 CITIES BY YOY GROWTH ===")
top_yoy = df.nlargest(20, 'YOY')
top_yoy_table = top_yoy[top_yoy_columns].to_string(index=False)
print(top_yoy_table)


In [None]:
# List the 20 rows with the smallest YOY values, restricted to the columns
# relevant for the growth comparison.
bottom_yoy_columns = [
    'CITY_STATE',
    'APPS_18_PLUS_AS_OF_H1_2024',
    'APPS_18_PLUS_AS_OF_H2_2025',
    'YOY',
]

print("=== BOTTOM 20 CITIES BY YOY GROWTH ===")
bottom_yoy = df.nsmallest(20, 'YOY')
bottom_yoy_table = bottom_yoy[bottom_yoy_columns].to_string(index=False)
print(bottom_yoy_table)


In [None]:
# List the 20 rows with the largest APPS_18PLUS values, alongside the two
# period columns and YOY for context.
top_absolute_columns = [
    'CITY_STATE',
    'APPS_18PLUS',
    'APPS_18_PLUS_AS_OF_H1_2024',
    'APPS_18_PLUS_AS_OF_H2_2025',
    'YOY',
]

print("=== TOP 20 CITIES BY TOTAL LIFETIME APPLICATIONS ===")
top_absolute = df.nlargest(20, 'APPS_18PLUS')
top_absolute_table = top_absolute[top_absolute_columns].to_string(index=False)
print(top_absolute_table)


In [None]:
# Persist the result set as a CSV whose name carries a run timestamp, so
# repeated runs do not overwrite each other's output. The file name is
# relative, so it is written to the kernel's working directory.
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f"yoy_analysis_results_{run_stamp}.csv"

# index=False keeps the DataFrame index out of the file.
df.to_csv(output_file, index=False)

print(f"\nResults saved to: {output_file}")
print(f"Analysis completed at: {datetime.now()}")
