In [1]:
import pandas as pd
from sqlalchemy import create_engine
from pathlib import Path
import re
import duckdb
import numpy as np
from scipy.stats import skew
from funcz import StatWhiz
import warnings
import ipywidgets as widgets
from IPython.display import display
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): presumably this hides the pandas warning about passing a
# non-SQLAlchemy connection to `pd.read_sql` — confirm nothing else is masked.
warnings.filterwarnings(action='ignore')

# Shared connection to the DuckDB database file used by every query below.
# Despite its name, `engine` is a DuckDB connection, not a SQLAlchemy engine.
engine = duckdb.connect(database='my_db.duckdb')

## 2.1: Crime Rate Overview

In [2]:
# Crime rate per 100,000 residents for every (community board, year) from 2007 on.
#   crime_CTE      - sums crime counts per precinct/year, splits each precinct's
#                    total evenly across the community boards mapped to it, then
#                    re-aggregates per community board and year.
#   population_CTE - yearly 'Population' metric per community board.
# Boards without a matching population row keep a NULL crime_rate (LEFT JOIN).
crime_rate_overview_sql = '''

WITH crime_CTE AS (
  SELECT community_board, CAST(YEAR AS int) AS year, SUM(crimes) AS crimes FROM (
  SELECT 
  pd.community_board, 
  pd.precinct,
  cd.YEAR,
  crimes/COUNT(community_board) OVER(PARTITION BY cd.YEAR,pd.precinct) AS crimes
  FROM precinct_data pd 
  LEFT JOIN (
    select
    cd.YEAR,
    cd.pct,
    SUM(count) AS crimes
    FROM crime_data cd
    --WHERE crime_category = 'major'
    GROUP BY 1,2
  ) cd ON pd.precinct  = cd.pct 
) sq GROUP BY 1,2
),

population_CTE AS (
  SELECT 
  CAST(dd.YEAR AS int) AS year,
  dd.community_board,
  SUM(dd.value) AS population
  FROM demo_data dd 
  WHERE metric = 'Population'
  GROUP BY 1,2

)

  SELECT 
  c.community_board,
  c.crimes,
  c.YEAR,
  c.crimes/p.population * 100000 AS crime_rate
  FROM crime_CTE c
  LEFT JOIN population_CTE p ON c.YEAR = p.YEAR AND c.community_board = p.community_board
  WHERE c.YEAR >= 2007




'''

# Run the query on the shared DuckDB connection and wrap the frame for plotting.
crime_rate_overview_df = pd.read_sql(sql=crime_rate_overview_sql, con=engine)
stat_dude = StatWhiz(crime_rate_overview_df)

# Distribution of the per-board rate, followed by the summary table (cell output).
stat_dude.box_plot(y="crime_rate")
stat_dude.describe_table()

## 2.2: Crime Rate Over Time

In [2]:
# Two yearly crime-rate series (per 100,000 residents, 2007 onwards):
#   yearly_crime_rate  - all crimes in the year divided by all matched population
#                        in the year (window sums over the year partition).
#   average_crime_rate - unweighted mean of the per-community-board rates.
# crime_CTE splits each precinct's yearly crime total evenly across the
# community boards mapped to that precinct before re-aggregating per board.
# The outer GROUP BY already yields exactly one row per
# (year, yearly_crime_rate) pair, so no DISTINCT is needed on top of it.
crime_rate_over_time_sql = '''
WITH crime_CTE AS (
  SELECT community_board, CAST(YEAR AS int) AS year, SUM(crimes) AS crimes FROM (
    SELECT
      pd.community_board,
      pd.precinct,
      cd.YEAR,
      crimes/COUNT(community_board) OVER(PARTITION BY cd.YEAR, pd.precinct) AS crimes
    FROM precinct_data pd
    LEFT JOIN (
      SELECT
        cd.YEAR,
        cd.pct,
        SUM(count) AS crimes
      FROM crime_data cd
      --WHERE crime_category = 'major'
      GROUP BY 1,2
    ) cd ON pd.precinct = cd.pct
  ) sq GROUP BY 1,2
),

population_CTE AS (
  SELECT
    CAST(dd.YEAR AS int) AS year,
    dd.community_board,
    SUM(dd.value) AS population
  FROM demo_data dd
  WHERE metric = 'Population'
  GROUP BY 1,2
)

SELECT year, yearly_crime_rate, AVG(crime_rate) AS average_crime_rate FROM (
  SELECT
    c.community_board,
    c.crimes,
    c.YEAR,
    p.population,
    SUM(c.crimes) OVER(PARTITION BY c.year)/SUM(p.population) OVER(PARTITION BY c.year) * 100000 AS yearly_crime_rate,
    c.crimes/p.population * 100000 AS crime_rate
  FROM crime_CTE c
  LEFT JOIN population_CTE p ON c.YEAR = p.YEAR AND c.community_board = p.community_board
  WHERE c.YEAR >= 2007
) AS board_rates
GROUP BY 1,2
ORDER BY 1
'''

# Run the query on the shared DuckDB connection and wrap the frame for plotting.
crime_rate_over_time_df = pd.read_sql(sql=crime_rate_over_time_sql, con=engine)
stat_dude = StatWhiz(crime_rate_over_time_df)

stat_dude.line_graph(
    x="year",
    y=["yearly_crime_rate", "average_crime_rate"],
    title="Crime Rate Over Time",
    show_labels=False,
)


NameError: name 'StatWhiz' is not defined

## 2.3: Crime Rate by Community Board and Borough

In [5]:
# Mean yearly crime rate (per 100,000 residents, 2007 onwards) for each
# community board, sorted from highest to lowest.
# crime_CTE splits each precinct's yearly crime total evenly across the
# community boards mapped to that precinct; population_CTE supplies the
# yearly 'Population' metric used as the denominator.
board_rate_sql = '''

WITH crime_CTE AS (
  SELECT community_board, CAST(YEAR AS int) AS year, SUM(crimes) AS crimes FROM (
  SELECT 
  pd.community_board, 
  pd.precinct,
  cd.YEAR,
  crimes/COUNT(community_board) OVER(PARTITION BY cd.YEAR,pd.precinct) AS crimes
  FROM precinct_data pd 
  LEFT JOIN (
    select
    cd.YEAR,
    cd.pct,
    SUM(count) AS crimes
    FROM crime_data cd
    --WHERE crime_category = 'major'
    GROUP BY 1,2
  ) cd ON pd.precinct  = cd.pct 
) sq GROUP BY 1,2
),

population_CTE AS (
  SELECT 
  CAST(dd.YEAR AS int) AS year,
  dd.community_board,
  SUM(dd.value) AS population
  FROM demo_data dd 
  WHERE metric = 'Population'
  GROUP BY 1,2

)
SELECT community_board, AVG(crime_rate) AS average_crime_rate FROM (
  SELECT 
  c.community_board,
  c.crimes,
  c.YEAR,
  c.crimes/p.population * 100000 AS crime_rate
  FROM crime_CTE c
  LEFT JOIN population_CTE p ON c.YEAR = p.YEAR AND c.community_board = p.community_board
  WHERE c.YEAR >= 2007

) GROUP BY 1 ORDER BY 2 DESC
                                  


'''

# Run the query on the shared DuckDB connection and wrap the frame for plotting.
board_rate_df = pd.read_sql(sql=board_rate_sql, con=engine)
stat_dude = StatWhiz(board_rate_df)

stat_dude.bar_chart(x="community_board", y="average_crime_rate")


In [6]:
# Mean crime rate (per 100,000 residents, 2007 onwards) for each borough,
# sorted from highest to lowest. Each (community board, year) rate is tagged
# with the board's borough from community_board_data and the rates are then
# averaged per borough (an unweighted mean of board-year rates).
# NOTE(review): assumes community_board_data has one row per community board;
# duplicates there would weight that board's rates more heavily — confirm.
borough_rate_sql = '''

WITH crime_CTE AS (
  SELECT community_board, CAST(YEAR AS int) AS year, SUM(crimes) AS crimes FROM (
  SELECT 
  pd.community_board, 
  pd.precinct,
  cd.YEAR,
  crimes/COUNT(community_board) OVER(PARTITION BY cd.YEAR,pd.precinct) AS crimes
  FROM precinct_data pd 
  LEFT JOIN (
    select
    cd.YEAR,
    cd.pct,
    SUM(count) AS crimes
    FROM crime_data cd
    --WHERE crime_category = 'major'
    GROUP BY 1,2
  ) cd ON pd.precinct  = cd.pct 
) sq GROUP BY 1,2
),

population_CTE AS (
  SELECT 
  CAST(dd.YEAR AS int) AS year,
  dd.community_board,
  SUM(dd.value) AS population
  FROM demo_data dd 
  WHERE metric = 'Population'
  GROUP BY 1,2

)
SELECT borough, AVG(crime_rate) AS average_crime_rate FROM (
  SELECT 
  c.community_board,
  cd.borough,
  c.crimes,
  c.YEAR,
  c.crimes/p.population * 100000 AS crime_rate
  FROM crime_CTE c
  LEFT JOIN population_CTE p ON c.YEAR = p.YEAR AND c.community_board = p.community_board
  LEFT JOIN community_board_data cd ON c.community_board = cd.community_board
  WHERE c.YEAR >= 2007

) GROUP BY 1 ORDER BY 2 DESC
                                  


'''

# Run the query on the shared DuckDB connection and wrap the frame for plotting.
borough_rate_df = pd.read_sql(sql=borough_rate_sql, con=engine)
stat_dude = StatWhiz(borough_rate_df)

stat_dude.bar_chart(x="borough", y="average_crime_rate")


## 2.4: Crime Rate and Economic Effects

In [3]:
# Crime rate per 100,000 residents for every (community board, year) from 2007
# on, joined with two economic indicators from the pop_data CTE:
#   unemployment_rate - taken as-is from unemployment_rate_data.
#   rent_proportion   - rent / (median_income / 12).
#                       NOTE(review): presumably monthly rent over monthly
#                       income, i.e. median_income is annual — confirm.
# NOTE(review): the alias `pd` means precinct_data inside crime_CTE but pop_data
# in the final SELECT; the trailing comma before the final FROM relies on
# DuckDB accepting trailing commas in a SELECT list.
economic_sql = '''

WITH crime_CTE AS (
  SELECT community_board, CAST(YEAR AS int) AS year, SUM(crimes) AS crimes FROM (
  SELECT 
  pd.community_board, 
  pd.precinct,
  cd.YEAR,
  crimes/COUNT(community_board) OVER(PARTITION BY cd.YEAR,pd.precinct) AS crimes
  FROM precinct_data pd 
  LEFT JOIN (
    select
    cd.YEAR,
    cd.pct,
    SUM(count) AS crimes
    FROM crime_data cd
    --WHERE crime_category = 'major'
    GROUP BY 1,2
  ) cd ON pd.precinct  = cd.pct 
) sq GROUP BY 1,2
),
pop_data AS (
                                  
  SELECT 
  un.YEAR,
  un.community_board,
  un.unemployment_rate,
  rd.rent/(inc.median_income/12) AS rent_proportion
  FROM unemployment_rate_data un
  LEFT JOIN rent_data rd ON un.year = rd.year AND un.community_board = rd.community_board 
  LEFT JOIN median_income_data inc ON un.year = inc.YEAR AND un.community_board  = inc.community_board
),
population_CTE AS (
  SELECT 
  CAST(dd.YEAR AS int) AS year,
  dd.community_board,
  SUM(dd.value) AS population
  FROM demo_data dd 
  WHERE metric = 'Population'
  GROUP BY 1,2
)

  SELECT 
  c.community_board,
  c.crimes,
  c.YEAR,
  c.crimes/p.population * 100000 AS crime_rate,
  pd.rent_proportion,
  pd.unemployment_rate,
  FROM crime_CTE c
  LEFT JOIN population_CTE p ON c.YEAR = p.YEAR AND c.community_board = p.community_board
  LEFT JOIN pop_data pd ON c.YEAR = pd.YEAR and c.community_board = pd.community_board
  WHERE c.YEAR >= 2007




'''

# Run the query on the shared DuckDB connection and wrap the frame for plotting.
economic_df = pd.read_sql(sql=economic_sql, con=engine)
stat_dude = StatWhiz(data=economic_df)

# One scatter plot per economic indicator, crime rate always on the y axis.
stat_dude.scatter_plot(x="rent_proportion", y="crime_rate")
stat_dude.scatter_plot(x="unemployment_rate", y="crime_rate")




## 2.5: Crime and Race

In [1]:
# Yearly mean crime rate (per 100,000 residents, 2007 onwards) for two groups
# of community boards: those where 'White Population' is more than half of
# 'Population' in that year ('White-Majority') and all others
# ('Non-White Majority').
#
# population_CTE pivots both metrics with SUM(IF(..., 0)), so a board-year that
# has demo_data rows but no 'Population' row ends up with population = 0 rather
# than NULL. Both divisions by population are therefore wrapped in
# NULLIF(population, 0): a zero population yields a NULL crime_rate, which AVG
# skips, instead of inf/NaN (DuckDB >= 1.1 returns inf/NaN for division by
# zero, which would otherwise poison the yearly averages).
# NOTE(review): boards with population but no 'White Population' row get
# white_population = 0 and are therefore counted as 'Non-White Majority'.
race_sql = '''
WITH crime_CTE AS (
  SELECT community_board, CAST(YEAR AS int) AS year, SUM(crimes) AS crimes FROM (
    SELECT
      pd.community_board,
      pd.precinct,
      cd.YEAR,
      crimes/COUNT(community_board) OVER(PARTITION BY cd.YEAR, pd.precinct) AS crimes
    FROM precinct_data pd
    LEFT JOIN (
      SELECT
        cd.YEAR,
        cd.pct,
        SUM(count) AS crimes
      FROM crime_data cd
      --WHERE crime_category = 'major'
      GROUP BY 1,2
    ) cd ON pd.precinct = cd.pct
  ) sq GROUP BY 1,2
),

population_CTE AS (
  SELECT
    CAST(dd.YEAR AS int) AS year,
    dd.community_board,
    SUM(IF(metric = 'Population', dd.value, 0)) AS population,
    SUM(IF(metric = 'White Population', dd.value, 0)) AS white_population
  FROM demo_data dd
  GROUP BY 1,2
)

SELECT
  year,
  AVG(CASE WHEN white_majority = 'White-Majority' THEN crime_rate END) AS avg_crime_rate_white_majority,
  AVG(CASE WHEN white_majority = 'Non-White Majority' THEN crime_rate END) AS avg_crime_rate_non_white_majority
FROM (
  SELECT
    c.community_board,
    c.crimes,
    c.YEAR,
    c.crimes/NULLIF(p.population, 0) * 100000 AS crime_rate,
    IF(p.white_population/NULLIF(p.population, 0) > 0.5, 'White-Majority', 'Non-White Majority') AS white_majority
  FROM crime_CTE c
  LEFT JOIN population_CTE p ON c.YEAR = p.YEAR AND c.community_board = p.community_board
  WHERE c.YEAR >= 2007
)
GROUP BY year
ORDER BY year
'''

# Run the query on the shared DuckDB connection and wrap the frame for plotting.
race_df = pd.read_sql(sql=race_sql, con=engine)
stat_dude = StatWhiz(data=race_df)

stat_dude.line_graph(
    x='year',
    y=["avg_crime_rate_white_majority", "avg_crime_rate_non_white_majority"],
)





NameError: name 'StatWhiz' is not defined

## Extracting as CSV to use in Power BI

In [8]:
# Dump every base table of the DuckDB `main` schema to powerbi/<table>.csv.
# The output path is built with pathlib so it works on any OS (a hard-coded
# backslash separator only works on Windows), and the target directory is
# created up front so a fresh checkout does not fail on the first write.
export_dir = Path("powerbi")
export_dir.mkdir(parents=True, exist_ok=True)

tables = engine.execute("""
    SELECT table_name FROM information_schema.tables
    WHERE table_schema = 'main' AND table_type = 'BASE TABLE'
""").fetchall()

# Each fetched row is a 1-tuple holding the table name.
table_names = [row[0] for row in tables]

# Double-quote each identifier (doubling embedded quotes) so table names that
# contain spaces, reserved words or mixed case are still selectable.
dfs = {
    tbl: engine.execute(
        'SELECT * FROM "{}"'.format(tbl.replace('"', '""'))
    ).fetchdf()
    for tbl in table_names
}

# The pandas default of writing the index as the first CSV column is kept
# deliberately so the exported file layout does not change.
for table in table_names:
    dfs[table].to_csv(export_dir / f"{table}.csv")

# Release the database file; cells that use `engine` cannot run after this
# until the connection cell is executed again.
engine.close()