# Libraries

In [1]:
import pandas as pd
import psycopg2
from sqlalchemy import create_engine, inspect
import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt

from itertools import combinations


import gc # gc.collect() for rubbish collection
import subprocess # cleaning data
import sys

# Data

## Data download from SQL

In [2]:
# DB connection setup using hardcoded credentials (for onboarding only)
conn = psycopg2.connect(
    dbname="neondb",
    user="neondb_owner",
    password="npg_CeS9fJg2azZD",
    host="ep-falling-glitter-a5m0j5gk-pooler.us-east-2.aws.neon.tech",
    port="5432",
    sslmode="require"
)
cur = conn.cursor()


In [3]:
query = "SELECT * FROM nyc_schools.high_school_directory LIMIT 5;"
df = pd.read_sql(query, conn)
df.head()

Unnamed: 0,dbn,school_name,borough,building_code,phone_number,fax_number,grade_span_min,grade_span_max,expgrade_span_min,expgrade_span_max,...,number_programs,Location 1,Community Board,Council District,Census Tract,Zip Codes,Community Districts,Borough Boundaries,City Council Districts,Police Precincts
0,27Q260,Frederick Douglass Academy VI High School,Queens,Q465,718-471-2154,718-471-2890,9.0,12,,,...,1,"{'latitude': '40.601989336', 'longitude': '-73...",14,31,100802,20529,51,3,47,59
1,21K559,Life Academy High School for Film and Music,Brooklyn,K400,718-333-7750,718-333-7775,9.0,12,,,...,1,"{'latitude': '40.593593811', 'longitude': '-73...",13,47,306,17616,21,2,45,35
2,16K393,Frederick Douglass Academy IV Secondary School,Brooklyn,K026,718-574-2820,718-574-2821,9.0,12,,,...,1,"{'latitude': '40.692133704', 'longitude': '-73...",3,36,291,18181,69,2,49,52
3,08X305,Pablo Neruda Academy,Bronx,X450,718-824-1682,718-824-1663,9.0,12,,,...,1,"{'latitude': '40.822303765', 'longitude': '-73...",9,18,16,11611,58,5,31,26
4,03M485,Fiorello H. LaGuardia High School of Music & A...,Manhattan,M485,212-496-0700,212-724-5748,9.0,12,,,...,6,"{'latitude': '40.773670507', 'longitude': '-73...",7,6,151,12420,20,4,19,12


# EDA

## data cleaning

In [4]:
df.shape

(5, 105)

In [5]:
df = df.drop_duplicates()

In [23]:
gc.collect()
df.shape

(5, 57)

In [7]:
df.isnull().sum()

dbn                       0
school_name               0
borough                   0
building_code             0
phone_number              0
                         ..
Zip Codes                 0
Community Districts       0
Borough Boundaries        0
City Council Districts    0
Police Precincts          0
Length: 105, dtype: int64

In [8]:
df.nunique()

dbn                       5
school_name               5
borough                   4
building_code             5
phone_number              5
                         ..
Zip Codes                 5
Community Districts       5
Borough Boundaries        4
City Council Districts    5
Police Precincts          5
Length: 105, dtype: int64

In [9]:
df = df.drop(columns=[col for col in df.columns if df[col].nunique() <= 1])  # dropping columns with no information

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 57 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   dbn                               5 non-null      object 
 1   school_name                       5 non-null      object 
 2   borough                           5 non-null      object 
 3   building_code                     5 non-null      object 
 4   phone_number                      5 non-null      object 
 5   fax_number                        5 non-null      object 
 6   start_time                        5 non-null      object 
 7   end_time                          5 non-null      object 
 8   priority01                        5 non-null      object 
 9   priority02                        5 non-null      object 
 10  priority03                        5 non-null      object 
 11  priority04                        5 non-null      object 
 12  priority05  

In [11]:
df = df.apply(lambda x: x.astype('category') if x.nunique() <= 5 else x)

In [12]:
'''
Alternatively

for col in df.columns:
    if df[col].nunique() <= 5:
        df[col] = df[col].astype('category')
'''       

"\nAlternatively\n\nfor col in df.columns:\n    if df[col].nunique() <= 5:\n        df[col] = df[col].astype('category')\n"

In [13]:
# Columns with ≤ 5 unique values
ready_cols = [col for col in df.columns if df[col].nunique() <= 5]

# Make df_inf with the rest
df_inf = df.drop(columns=ready_cols)

### also

Columns with ≤ 5 unique values
cat_cols = [col for col in df.columns if df[col].nunique() <= 5]

Convert them to category in df
df[cat_cols] = df[cat_cols].apply(lambda x: x.astype('category'))

Create df_inf with all the other columns
df_inf = df.drop(columns=cat_cols)

In [14]:
df_inf.sample(2)

0
2


In [15]:
df_inf.shape

(5, 0)

## data analysis

### How many schools are there in each borough?

In [16]:
query = """
SELECT borough, COUNT(*) AS school_count
FROM nyc_schools.high_school_directory
GROUP BY borough;
"""
df_result = pd.read_sql(query, conn)
df_result

Unnamed: 0,borough,school_count
0,Brooklyn,121
1,Queens,80
2,Staten Island,10
3,Manhattan,106
4,Bronx,118


### School supporting special needs

In [17]:
query = """
SELECT borough, school_accessibility_description, COUNT(*) AS school_count
FROM nyc_schools.high_school_directory
GROUP BY borough, school_accessibility_description
ORDER BY borough, school_count DESC;
"""
df_result = pd.read_sql(query, conn)
df_result

Unnamed: 0,borough,school_accessibility_description,school_count
0,Bronx,Functionally Accessible,93
1,Bronx,Not Functionally Accessible,25
2,Brooklyn,Functionally Accessible,77
3,Brooklyn,Not Functionally Accessible,44
4,Manhattan,Functionally Accessible,78
5,Manhattan,Not Functionally Accessible,28
6,Queens,Functionally Accessible,51
7,Queens,Not Functionally Accessible,29
8,Staten Island,Functionally Accessible,10


### What is the average % of English Language Learners (ELL) per borough?

In [18]:
# I can't find any column with the adequate data

In [19]:
print(df.columns.tolist())

['dbn', 'school_name', 'borough', 'building_code', 'phone_number', 'fax_number', 'start_time', 'end_time', 'priority01', 'priority02', 'priority03', 'priority04', 'priority05', 'website', 'subway', 'bus', 'total_students', 'extracurricular_activities', 'school_sports', 'school_accessibility_description', 'campus_name', 'bin', 'bbl', 'nta', 'primary_address_line_1', 'city', 'postcode', 'school_type', 'overview_paragraph', 'program_highlights', 'language_classes', 'advancedplacement_courses', 'online_ap_courses', 'online_language_courses', 'psal_sports_boys', 'psal_sports_girls', 'psal_sports_coed', 'partner_cbo', 'partner_hospital', 'partner_highered', 'partner_cultural', 'partner_nonprofit', 'partner_corporate', 'partner_financial', 'partner_other', 'addtl_info1', 'addtl_info2', 'number_programs', 'Location 1', 'Community Board', 'Council District', 'Census Tract', 'Zip Codes', 'Community Districts', 'Borough Boundaries', 'City Council Districts', 'Police Precincts']


In [20]:
df['ell_programs'] = (
    df['ell_programs']
    .str.replace('%', '', regex=False)  # remove %
    .astype(float)
)

avg_ell_per_borough = (
    df.groupby('borough')['ell_programs']
      .mean()
      .reset_index(name='avg_ell_percent')
)

KeyError: 'ell_programs'

In [21]:
query = """
SELECT column_name
FROM information_schema.columns
WHERE table_name = 'high_school_directory'
  AND (
       column_name ILIKE '%english%' 
    OR column_name ILIKE '%ell%' 
    OR column_name ILIKE '%language%' 
    OR column_name ILIKE '%score%'
  );
"""
df_result = pd.read_sql(query, conn)
df_result

Unnamed: 0,column_name
0,language_classes
1,online_language_courses
2,ell_programs


In [22]:
query = """
SELECT borough, AVG(CAST(REPLACE(ell_programs, '%', '') AS FLOAT)) AS avg_ell_percent
FROM nyc_schools.high_school_directory
GROUP BY borough
ORDER BY avg_ell_percent DESC;
"""
df_result = pd.read_sql(query, conn)
df_result

DatabaseError: Execution failed on sql '
SELECT borough, AVG(CAST(REPLACE(ell_programs, '%', '') AS FLOAT)) AS avg_ell_percent
FROM nyc_schools.high_school_directory
GROUP BY borough
ORDER BY avg_ell_percent DESC;
': invalid input syntax for type double precision: "ESL"
