# Data Science Capstone Project

This project is to practice all skills learn from the entire course and apply those knowledge to a near industrial project.

In [127]:
# To purse JSON and geoJSON
import json
from shapely.geometry import shape, Point   # Depending on your version, use: from shapely.geometry import shape, Point

# !pip install geocoder
import geocoder

import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt

# import k-means from clustering stage
from sklearn.cluster import KMeans

#!conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

print('Libraries imported.')

%load_ext sql
%matplotlib inline

# Enable IBM_db2 Database Access
%sql ibm_db_sa://trj52137:z89xgm4f30nmqh%5Eg@dashdb-txn-sbox-yp-dal09-08.services.dal.bluemix.net:50000/BLUDB

print('Hello Capstone Project Course!')

Libraries imported.
The sql extension is already loaded. To reload it, use:
  %reload_ext sql
Hello Capstone Project Course!


In [220]:
# Importing Cook County Housing Sales Data using API provided by Cook County

# make sure to install these packages before running:
# pip install pandas
# !pip install sodapy

import pandas as pd
from sodapy import Socrata

# Unauthenticated client only works with public data sets. Note 'None'
# in place of application token, and no username or password:
client = Socrata("datacatalog.cookcountyil.gov", None)

# Example authenticated client (needed for non-public datasets):
# client = Socrata(datacatalog.cookcountyil.gov,
#                  MyAppToken,
#                  userame="user@example.com",
#                  password="AFakePassword")

# First 2000 results, returned as JSON from API / converted to Python list of
# dictionaries by sodapy.
results = client.get("5pge-nu6u", limit=2000)

# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)
sale_2018 = results_df[results_df['sale_year']=='2018']
sale_2018 = sale_2018.reset_index(drop = True)
# sale_2018.head()
house_location = pd.DataFrame(columns = ['address','latitude','longitude'])
house_location['address'] = sale_2018['addr']



In [221]:
# # Testing for the code in the loop below
# address = house_location['address'].iloc[0] + ",Chicago, IL, USA"
# location = geolocator.geocode(address)
# print(location)

In [222]:
# Getting coordinations for each house sold in 2018
house_latitudes = []
house_longitudes = []
geolocator = Nominatim(user_agent="house_explorer")

for i in range(house_adrs.shape[0]):
    address = house_location['address'].iloc[i] + ", Chicago, IL, USA"
    location = geolocator.geocode(address)
    if location == None:
        house_latitudes.append(np.nan)
        house_longitudes.append(np.nan)
    else:
        house_latitudes.append(location.latitude)
        house_longitudes.append(location.longitude)
    
house_location['latitude'] = house_latitudes
house_location['longitude'] = house_longitudes

In [223]:
house_location.head()

Unnamed: 0,address,latitude,longitude
0,4431 PRESCOTT AVE,,
1,5313 N DELPHIA AVE,41.975871,-87.841434
2,2221 N FREMONT ST,41.922454,-87.650801
3,4446 N PARKSIDE AVE,41.961867,-87.768828
4,1926 PRAIRIE SQUARE,,


In [226]:
# Dropping addresses not in Chicago
house_loc_filtered = house_location.dropna().reset_index(drop=True)
print(house_loc_filtered.shape)
house_loc_filtered.head()

(219, 3)


Unnamed: 0,address,latitude,longitude
0,5313 N DELPHIA AVE,41.975871,-87.841434
1,2221 N FREMONT ST,41.922454,-87.650801
2,4446 N PARKSIDE AVE,41.961867,-87.768828
3,6964 W DIVERSEY AVE,41.931001,-87.800237
4,2336 N COMMONWEALTH AVE,41.924814,-87.638585


In [238]:
house_loc_filtered.to_csv('data/Chicago_address.csv', index = False)

In [235]:
# This function requires a latitude, a longitude, and a loaded geoJSON file as inputs 
def comm_finder(lat, lng, js):
    # construct point based on lon/lat returned by geocoder
    point = Point(lng, lat)  # Point class from shapely library

    # check each polygon to see if it contains the point
    for feature in js['features']:
        polygon = shape(feature['geometry']) # shape method from shapely library
        if polygon.contains(point):
            return (feature['properties']["community"],feature['properties']["area_numbe"])
        else:
            return (np.nan,np.nan)

In [233]:
# load GeoJSON file containing sectors
with open('data/Boundaries_Community_Areas.geojson') as f:
    js = json.load(f)

community_name = []
community_code = []

for lat, lng in zip(house_loc_filtered['latitude'],house_loc_filtered['longitude']):
    cmm, cmm_num = comm_finder(lat, lng, js)
    community_name.append(cmm)
    community_name.append(cmm_num)

### 1. Finding Best Community Areas Based on Census Data

#### 1.1 Load Data and Find Top 10 Community Areas

In [249]:
# Getting Census Data form IBM_DB2 SQL Database
nbhd_hardship = %sql SELECT hardship_index, community_area_name, community_area_number, percent_aged_25__without_high_school_diploma, per_capita_income FROM CENSUS_DATA ORDER BY hardship_index NULLS LAST;
columns = ['hardship_index', 'community_area_name', 'community_area_number', 'percent_aged_25__without_high_school_diploma', 'per_capita_income']
nbhd_hardship = pd.DataFrame(nbhd_hardship, columns = columns)

# extract chicago's average on census data (not including hardship_index)
chicago_avg = nbhd_hardship[nbhd_hardship['hardship_index'].isnull()]

# print(nbhd_hardship.shape) 
nbhd_hardship.dropna(axis = 0, subset = ['community_area_number'],inplace = True)
# print(nbhd_hardship.shape)
nbhd_hardship['community_area_number'] = nbhd_hardship['community_area_number'].astype(int)

 * ibm_db_sa://trj52137:***@dashdb-txn-sbox-yp-dal09-08.services.dal.bluemix.net:50000/BLUDB
Done.


In [250]:
nbhd_hardship.head() 
# contains all community areas
# can be combined with all other fators for machine learning algorithm'use

Unnamed: 0,hardship_index,community_area_name,community_area_number,percent_aged_25__without_high_school_diploma,per_capita_income
0,1.0,Near North Side,8,2.5,88669
1,2.0,Lincoln Park,7,3.6,71551
2,3.0,Loop,32,3.1,65526
3,5.0,Lake View,6,2.6,60058
4,6.0,North Center,5,4.5,57123


#### 1.2 Finding coordinates for top 10 Communities

In [251]:
nbhd_rank_hd = nbhd_hardship.head(10)

In [252]:
# Getting coordinations for each neighborhood
latitudes = []
longitudes = []
geolocator = Nominatim(user_agent="nb_explorer")

for i in range(nbhd_rank_hd.shape[0]):
    address = nbhd_rank_hd['community_area_name'].iloc[i] + ",Chicago, IL, USA"
    location = geolocator.geocode(address)
    latitudes.append(location.latitude)
    longitudes.append(location.longitude)
    
nbhd_rank_hd['latitude'] = latitudes
nbhd_rank_hd['longitude'] = longitudes

nbhd_rank_hd.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


Unnamed: 0,hardship_index,community_area_name,community_area_number,percent_aged_25__without_high_school_diploma,per_capita_income,latitude,longitude
0,1.0,Near North Side,8,2.5,88669,41.900033,-87.634497
1,2.0,Lincoln Park,7,3.6,71551,41.940298,-87.638117
2,3.0,Loop,32,3.1,65526,41.881609,-87.629457
3,5.0,Lake View,6,2.6,60058,41.94705,-87.655429
4,6.0,North Center,5,4.5,57123,41.956107,-87.67916


#### 1.3 Showing Top 10 Community Areas on Maps

In [253]:
# Getting coordinations for Chicago
address = 'Chicago, IL, USA'

geolocator = Nominatim(user_agent="Chica_explorer")
location = geolocator.geocode(address)
chica_latitude = location.latitude
Chica_longitude = location.longitude
print('The geograpical coordinate of Chicago City are {}, {}.'.format(chica_latitude, Chica_longitude))

The geograpical coordinate of Chicago City are 41.8755616, -87.6244212.


In [338]:
map_chicago = folium.Map(location=[chica_latitude, Chica_longitude], zoom_start=10)

# add markers to map
for lat, lng, hd_index, neighborhood in zip(nbhd_rank_hd['latitude'], nbhd_rank_hd['longitude'], nbhd_rank_hd['hardship_index'], nbhd_rank_hd['community_area_name']):
    label = '{}, \nHardship:{}'.format(neighborhood, hd_index)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_chicago)  
    
map_chicago # showing neighborhoods on map

### 2.Finding Best Community Areas Based on College Enrollment Rate

#### 2.1 Load Public School Data from Local Data Folder

In [327]:
# Getting Census Data form local data folder. Here we only load high schools.
nbhd_schools = pd.read_csv('data/Chicago_Public_Schools_-_Progress_Report_Cards__2011-2012-v3.csv')

# Extract schools with College Enrollment Rate info, and put them in descending order.
nbhd_schools = nbhd_schools[nbhd_schools['Elementary, Middle, or High School'] == 'HS']
nbhd_schools = nbhd_schools[nbhd_schools['College Enrollment Rate %'] !='NDA'].sort_values(by='College Enrollment Rate %', ascending = False)
nbhd_schools = nbhd_schools.reset_index(drop = True)
nbhd_schools.shape

(77, 78)

In [328]:
# Extract columns which we are interested in
columns = ['School ID', 'NAME_OF_SCHOOL',
           'College Enrollment Rate %', 
           'COMMUNITY_AREA_NUMBER', 'COMMUNITY_AREA_NAME',
           'Latitude', 'Longitude']
nbhd_schools_filtered = nbhd_schools[columns]

#### 2.2 Add School Locations to Chicago Map

In [339]:
# Finding Ttop 10 Schools
nbhd_schools_rank = nbhd_schools_filtered.head(10)

# add markers to map
for lat, lng, enrl_rate, neighborhood in zip(nbhd_schools_rank['Latitude'], nbhd_schools_rank['Longitude'], nbhd_schools_rank['College Enrollment Rate %'], nbhd_schools_rank['COMMUNITY_AREA_NAME']):
    label = '{}, \nCollege Enrollment Rate: {}%'.format(neighborhood, enrl_rate)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='orange',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_chicago)  
    
map_chicago # showing neighborhoods on map

### 3. Finding the Saftest Community Area

In [346]:
nbhd_safety = pd.read_csv('data/Chicago_Crime_Data-v2.csv')
nbhd_safety = nbhd_safety.dropna(subset = ['COMMUNITY_AREA_NUMBER'])
nbhd_safety['COMMUNITY_AREA_NUMBER'] = nbhd_safety['COMMUNITY_AREA_NUMBER'].astype('int')
nbhd_safety.head()

Unnamed: 0,ID,CASE_NUMBER,DATE,BLOCK,IUCR,PRIMARY_TYPE,DESCRIPTION,LOCATION_DESCRIPTION,ARREST,DOMESTIC,BEAT,DISTRICT,WARD,COMMUNITY_AREA_NUMBER,FBICODE,X_COORDINATE,Y_COORDINATE,YEAR,UPDATEDON,LATITUDE,LONGITUDE,LOCATION
0,3512276,HK587712,08/28/2004 05:50:56 PM,047XX S KEDZIE AVE,890,THEFT,FROM BUILDING,SMALL RETAIL STORE,False,False,911,9,14.0,58,6,1155838.0,1873050.0,2004,02/10/2018 03:50:01 PM,41.807441,-87.703956,"(41.8074405, -87.703955849)"
1,3406613,HK456306,06/26/2004 12:40:00 PM,009XX N CENTRAL PARK AVE,820,THEFT,$500 AND UNDER,OTHER,False,False,1112,11,27.0,23,6,1152206.0,1906127.0,2004,02/28/2018 03:56:25 PM,41.89828,-87.716406,"(41.898279962, -87.716405505)"
2,8002131,HT233595,04/04/2011 05:45:00 AM,043XX S WABASH AVE,820,THEFT,$500 AND UNDER,NURSING HOME/RETIREMENT HOME,False,False,221,2,3.0,38,6,1177436.0,1876313.0,2011,02/10/2018 03:50:01 PM,41.815933,-87.624642,"(41.815933131, -87.624642127)"
3,7903289,HT133522,12/30/2010 04:30:00 PM,083XX S KINGSTON AVE,840,THEFT,FINANCIAL ID THEFT: OVER $300,RESIDENCE,False,False,423,4,7.0,46,6,1194622.0,1850125.0,2010,02/10/2018 03:50:01 PM,41.743665,-87.562463,"(41.743665322, -87.562462756)"
4,10402076,HZ138551,02/02/2016 07:30:00 PM,033XX W 66TH ST,820,THEFT,$500 AND UNDER,ALLEY,False,False,831,8,15.0,66,6,1155240.0,1860661.0,2016,02/10/2018 03:50:01 PM,41.773455,-87.70648,"(41.773455295, -87.706480471)"
