## Imports

In [1]:
import googlemaps
import pandas as pd
import numpy as np
import re
import os
import geopandas as gpd
import folium
import requests
from bs4 import BeautifulSoup
from shapely.geometry import Point

## Read-in

In [2]:
df = pd.read_csv('LISF_May_June_July.csv')

In [6]:
len(df)

2835

In [7]:
# Define the regex pattern to split the text
pattern = r'(\d+-\d+-\d+-\d+-\d+)\s(.*)'

# Apply regex and split the text into two columns
df[['PIN', 'Address']] = df['1st PIN'].str.extract(pattern)

# Remove leading/trailing whitespace from the address column
df['Address'] = df['Address'].str.strip()

In [8]:
df = df.drop(columns=['Unnamed: 0.2', 'Unnamed: 0.1', 'Unnamed: 0','1st PIN'])

## Clean, drop, and convert

In [9]:
df['Doc Recorded'] = pd.to_datetime(df['Doc Recorded'])

In [17]:
df = df.drop_duplicates()

## Data Stuff

In [18]:
first_month = 5
second_month = 6
third_month = 7

In [19]:
# Count number of earliest month dates
may_dates_count = len(df.loc[df['Doc Recorded'].dt.month == first_month])

print(f'Number of May dates: {may_dates_count}')

Number of May dates: 908


In [20]:
# Count number of middle month dates
june_dates_count = len(df.loc[df['Doc Recorded'].dt.month == second_month])

print(f'Number of June dates: {june_dates_count}')

Number of June dates: 935


In [21]:
# Count number of middle month dates
july_dates_count = len(df.loc[df['Doc Recorded'].dt.month == third_month])

print(f'Number of July dates: {july_dates_count}')

Number of July dates: 692


In [22]:
df['1st Grantor'] = df['1st Grantor'].fillna('NA')
df['1st Grantee'] = df['1st Grantee'].fillna('NA')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['1st Grantor'] = df['1st Grantor'].fillna('NA')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['1st Grantee'] = df['1st Grantee'].fillna('NA')


In [23]:
municipal_authority = ['city', 'town', 'municipality', 'village','transit auth','department of transp']  # list of municipal authority keywords
df = df[~df['1st Grantor'].str.contains('|'.join(municipal_authority), case=False) & 
        ~df['1st Grantee'].str.contains('|'.join(municipal_authority), case=False)]

In [24]:
# create new column and assign colors based on month
df['COLOR'] = df['Doc Recorded'].apply(lambda x: 'orange' if x.month == first_month else 'red' if x.month == second_month else 'blue' if x.month == third_month else '')

## Get mortgage amounts

In [25]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'TE': 'Trailers'
}

In [26]:
def mortgage_url_snagger(URL, headers):
    response = requests.get(URL, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('/Document/Detail'):
            mortgage_url = 'https://crs.cookcountyclerkil.gov' + link['href']
            return mortgage_url

In [27]:
def mortgage_consi_snagger(URL, headers):
    response = requests.get(URL, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Find all tr tags
    trs = soup.find_all('tr')

    # Loop through each tr tag and look for the td tag containing 'Consideration Amount' label
    for tr in trs:
        td = tr.find('th', text='Consideration Amount:')
        if td:
            # If the td tag is found, get the next td tag containing the amount
            amount_td = td.find_next_sibling('td')
            if amount_td:
                # Print the amount
                amount = amount_td.text.strip()
                return amount
            else:
                return 'not found'

In [28]:
df['mortgage_urls'] = df['deed_urls'].apply(lambda x: mortgage_url_snagger(x, headers))

In [29]:
df['mortgage_amount'] = df['mortgage_urls'].apply(lambda x: mortgage_consi_snagger(x, headers) if x is not None else None)

  td = tr.find('th', text='Consideration Amount:')


In [30]:
df[['mortgage_urls','mortgage_amount']] = df[['mortgage_urls','mortgage_amount']].fillna('NA')

## Geocode

In [31]:
df['geo_address'] = df['Address'] + ' Cook County, IL'

In [32]:
len(df)

2108

In [34]:
%store -r google_maps_API_Key
gmaps_key = googlemaps.Client(key=google_maps_API_Key)

In [35]:
# Define the geocode function
def geocode(add):
    g = gmaps_key.geocode(add)
    if g:
        lat = g[0]["geometry"]["location"]["lat"]
        lng = g[0]["geometry"]["location"]["lng"]
        return (lat, lng)
    else:
        return None

# Apply geocoding to the 'geo_address' column and store the results in 'geocoded' column
df['geocoded'] = df['geo_address'].apply(geocode)

In [36]:
df['geocoded'] = df['geocoded'].astype(str)
df[['lat', 'lon']] = df['geocoded'].apply(lambda x: (None, None) if x == 'None' else x.strip('()').split(', ', 1)).apply(pd.Series)
df['lat'] = df['lat'].astype(float)
df['lon'] = df['lon'].astype(float)

## HTML Popup Formatter

In [37]:
df.columns

Index(['View Doc', 'Doc Number', 'Doc Recorded', 'Doc Executed', 'Doc Type',
       'Consi. Amt.', '1st Grantor', '1st Grantee', 'Assoc. Doc#', 'deed_urls',
       'PIN', 'Address', 'COLOR', 'mortgage_urls', 'mortgage_amount',
       'geo_address', 'geocoded', 'lat', 'lon'],
      dtype='object')

In [38]:
def popup_html(row):
    grantor = row['1st Grantor']
    grantee = row['1st Grantee']
    PIN = row['PIN']
    Address = row['Address']
    mortgage_amount = row['mortgage_amount']
    
    html = '''<!DOCTYPE html>
    <html>
    <strong>Lender: </strong>{}'''.format(grantor) + '''<br>
    <strong>Borrower: </strong>{}'''.format(grantee) + '''<br>
    <strong>PIN: </strong>{}'''.format(PIN) + '''<br>
    <strong>Address: </strong>{}'''.format(Address) + '''<br>
    <strong>Mortgage Amount: </strong>{}'''.format(mortgage_amount) + '''<br>
    </html>
    '''
    return html

In [41]:
import folium
from folium.plugins import MarkerCluster
import numpy as np

m = folium.Map(location=df[["lat", "lon"]].mean().to_list(), zoom_start=10)

title_html = '''
              <h3 align="center" style="font-size:16px"><b>{}</b></h3>
             '''.format(f'Cook County Pending Foreclosures')

caption_html = '''
                <p align="center" style="vertical-align: bottom; font-size:13px"><i>{}</i></p>
                '''.format('May, June and July')

### Create map container ###
m = folium.Map(location=df[["lat", "lon"]].mean().to_list(), zoom_start=9.5, tiles=None)

# Create two FeatureGroups for different color pins
fg_orange = folium.FeatureGroup(name='May') ## 
fg_red = folium.FeatureGroup(name='June') ##
fg_blue = folium.FeatureGroup(name='July')

for index, row in df.iterrows():
    lat = row['lat']
    lon = row['lon']
    color = row['COLOR']
    if pd.notnull(lat) and pd.notnull(lon) and color == 'orange':
        marker = folium.CircleMarker(
            location=[lat, lon],
            radius=10,
            fill=True,
            color=color,
            popup=folium.Popup(popup_html(row), max_width=400))
        marker.add_to(fg_orange)
    elif pd.notnull(lat) and pd.notnull(lon) and color == 'red':
        marker = folium.CircleMarker(
            location=[lat,lon],
            raidus=5,
            fill=True,
            color=color,
            popup=folium.Popup(popup_html(row), max_width=400))
        marker.add_to(fg_red)
    elif pd.notnull(lat) and pd.notnull(lon) and color == 'blue':
        marker = folium.CircleMarker(
            location=[lat,lon],
            raidus=5,
            fill=True,
            color=color,
            popup=folium.Popup(popup_html(row), max_width=400))
        marker.add_to(fg_blue)
    else:
        continue

# Add the FeatureGroups to the map
fg_orange.add_to(m)
fg_red.add_to(m)
fg_blue.add_to(m)

folium.TileLayer('OpenStreetMap', control=False).add_to(m)

# Add LayerControl to the map
folium.map.LayerControl(collapsed=False).add_to(m)
m.get_root().html.add_child(folium.Element(title_html))
m.get_root().html.add_child(folium.Element(caption_html))
folium.TileLayer('CartoDBpositron', control=False).add_to(m) 
            
# Display map
# m

<folium.raster_layers.TileLayer at 0x7f7b5e654a00>

In [42]:
m.save('index.html')

## Map URL Snagger

In [43]:
base_name = 'https://trd-digital.github.io/trd-news-interactive-maps/'

cwd = os.getcwd()

cwd = cwd.split('/')

final_name = base_name + cwd[-1]
print(final_name)

https://trd-digital.github.io/trd-news-interactive-maps/lis_pendens_scraper_may_june_july


## Stats for story

In [48]:
df['mortgage_amount_int'] = df['mortgage_amount'].str.replace('$','')
df['mortgage_amount_int'] = df['mortgage_amount_int'].str.replace(',','')
df = df.loc[df['mortgage_amount_int'] != 'NA']
df['mortgage_amount_int'] = df['mortgage_amount_int'].astype(float).fillna(0).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mortgage_amount_int'] = df['mortgage_amount_int'].astype(float).fillna(0).astype(int)


In [49]:
# Count number of first month dates
may_dates_count = len(df.loc[df['Doc Recorded'].dt.month == first_month])

print(f'Number of May dates: {may_dates_count}')

# Count number of second month dates
june_dates_count = len(df.loc[df['Doc Recorded'].dt.month == second_month])

print(f'Number of June dates: {june_dates_count}')

# Count number of third month dates
july_dates_count = len(df.loc[df['Doc Recorded'].dt.month == third_month])

print(f'Number of July dates: {july_dates_count}')

Number of May dates: 675
Number of June dates: 747
Number of July dates: 504


In [90]:
may = df.loc[df['Doc Recorded'].dt.month == first_month]
june = df.loc[df['Doc Recorded'].dt.month == second_month]
july = df.loc[df['Doc Recorded'].dt.month == third_month]

In [91]:
may.reset_index(inplace=True, drop=True)
june.reset_index(inplace=True, drop=True)
july.reset_index(inplace=True, drop=True)

In [106]:
# may.sort_values(by='mortgage_amount_int',ascending=False).head(1)
# june.sort_values(by='mortgage_amount_int',ascending=False).head(1)
july.sort_values(by='mortgage_amount_int',ascending=False).head(1)

Unnamed: 0,View Doc,Doc Number,Doc Recorded,Doc Executed,Doc Type,Consi. Amt.,1st Grantor,1st Grantee,Assoc. Doc#,deed_urls,PIN,Address,COLOR,mortgage_urls,mortgage_amount,geo_address,geocoded,lat,lon,mortgage_amount_int
204,View,2321257024,2023-07-31,7/25/2023,LIS PENDENS FORECLOSURE,,WILMINGTON TRUST NATL ASSN TR,ADVENTUS US RLTY 12 LP,1630834000.0,https://crs.cookcountyclerkil.gov/Document/Det...,12-03-103-010-0000,"9377 W HIGGINS RD, ROSEMONT",blue,https://crs.cookcountyclerkil.gov/Document/Det...,"$128,000,000.00","9377 W HIGGINS RD, ROSEMONT Cook County, IL","(41.9883205, -87.8567961)",41.988321,-87.856796,128000000


In [109]:
# may.iloc[78]
# print(may['mortgage_urls'].iloc[78])
# june.iloc[229]
# print(june['deed_urls'].iloc[229])
# july.iloc[204]
print(july['mortgage_urls'].iloc[204])

https://crs.cookcountyclerkil.gov/Document/Detail?dId=MzU0NDc5NTk1&hId=Yzg1NWZhZmZjYTJiYzM3MTVhODQxNDdjM2Y3MTc4Yjk5MTNjOTY3ODhlYWU2NTM1YjYzMDVkZjM1NTY5OTRjNQ2


In [52]:
print(f'May: ${may.mortgage_amount_int.sum():,}')
print('-------')
print(f'June: ${june.mortgage_amount_int.sum():,}')
print('-------')
print(f'July: ${july.mortgage_amount_int.sum():,}')
print('-------')
print(f'Total: ${may.mortgage_amount_int.sum() + june.mortgage_amount_int.sum() + july.mortgage_amount_int.sum():,}')

May: $231,566,233
-------
June: $157,133,849
-------
July: $248,721,589
-------
Total: $637,421,671


In [79]:
## July
33 + 27 + 22

82

In [78]:
july['1st Grantor'].value_counts().head(60)

1st Grantor
US BK TRUST NATL ASSN TR                  33
US BK NATL ASSN                           27
MIDFIRST BK                               23
NATIONSTAR MTG LLC                        23
US BK NATL ASSN TR                        22
FREEDOM MTG CORP                          22
LAKEVIEW LOAN SERVICING LLC               22
CARRINGTON MTG SERVICES LLC               18
WILMINGTON SAV FUND SOC FSB TR            17
WELLS FARGO BK NA                         12
DEUTSCHE BK NATL TRUST CO TR              12
NEWREZ LLC                                10
PENNYMAC LOAN SERVICES LLC                10
JPMORGAN CHASE BK NATL ASSN               10
CITIMORTGAGE INC                          10
FIFTH THIRD BK NATL ASSN                  10
PNC BK NATL ASSN                           9
FEDERAL HOME LOAN MTG CORP TR              9
ROCKET MTG LLC                             9
SPECIALIZED LOAN SERVICING LLC             7
HSBC BK USA NATL ASSN TR                   7
THE BK OF NEW YORK MELLON TR               

In [56]:
july['1st Grantee'].value_counts()

1st Grantee
LIFTUP COMMUNITIES LLC CHICAGO    6
CHICAGO TITLE LAND TRUST CO TR    4
PVONE PROP LLC                    2
BEN JASON J                       2
CHICAGO REHABBING LLC             2
                                 ..
DUNCAN CANESSA                    1
WATKINS INGRID                    1
COMPTON MARY                      1
KHAN MUHAMMAD                     1
OWENS ROSCOLITTA                  1
Name: count, Length: 491, dtype: int64

In [57]:
df.to_csv('may_june_july_foreclosures.csv')

In [None]:
# apr.sort_values(by='mortgage_amount_int',ascending=False)

In [62]:
print(f"May median mortgage foreclosure amount: ${may['mortgage_amount_int'].median():,}")
print('---------')
print(f"June median mortgage foreclosure amount: ${june['mortgage_amount_int'].median():,}")
print('---------')
print(f"July median mortgage foreclosure amount: ${july['mortgage_amount_int'].median():,}")

May median mortgage foreclosure amount: $172,000.0
---------
June median mortgage foreclosure amount: $166,500.0
---------
July median mortgage foreclosure amount: $161,947.5


## Boundary Analysis

In [63]:
boundaries = gpd.read_file('Boundaries - Neighborhoods.geojson')

In [66]:
# create Point objects from lat/lon columns in df
geometry = gpd.points_from_xy(may['lon'], may['lat'])

# create geodataframe from df with Point objects as geometry
gdf_filings_may = gpd.GeoDataFrame(may, geometry=geometry)

gdf_filings_may.crs = 'EPSG:4326'
gdf_filings_may = gdf_filings_may.to_crs(boundaries.crs)

# use contains method to get count of points within each neighborhood
counts_may = gpd.sjoin(gdf_filings_may, boundaries, predicate='within').groupby('pri_neigh').size().reset_index(name='count')

In [67]:
# create Point objects from lat/lon columns in df
geometry = gpd.points_from_xy(june['lon'], june['lat'])

# create geodataframe from df with Point objects as geometry
gdf_filings_june = gpd.GeoDataFrame(june, geometry=geometry)

gdf_filings_june.crs = 'EPSG:4326'
gdf_filings_june = gdf_filings_june.to_crs(boundaries.crs)

# use contains method to get count of points within each neighborhood
counts_june = gpd.sjoin(gdf_filings_june, boundaries, predicate='within').groupby('pri_neigh').size().reset_index(name='count')

In [68]:
# create Point objects from lat/lon columns in df
geometry = gpd.points_from_xy(july['lon'], july['lat'])

# create geodataframe from df with Point objects as geometry
gdf_filings_july = gpd.GeoDataFrame(july, geometry=geometry)

gdf_filings_july.crs = 'EPSG:4326'
gdf_filings_july = gdf_filings_july.to_crs(boundaries.crs)

# use contains method to get count of points within each neighborhood
counts_july = gpd.sjoin(gdf_filings_july, boundaries, predicate='within').groupby('pri_neigh').size().reset_index(name='count')

In [69]:
counts_may

Unnamed: 0,pri_neigh,count
0,Albany Park,4
1,Andersonville,1
2,Archer Heights,2
3,Ashburn,7
4,Auburn Gresham,16
...,...,...
68,West Lawn,2
69,West Loop,2
70,West Pullman,13
71,West Ridge,8


In [70]:
counts_june

Unnamed: 0,pri_neigh,count
0,Albany Park,1
1,Andersonville,1
2,Archer Heights,4
3,Ashburn,15
4,Auburn Gresham,19
...,...,...
68,West Pullman,7
69,West Ridge,4
70,West Town,2
71,Wicker Park,2


In [71]:
counts_july

Unnamed: 0,pri_neigh,count
0,Albany Park,2
1,Armour Square,1
2,Ashburn,12
3,Auburn Gresham,11
4,Austin,16
...,...,...
63,West Pullman,8
64,West Ridge,5
65,West Town,1
66,Woodlawn,4
