In [1]:
import requests
import lxml.html as lh
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re

In [490]:
#url='https://www.houstontx.gov/police/news.htm'
url='https://www.houstontx.gov/police/nr/2019/dec/index.htm'
#Create a handle, page, to handle the contents of the website
page = requests.get(url)
#Store the contents of the website under doc
doc = lh.fromstring(page.content)
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [491]:
soup = BeautifulSoup(page.text, 'lxml')
gdp = soup.find_all("table", attrs={"class": "default"})
print("Number of tables on site: ",len(gdp))


Number of tables on site:  1


In [496]:
table1 = gdp[0]
body = table1.find_all("tr")
# Head values (Column names) are the first items of the body list
head = body[0] # 0th item is the header row
body_rows = body[1:] # All other items becomes the rest of the rows

# Lets now iterate through the head HTML code and make list of clean headings

# Declare empty list to keep Columns names
headings = []
for item in head.find_all("th"): # loop through all th elements
    # convert the th elements to text and strip "\n"
    item = (item.text).rstrip("\n")
    # append the clean column name to headings
    headings.append(item)
print(headings)
# Next is now to loop though the rest of the rows

#print(body_rows[0])
all_rows = [] # will be a list for list for all rows
for row_num in range(len(body_rows)): # A row at a time
    row = [] # this will old entries for one row
    for row_item in body_rows[row_num].find_all("td"): #loop through all row entries
        # row_item.text removes the tags from the entries
        # the following regex is to remove \xa0 and \n and comma from row_item.text
        # xa0 encodes the flag, \n is the newline and comma separates thousands in numbers
        aa = re.sub("(\xa0)|(\n)|,","",row_item.text)
        #append aa to row - note one row entry is being appended
        row.append(aa)
    # append one row to all_rows
    all_rows.append(row)
df = pd.DataFrame(data=all_rows,columns=['data','event'])
df.head()

['\nDecember 2019']


Unnamed: 0,data,event
0,12-31-19,Investigation Into Fatal Shooting At 9601 West...
1,12-31-19,Investigation Into Two Women Found Deceased At...
2,12-31-19,Investigation Into Fatal Crash At 6100 West Li...
3,12-30-19,Investigation Into Fatal Shooting At 5021 Long...
4,12-30-19,Investigation Into Fatal Shooting At 1000 Cros...


In [497]:
df.shape

(102, 2)

In [498]:
# Filter out other types of events cause we only focus on fatal shooting and crash.
df = df[df['event'].str.contains("Fatal")]
df.shape

(58, 2)

In [499]:
df['address'] = np.nan
for i in range(len(df)):
    df.iloc[i,2] = ''.join(re.findall('(?<=At ).*$',df.iloc[i,1])) + ", Houston, TX, USA" 
df.head()

Unnamed: 0,data,event,address
0,12-31-19,Investigation Into Fatal Shooting At 9601 West...,"9601 West Montgomery Road, Houston, TX, USA"
2,12-31-19,Investigation Into Fatal Crash At 6100 West Li...,"6100 West Little York Road, Houston, TX, USA"
3,12-30-19,Investigation Into Fatal Shooting At 5021 Long...,"5021 Longmeadow Street, Houston, TX, USA"
4,12-30-19,Investigation Into Fatal Shooting At 1000 Cros...,"1000 Crosby Street, Houston, TX, USA"
7,12-30-19,Investigation Into Fatal Crash At 8200 Telepho...,"8200 Telephone Road, Houston, TX, USA"


In [455]:
df['event_type'] = "Fatal Crash"
df.iloc[df['event'].str.contains("Shooting"),3] = "Fatal Shooting"
df.head()

Unnamed: 0,data,event,address,event_type
0,01-31-20,Investigation into Fatal Crash at 610 Sunnyside,"610 Sunnyside, Houston, TX, USA",Fatal Crash
1,01-31-20,Investigation into Fatal Shooting at 9202 Nath...,"9202 Nathaniel Street, Houston, TX, USA",Fatal Shooting
2,01-31-20,Investigation into Fatal Shooting at 800 South...,"800 South Wayside Drive, Houston, TX, USA",Fatal Shooting
3,01-31-20,Investigation into Fatal Shooting at 9002 Ster...,"9002 Sterlingshire Street, Houston, TX, USA",Fatal Shooting
5,01-31-20,Investigation into Fatal Shooting at 3600 Yose...,"3600 Yosemite Street, Houston, TX, USA",Fatal Shooting


In [456]:
df.to_csv(r"C:\Users\Fate\Downloads\ML\HPD\Dec2019_address.csv",index=False)

In [18]:
import glob
import os
os.chdir(r"C:\Users\Fate\Downloads\ML\HPD")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
all_filenames

['Apr2020_address.csv',
 'Apr2021_address.csv',
 'Apr2022_address.csv',
 'Aug2020_address.csv',
 'Aug2021_address.csv',
 'Aug2022_address.csv',
 'Dec2020_address.csv',
 'Dec2021_address.csv',
 'Feb2020_address.csv',
 'Feb2021_address.csv',
 'Feb2022_address.csv',
 'Jan2020_address.csv',
 'Jan2021_address.csv',
 'Jan2022_address.csv',
 'Jul2020_address.csv',
 'Jul2021_address.csv',
 'Jul2022_address.csv',
 'Jun2020_address.csv',
 'Jun2021_address.csv',
 'Jun2022_address.csv',
 'Mar2020_address.csv',
 'Mar2021_address.csv',
 'Mar2022_address.csv',
 'May2020_address.csv',
 'May2021_address.csv',
 'May2022_address.csv',
 'Nov2020_address.csv',
 'Nov2021_address.csv',
 'Oct2020_address.csv',
 'Oct2021_address.csv',
 'Oct2022_address.csv',
 'Sep2020_address.csv',
 'Sep2021_address.csv',
 'Sep2022_address.csv']

In [19]:
#combine all files in the list
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
#export to csv
combined_csv.to_csv( "combined.csv", index=False, encoding='utf-8-sig')

In [22]:
df = pd.read_csv(r"C:\Users\Fate\Downloads\ML\HPD\combined.csv")
df = df.iloc[:,1:]
df.head(30)

Unnamed: 0,data,event,address,event_type
0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash
1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash
2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting
3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting
4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash
5,04-27-20,Investigation into Fatal Shooting at 10615 Mea...,"10615 Meadowglen Lane, Houston, TX, USA",Fatal Shooting
6,04-24-20,Investigation into Fatal Shooting at 4600 Whit...,"4600 White Rock Street, Houston, TX, USA",Fatal Shooting
7,04-24-20,Investigation into Fatal Shooting at 4714 Ward...,"4714 Ward Street, Houston, TX, USA",Fatal Shooting
8,04-24-20,Investigation into Fatal Shooting at 8900 Sout...,"8900 South Braeswood Boulevard, Houston, TX, USA",Fatal Shooting
9,04-23-20,Investigation into Fatal Crash at 6000 Nunn St...,"6000 Nunn Street, Houston, TX, USA",Fatal Crash


In [23]:
df.shape

(1513, 4)

In [2]:
import geopy
import geopandas
from geopy.geocoders import GoogleV3,Nominatim
locator = Nominatim(user_agent="myGeocoder")

geolocator = GoogleV3(api_key = "AIzaSyDYRmuTlApwfH9a699U6z82AaptC93M3FQ")

In [25]:
location = geolocator.geocode("4139 Regal Stone Ln, Sugar Land, USA")
print("Latitude = {}, Longitude = {}".format(location.latitude, location.longitude))

Latitude = 29.544439, Longitude = -95.5702195


In [26]:
latitude=[]
longitude=[]
for address in df['address'][:200]:
    latitude.append(geolocator.geocode(address).latitude)
    longitude.append(geolocator.geocode(address).longitude)
print(len(latitude))
# df['latitude'] = latitude
# df['longitude'] = longitude
# df.head()

200


In [27]:
df1 = df.iloc[:200,]
df1['latitude'] = latitude
df1['longitude'] = longitude
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,data,event,address,event_type,latitude,longitude
0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash,29.812368,-95.331982
1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678
2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting,29.684594,-95.366678
3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting,29.834904,-95.275063
4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash,29.66269,-95.400054


In [28]:
df2 = df.iloc[200:400,]
latitude=[]
longitude=[]
for address in df2['address']:
    latitude.append(geolocator.geocode(address).latitude)
    longitude.append(geolocator.geocode(address).longitude)
print(len(latitude))

200


In [29]:
df2['latitude'] = latitude
df2['longitude'] = longitude
df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,data,event,address,event_type,latitude,longitude
200,8-17-21,Investigation into Fatal Shooting at 9551 Fann...,"9551 Fannin Street, Houston, TX, USA",Fatal Shooting,29.664159,-95.401858
201,8-16-21,Investigation into Fatal Shooting at 9602 Jens...,"9602 Jensen Drive, Houston, TX, USA",Fatal Shooting,29.850129,-95.341567
202,8-16-21,Investigation into Fatal Shooting at 9000 West...,"9000 Westheimer Road, Houston, TX, USA",Fatal Shooting,29.73803,-95.52306
203,8-16-21,Investigation into Fatal Crash at 10100 Bisson...,"10100 Bissonnet Street, Houston, TX, USA",Fatal Crash,29.674939,-95.556565
204,8-16-21,Investigation into Fatal Crash at 4600 Fannin ...,"4600 Fannin Street, Houston, TX, USA",Fatal Crash,29.732182,-95.38339


In [30]:
df3 = df.iloc[400:600,]
latitude=[]
longitude=[]
for address in df3['address']:
    latitude.append(geolocator.geocode(address).latitude)
    longitude.append(geolocator.geocode(address).longitude)
print(len(latitude))

200


In [31]:
df3['latitude'] = latitude
df3['longitude'] = longitude
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,data,event,address,event_type,latitude,longitude
400,2-20-21,Investigation into Fatal Shooting at 5402 Chen...,"5402 Chennault Road, Houston, TX, USA",Fatal Shooting,29.682975,-95.337839
401,2-20-21,Investigation into Fatal Crash at 12000 Almeda...,"12000 Almeda Road, Houston, TX, USA",Fatal Crash,29.641639,-95.40964
402,2-20-21,Investigation into Fatal Shooting at 12610 Ash...,"12610 Ashford Meadow Drive, Houston, TX, USA",Fatal Shooting,29.733859,-95.6074
403,2-18-21,Investigation into Fatal Shooting at 7914 Lave...,"7914 Lavender Street, Houston, TX, USA",Fatal Shooting,29.832598,-95.321074
404,2-18-21,Investigation into Fatal Shooting of Armed Rob...,"7405 Airline Drive, Houston, TX, USA",Fatal Shooting,29.86846,-95.385643


In [32]:
df4 = df.iloc[600:800,]
latitude=[]
longitude=[]
for address in df4['address']:
    latitude.append(geolocator.geocode(address).latitude)
    longitude.append(geolocator.geocode(address).longitude)
print(len(latitude))

200


In [33]:
df4['latitude'] = latitude
df4['longitude'] = longitude
df4.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,data,event,address,event_type,latitude,longitude
600,01-07-22,Investigation into Fatal Crash at 3800 Gulf Fr...,"3800 Gulf Freeway, Houston, TX, USA",Fatal Crash,29.695659,-95.289485
601,01-06-22,Investigation into Fatal Shooting at 10934 Che...,"10934 Cheeves Drive, Houston, TX, USA",Fatal Shooting,29.874732,-95.292671
602,01-06-22,Investigation into Fatal Shooting at 6300 Scar...,"6300 Scarlet Drive, Houston, TX, USA",Fatal Shooting,29.647392,-95.317868
603,01-05-22,Investigation into Fatal Crash at 9300 Pagewoo...,"9300 Pagewood Lane, Houston, TX, USA",Fatal Crash,29.726007,-95.529161
604,01-04-22,Investigation into Fatal Crash at 13900 Main S...,"13900 Main Street, Houston, TX, USA",Fatal Crash,29.644148,-95.470061


In [34]:
df5 = df.iloc[800:1000,]
latitude=[]
longitude=[]
for address in df5['address']:
    latitude.append(geolocator.geocode(address).latitude)
    longitude.append(geolocator.geocode(address).longitude)
print(len(latitude))

200


In [35]:
df5['latitude'] = latitude
df5['longitude'] = longitude
df5.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,data,event,address,event_type,latitude,longitude
800,6-14-21,Investigation into Fatal Shooting at 7750 Drou...,"7750 Drouet Street, Houston, TX, USA",Fatal Shooting,29.668887,-95.282776
801,6-14-21,Investigation into Fatal Shooting at 218 Penns...,"218 Pennsylvania Street, Houston, TX, USA",Fatal Shooting,29.737224,-95.258221
802,6-11-21,Investigation into Fatal Shooting at 9000 Bunn...,"9000 Bunny Run Drive, Houston, TX, USA",Fatal Shooting,29.889684,-95.418197
803,6-11-21,Investigation into Fatal Shooting at 12601 Sou...,"12601 South Green Road, Houston, TX, USA",Fatal Shooting,29.620264,-95.215861
804,6-10-21,Investigation into Fatal Shooting at 6023 Glen...,"6023 Glenhurst Drive, Houston, TX, USA",Fatal Shooting,29.670888,-95.327347


In [37]:
df6 = df.iloc[1000:1200,]
latitude=[]
longitude=[]
for address in df6['address']:
    latitude.append(geolocator.geocode(address).latitude)
    longitude.append(geolocator.geocode(address).longitude)
print(len(latitude))

200


In [38]:
df6['latitude'] = latitude
df6['longitude'] = longitude
df6.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,data,event,address,event_type,latitude,longitude
1000,05-18-20,Investigation into Fatal Crash at 12018 Briar ...,"12018 Briar Forest Drive, Houston, TX, USA",Fatal Crash,29.751917,-95.592521
1001,05-18-20,Investigation into Fatal Crash at 11515 U.S. H...,"11515 U.S. Highway 90, Houston, TX, USA",Fatal Crash,30.061479,-94.232592
1002,05-15-20,Investigation into Fatal Shooting at 13223 Cha...,"13223 Champions Centre Drive, Houston, TX, USA",Fatal Shooting,29.968207,-95.538229
1003,05-14-20,Investigation into Fatal Shooting at 2800 Post...,"2800 Post Oak Boulevard, Houston, TX, USA",Fatal Shooting,29.737077,-95.462218
1004,05-12-20,Investigation into Fatal Crash at 12700 Northw...,"12700 Northwest Freeway, Houston, TX, USA",Fatal Crash,29.84038,-95.489481


In [41]:
df7 = df.iloc[1200:1400,]
latitude=[]
longitude=[]
for address in df7['address']:
    if geolocator.geocode(address) == None:
        latitude.append(0)
        longitude.append(0)
    else:
        latitude.append(geolocator.geocode(address).latitude)
        longitude.append(geolocator.geocode(address).longitude)
print(len(latitude))

200


In [42]:
df7['latitude'] = latitude
df7['longitude'] = longitude
df7.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,data,event,address,event_type,latitude,longitude
1200,11-02-21,Investigation into Fatal Crash at 5400 Westhei...,"5400 Westheimer Road, Houston, TX, USA",Fatal Crash,29.738785,-95.472753
1201,11-02-21,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678
1202,11-01-21,Investigation into Fatal Shooting at 12777 Ash...,"12777 Ashford Point Drive, Houston, TX, USA",Fatal Shooting,29.718942,-95.608734
1203,11-01-21,Investigation into Fatal Shooting at 670 Maxey...,"670 Maxey Road, Houston, TX, USA",Fatal Shooting,29.777637,-95.21975
1204,11-01-21,Investigation into Fatal Crash at 4400 South S...,"4400 South Sam Houston Parkway East, Houston, ...",Fatal Crash,29.598787,-95.346948


In [43]:
df8 = df.iloc[1400:,]
latitude=[]
longitude=[]
for address in df8['address']:
    if geolocator.geocode(address) == None:
        latitude.append(0)
        longitude.append(0)
    else:
        latitude.append(geolocator.geocode(address).latitude)
        longitude.append(geolocator.geocode(address).longitude)
print(len(latitude))

113


In [44]:
df8['latitude'] = latitude
df8['longitude'] = longitude
df8.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,data,event,address,event_type,latitude,longitude
1400,09-14-20,Investigation into Fatal Crash at 12700 Westhe...,"12700 Westheimer Road, Houston, TX, USA",Fatal Crash,29.737135,-95.606573
1401,09-14-20,Investigation into Fatal Shooting at 2910 Reed...,"2910 Reed Road, Houston, TX, USA",Fatal Shooting,29.65701,-95.382218
1402,09-14-20,Investigation into Fatal Crash at 3800 Goodhop...,"3800 Goodhope Street, Houston, TX, USA",Fatal Crash,29.68253,-95.368819
1403,09-14-20,Investigation into Fatal Shooting at 6600 Nort...,"6600 North Wayside Drive, Houston, TX, USA",Fatal Shooting,29.823126,-95.284045
1404,09-11-20,Investigation into Fatal Shooting at 10121 Win...,"10121 Windmill Lake Boulevard, Houston, TX, USA",Fatal Shooting,29.62013,-95.240984


In [45]:
dfAll = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8])
dfAll.shape

(1513, 6)

In [48]:
dfAll = dfAll[dfAll['latitude']>0]
dfAll.head()

Unnamed: 0,data,event,address,event_type,latitude,longitude
0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash,29.812368,-95.331982
1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678
2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting,29.684594,-95.366678
3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting,29.834904,-95.275063
4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash,29.66269,-95.400054


In [3]:
#dfAll.to_csv(r"C:\Users\Fate\Downloads\ML\HPD\allAddress.csv")
dfAll = pd.read_csv(r"C:\Users\Fate\Downloads\ML\HPD\allAddressRoad.csv")
dfAll = dfAll.iloc[:,1:]
dfAll.head()

Unnamed: 0,data,event,address,event_type,latitude,longitude,road,road_name
0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash,29.812368,-95.331982,5800 Eastex Freeway,Eastex Freeway
1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678,4300 Telephone Road,Telephone Road
2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting,29.684594,-95.366678,3826 Seabrook Street,Seabrook Street
3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting,29.834904,-95.275063,8301 Darlington Drive,Darlington Drive
4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash,29.66269,-95.400054,11100 Almeda Road,Almeda Road


In [4]:
dfAll['road'] = np.nan
dfAll.head()

Unnamed: 0.1,Unnamed: 0,data,event,address,event_type,latitude,longitude,road
0,0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash,29.812368,-95.331982,
1,1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678,
2,2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting,29.684594,-95.366678,
3,3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting,29.834904,-95.275063,
4,4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash,29.66269,-95.400054,


In [12]:
filter(str.isalpha, "5800 Eastex Freeway")

<filter at 0x1fd0bee15c0>

In [13]:
for i in range(len(dfAll)):
    dfAll.iloc[i,7] = dfAll.iloc[i,3].split(",")[0]
dfAll.head()

Unnamed: 0.1,Unnamed: 0,data,event,address,event_type,latitude,longitude,road
0,0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash,29.812368,-95.331982,5800 Eastex Freeway
1,1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678,4300 Telephone Road
2,2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting,29.684594,-95.366678,3826 Seabrook Street
3,3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting,29.834904,-95.275063,8301 Darlington Drive
4,4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash,29.66269,-95.400054,11100 Almeda Road


In [14]:
dfAll['road_name'] = np.nan
dfAll.head()

Unnamed: 0.1,Unnamed: 0,data,event,address,event_type,latitude,longitude,road,road_name
0,0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash,29.812368,-95.331982,5800 Eastex Freeway,
1,1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678,4300 Telephone Road,
2,2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting,29.684594,-95.366678,3826 Seabrook Street,
3,3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting,29.834904,-95.275063,8301 Darlington Drive,
4,4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash,29.66269,-95.400054,11100 Almeda Road,


In [16]:
for i in range(len(dfAll)):
    dfAll.iloc[i,8] = ' '.join(dfAll.iloc[i,7].split(" ")[1:])
dfAll.head()

Unnamed: 0.1,Unnamed: 0,data,event,address,event_type,latitude,longitude,road,road_name
0,0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash,29.812368,-95.331982,5800 Eastex Freeway,Eastex Freeway
1,1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678,4300 Telephone Road,Telephone Road
2,2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting,29.684594,-95.366678,3826 Seabrook Street,Seabrook Street
3,3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting,29.834904,-95.275063,8301 Darlington Drive,Darlington Drive
4,4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash,29.66269,-95.400054,11100 Almeda Road,Almeda Road


In [17]:
dfAll = dfAll.iloc[:,1:]
dfAll.to_csv(r"C:\Users\Fate\Downloads\ML\HPD\allAddressRoad.csv")

In [9]:
from geopy.geocoders import Nominatim, GoogleV3
  
# initialize Nominatim API 
locator = Nominatim(user_agent="geoapiExercises", timeout=None)
geolocator = GoogleV3(api_key = "AIzaSyDYRmuTlApwfH9a699U6z82AaptC93M3FQ")
  
# place input by geek
place = "5500 North Freeway, Houston, TX, USA"
location = locator.geocode(place)
  
# traverse the data
data = location.raw
# print(str(geolocator.geocode(place)).split(",")[2].split(" ")[2])
loc_data = data['display_name'].split()
print("Full Location")
print(loc_data)
print("Zip code : ",loc_data[-3])

Full Location
['North', 'Freeway,', 'Houston,', 'Harris', 'County,', 'Texas,', '77022,', 'United', 'States']
Zip code :  77022,


In [5]:
dfAll.shape

(1496, 7)

In [27]:
df1 = dfAll.iloc[:200,:]
df2 = dfAll.iloc[200:400,:]
df3 = dfAll.iloc[400:600,:]
# df4 = dfAll.iloc[600:800,:]
# df5 = dfAll.iloc[800:1000,:]
# df6 = dfAll.iloc[1000:1200,:]
# df7 = dfAll.iloc[1200:1400,:]
# df8 = dfAll.iloc[1400:,:]

In [None]:
place = "5800 Eastex Freeway, Houston, TX, USA"
geolocation = locator.geocode(place)
  
# traverse the data
data = location.raw
print(str(geolocator.geocode(place)).split(",")[2].split(" ")[2])

In [32]:
zip = []
for i in df3['address']:
    if locator.geocode(i) == None:
        zip.append("00000")
    else:
        zip.append(locator.geocode(i).raw['display_name'].split()[-3][:5])
        #zip.append(str(geolocator.geocode(i)).split(",")[2].split(" ")[2])
len(zip)

200

In [33]:
df3['zip'] = zip
df3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,data,event,address,event_type,latitude,longitude,road,road_name,zip
400,2-20-21,Investigation into Fatal Shooting at 5402 Chen...,"5402 Chennault Road, Houston, TX, USA",Fatal Shooting,29.682975,-95.337839,5402 Chennault Road,Chennault Road,77033
401,2-20-21,Investigation into Fatal Crash at 12000 Almeda...,"12000 Almeda Road, Houston, TX, USA",Fatal Crash,29.641639,-95.40964,12000 Almeda Road,Almeda Road,77045
402,2-20-21,Investigation into Fatal Shooting at 12610 Ash...,"12610 Ashford Meadow Drive, Houston, TX, USA",Fatal Shooting,29.733859,-95.6074,12610 Ashford Meadow Drive,Ashford Meadow Drive,77082
403,2-18-21,Investigation into Fatal Shooting at 7914 Lave...,"7914 Lavender Street, Houston, TX, USA",Fatal Shooting,29.832598,-95.321074,7914 Lavender Street,Lavender Street,77016
404,2-18-21,Investigation into Fatal Shooting of Armed Rob...,"7405 Airline Drive, Houston, TX, USA",Fatal Shooting,29.86846,-95.385643,7405 Airline Drive,Airline Drive,77076


In [105]:
df600 = pd.concat([df1,df2,df3])
df600.shape

(600, 7)

In [26]:
df4.head()

Unnamed: 0,data,event,address,event_type,latitude,longitude,road,road_name,zip
600,01-07-22,Investigation into Fatal Crash at 3800 Gulf Fr...,"3800 Gulf Freeway, Houston, TX, USA",Fatal Crash,29.695659,-95.289485,3800 Gulf Freeway,Gulf Freeway,77587
601,01-06-22,Investigation into Fatal Shooting at 10934 Che...,"10934 Cheeves Drive, Houston, TX, USA",Fatal Shooting,29.874732,-95.292671,10934 Cheeves Drive,Cheeves Drive,77016
602,01-06-22,Investigation into Fatal Shooting at 6300 Scar...,"6300 Scarlet Drive, Houston, TX, USA",Fatal Shooting,29.647392,-95.317868,6300 Scarlet Drive,Scarlet Drive,77048
603,01-05-22,Investigation into Fatal Crash at 9300 Pagewoo...,"9300 Pagewood Lane, Houston, TX, USA",Fatal Crash,29.726007,-95.529161,9300 Pagewood Lane,Pagewood Lane,77063
604,01-04-22,Investigation into Fatal Crash at 13900 Main S...,"13900 Main Street, Houston, TX, USA",Fatal Crash,29.644148,-95.470061,13900 Main Street,Main Street,77035


In [34]:
df = pd.concat([df1,df2,df3,df4,df5,df6,df7,df8])
df.head()

Unnamed: 0,data,event,address,event_type,latitude,longitude,road,road_name,zip
0,04-30-20,Investigation into Fatal Crash at 5800 Eastex ...,"5800 Eastex Freeway, Houston, TX, USA",Fatal Crash,29.812368,-95.331982,5800 Eastex Freeway,Eastex Freeway,77026
1,04-28-20,Investigation into Fatal Crash at 4300 Telepho...,"4300 Telephone Road, Houston, TX, USA",Fatal Crash,29.70293,-95.304678,4300 Telephone Road,Telephone Road,77087
2,04-28-20,Investigation into Fatal Shooting at 3826 Seab...,"3826 Seabrook Street, Houston, TX, USA",Fatal Shooting,29.684594,-95.366678,3826 Seabrook Street,Seabrook Street,77021
3,04-27-20,Investigation into Fatal Shooting at 8301 Darl...,"8301 Darlington Drive, Houston, TX, USA",Fatal Shooting,29.834904,-95.275063,8301 Darlington Drive,Darlington Drive,77028
4,04-27-20,Investigation into Fatal Crash at 11100 Almeda...,"11100 Almeda Road, Houston, TX, USA",Fatal Crash,29.66269,-95.400054,11100 Almeda Road,Almeda Road,77021


In [35]:
df.to_csv(r"C:\Users\Fate\Downloads\ML\HPD\df3years.csv",index=False)

In [36]:
df['zip'].value_counts()

00000    109
77022     56
77036     48
77074     41
77034     40
        ... 
77450      1
77083      1
77014      1
77346      1
77058      1
Name: zip, Length: 99, dtype: int64