In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import datetime
from datetime import date
from datetime import time
from datetime import datetime
#import calendar
import warnings
warnings.filterwarnings('ignore')
import io

In [2]:
# https://nikgrozev.com/2015/06/16/fast-and-simple-sampling-in-pandas-when-loading-data-from-files/
# Fast and Simple Sampling in Pandas when Loading Data From Files

import random
# The data to load
f = "Resources/LA_crime.csv"

# Count the lines
num_lines = sum(1 for l in open(f))

# Sample size - in this case ~10%
size = int(num_lines / 10)

# The row indices to skip - make sure 0 is not included to keep the header!
skip_idx = random.sample(range(1, num_lines), num_lines - size)

# Read the data
la_crime_df = pd.read_csv(f, skiprows=skip_idx)
la_crime_df.head(2)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Weapon Description,Status Code,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location
0,191504131,01/03/2019,01/02/2018,2330,15,N Hollywood,1535,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),344.0,...,,IC,Invest Cont,420,,,,TUJUNGA,HATTERAS,"(34.1758, -118.379)"
1,190704048,01/02/2019,01/02/2018,1000,7,Wilshire,774,510,VEHICLE - STOLEN,,...,,AA,Adult Arrest,510,,,,400 S CROCKER ST,,"(34.0431, -118.3482)"


In [3]:
# Remove parantesis from Location column
la_crime_df['Location_clean']=la_crime_df['Location '].str.strip('()')
la_crime_df.head(2)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Status Code,Status Description,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,Location_clean
0,191504131,01/03/2019,01/02/2018,2330,15,N Hollywood,1535,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),344.0,...,IC,Invest Cont,420,,,,TUJUNGA,HATTERAS,"(34.1758, -118.379)","34.1758, -118.379"
1,190704048,01/02/2019,01/02/2018,1000,7,Wilshire,774,510,VEHICLE - STOLEN,,...,AA,Adult Arrest,510,,,,400 S CROCKER ST,,"(34.0431, -118.3482)","34.0431, -118.3482"


In [4]:
## https://chrisalbon.com/python/data_wrangling/pandas_split_lat_and_long_into_variables/
# Create two lists for the loop results to be placed
lat = []
lon = []

# For each row in a varible,
for row in la_crime_df['Location_clean']:
    # Try to,
    try:
        # Split the row by comma and append
        # everything before the comma to lat
        lat.append(row.split(',')[0])
        # Split the row by comma and append
        # everything after the comma to lon
        lon.append(row.split(',')[1])
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
la_crime_df['Latitude'] = lat
la_crime_df['Longitude'] = lon

In [5]:
la_crime_df.head(2)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,Location_clean,Latitude,Longitude
0,191504131,01/03/2019,01/02/2018,2330,15,N Hollywood,1535,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),344.0,...,420,,,,TUJUNGA,HATTERAS,"(34.1758, -118.379)","34.1758, -118.379",34.1758,-118.379
1,190704048,01/02/2019,01/02/2018,1000,7,Wilshire,774,510,VEHICLE - STOLEN,,...,510,,,,400 S CROCKER ST,,"(34.0431, -118.3482)","34.0431, -118.3482",34.0431,-118.3482


In [6]:
print(f"LA Crime number of rows = {la_crime_df['DR Number'].count()}")

LA Crime number of rows = 7918


In [7]:
# Strip parentheses from column
la_crime_df['Location_clean']=la_crime_df['Location '].str.strip('()')
la_crime_df.head(2)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Crime Code 1,Crime Code 2,Crime Code 3,Crime Code 4,Address,Cross Street,Location,Location_clean,Latitude,Longitude
0,191504131,01/03/2019,01/02/2018,2330,15,N Hollywood,1535,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),344.0,...,420,,,,TUJUNGA,HATTERAS,"(34.1758, -118.379)","34.1758, -118.379",34.1758,-118.379
1,190704048,01/02/2019,01/02/2018,1000,7,Wilshire,774,510,VEHICLE - STOLEN,,...,510,,,,400 S CROCKER ST,,"(34.0431, -118.3482)","34.0431, -118.3482",34.0431,-118.3482


In [8]:
# Separate Date column by day, month and year and the complete date without the hours

In [9]:
la_crime_df['Year'] = [i[8:10] for i in la_crime_df['Date Reported']]

In [10]:
la_crime_df['Month'] = [i[3:5] for i in la_crime_df['Date Reported']]

In [11]:
la_crime_df['Day'] = [i[0:2] for i in la_crime_df['Date Reported']]

In [12]:
la_crime_df['Parking_Date'] = [i[0:10] for i in la_crime_df['Date Reported']]

In [13]:
la_crime_df.head(2)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Address,Cross Street,Location,Location_clean,Latitude,Longitude,Year,Month,Day,Parking_Date
0,191504131,01/03/2019,01/02/2018,2330,15,N Hollywood,1535,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),344.0,...,TUJUNGA,HATTERAS,"(34.1758, -118.379)","34.1758, -118.379",34.1758,-118.379,19,3,1,01/03/2019
1,190704048,01/02/2019,01/02/2018,1000,7,Wilshire,774,510,VEHICLE - STOLEN,,...,400 S CROCKER ST,,"(34.0431, -118.3482)","34.0431, -118.3482",34.0431,-118.3482,19,2,1,01/02/2019


In [14]:
la_crime_df.columns

Index(['DR Number', 'Date Reported', 'Date Occurred', 'Time Occurred',
       'Area ID', 'Area Name', 'Reporting District', 'Crime Code',
       'Crime Code Description', 'MO Codes', 'Victim Age', 'Victim Sex',
       'Victim Descent', 'Premise Code', 'Premise Description',
       'Weapon Used Code', 'Weapon Description', 'Status Code',
       'Status Description', 'Crime Code 1', 'Crime Code 2', 'Crime Code 3',
       'Crime Code 4', 'Address', 'Cross Street', 'Location ',
       'Location_clean', 'Latitude', 'Longitude', 'Year', 'Month', 'Day',
       'Parking_Date'],
      dtype='object')

In [15]:
# select columns to use for the project
la_crime_df_new = la_crime_df[['DR Number', 'Date Reported', 'Date Occurred', 'Time Occurred',
       'Area ID', 'Area Name', 'Reporting District', 'Crime Code',
       'Crime Code Description', 'MO Codes', 'Victim Age', 'Victim Sex',
       'Victim Descent', 'Premise Code', 'Premise Description',
       'Weapon Used Code', 'Weapon Description', 'Status Code',
       'Status Description', 'Crime Code 1', 'Crime Code 2', 'Crime Code 3',
       'Crime Code 4', 'Address', 'Cross Street','Location_clean', 'Latitude', 'Longitude', 'Year', 'Month', 'Day',
       'Parking_Date']]

In [16]:
la_crime_df_new.head(2)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Crime Code 4,Address,Cross Street,Location_clean,Latitude,Longitude,Year,Month,Day,Parking_Date
0,191504131,01/03/2019,01/02/2018,2330,15,N Hollywood,1535,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),344.0,...,,TUJUNGA,HATTERAS,"34.1758, -118.379",34.1758,-118.379,19,3,1,01/03/2019
1,190704048,01/02/2019,01/02/2018,1000,7,Wilshire,774,510,VEHICLE - STOLEN,,...,,400 S CROCKER ST,,"34.0431, -118.3482",34.0431,-118.3482,19,2,1,01/02/2019


In [17]:
# save df to a csv file
la_crime_df_new.to_csv(r'Resources\la_crime_df_new.csv')

### Connect to local database

In [18]:
# Connect to local database
rds_connection_string = "root:modelobootcamp@127.0.0.1/ladata_db"
engine = create_engine(f'mysql://{rds_connection_string}')
connection = engine.connect()

In [19]:
connection.execute('use ladata_db;')

<sqlalchemy.engine.result.ResultProxy at 0x264fd217e80>

In [20]:
engine.table_names()

['la_parking_df_project']

In [21]:
la_crime_df_new.to_sql(name='la_crime_df_project', con=engine, if_exists='replace', index=False)

In [22]:
results = engine.execute('select * from la_crime_df_project;')

### Confirm data has been added by querying the la_parking_df table

In [23]:
connection.execute('use ladata_db;')

<sqlalchemy.engine.result.ResultProxy at 0x264fee34d68>

In [24]:
pd.read_sql_query('select * from la_crime_df_project', con=engine).head(2)

Unnamed: 0,DR Number,Date Reported,Date Occurred,Time Occurred,Area ID,Area Name,Reporting District,Crime Code,Crime Code Description,MO Codes,...,Crime Code 4,Address,Cross Street,Location_clean,Latitude,Longitude,Year,Month,Day,Parking_Date
0,191504131,01/03/2019,01/02/2018,2330,15,N Hollywood,1535,420,THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER),344.0,...,,TUJUNGA,HATTERAS,"34.1758, -118.379",34.1758,-118.379,19,3,1,01/03/2019
1,190704048,01/02/2019,01/02/2018,1000,7,Wilshire,774,510,VEHICLE - STOLEN,,...,,400 S CROCKER ST,,"34.0431, -118.3482",34.0431,-118.3482,19,2,1,01/02/2019
