In [None]:
# import dependencies
import pandas as pd
from datetime import datetime as dt

In [None]:
# read first csv file
# Forward slash keeps the path portable across operating systems; the previous
# "Resources\C..." spelling relied on "\C" being an unrecognised escape sequence,
# which Python warns about and which only resolves as a path separator on Windows.
file1 = "Resources/Crime_Data_from_2010_to_2019.csv"
la_crime1 = pd.read_csv(file1, encoding="utf-8")
# NOTE(review): positional slice - assumes every row from position 1672533 onward
# belongs to 2018 or 2019 in this particular export; confirm if the CSV is refreshed.
la_crime1 = la_crime1[1672533: ] # selecting only 2018 and 2019 data
la_crime1.info()

In [None]:
# read second csv file
# Forward slash keeps the path portable across operating systems; the previous
# "Resources\C..." spelling relied on "\C" being an unrecognised escape sequence,
# which Python warns about and which only resolves as a path separator on Windows.
file2 = "Resources/Crime_Data_from_2020_to_Present.csv"
la_crime2 = pd.read_csv(file2, encoding="utf-8")
# NOTE(review): positional slice - assumes the first 640388 rows are exactly the
# 2020-2022 records in this particular export; confirm if the CSV is refreshed.
la_crime2 = la_crime2[: 640388] # selecting 2020-2022 data
la_crime2.info()

In [None]:
# Display the column names of the first (2010-2019) frame
la_crime1.columns

In [None]:
# Display the second frame's column names to compare them with the first set
la_crime2.columns
# Note: the area-code column is spelled 'AREA ' (trailing space) in the first set but 'AREA' in this one

In [None]:
# Keep only the variables of interest. Both frames use the same fields, except
# that the area-code column is named 'AREA ' (trailing space) in the first frame.
_id_cols = ['DR_NO', 'DATE OCC']
_detail_cols = ['AREA NAME', 'Crm Cd', 'Crm Cd Desc',
                'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc',
                'Weapon Used Cd', 'Weapon Desc', 'LAT', 'LON']
la_crime1 = la_crime1[_id_cols + ['AREA '] + _detail_cols]
la_crime2 = la_crime2[_id_cols + ['AREA'] + _detail_cols]
la_crime2.head()

In [None]:
# The area-code column of the first frame carries a trailing space ('AREA ');
# rename it so both frames share the same 'AREA' column name.
area_fix = {"AREA ": "AREA"}
la_crime1 = la_crime1.rename(area_fix, axis="columns")
la_crime1.head()

In [None]:
# Recode date variable: parse the 'DATE OCC' text of each frame into a datetime column named 'date'

# First frame: parsed with an explicit month/day/year 12-hour timestamp layout
occurred_1 = pd.to_datetime(la_crime1['DATE OCC'], format='%m/%d/%Y %I:%M:%S %p')
la_crime1['date'] = occurred_1
# Second frame: pandas is asked to infer the layout
occurred_2 = pd.to_datetime(la_crime2['DATE OCC'], infer_datetime_format=True)
la_crime2['date'] = occurred_2

In [None]:
# Stack the two frames row-wise into one frame with a fresh 0..n-1 index
concatenated_df = pd.concat([la_crime1, la_crime2], axis=0, ignore_index=True)
la_crime = concatenated_df
la_crime.info()

In [None]:
# check values of VICTIM variables
# Open questions noted while inspecting the counts:
#   - Victim Age: some data points look inaccurate (e.g., is age 0 a baby or a missing value?)
#   - Victim Gender: need to figure out what the Xs and the Hs are
#   - Victim Ethnicity: need to figure out all the codes
for heading, column in (("Victim Age", "Vict Age"),
                        ("Victim Gender", "Vict Sex"),
                        ("Victim Ethnicity", "Vict Descent")):
    print(heading)
    print("-" * 30)
    print(la_crime[column].value_counts())
    print("*" * 74)

In [None]:
# Keep only records whose victim gender is 'F' or 'M'
is_valid_sex = la_crime["Vict Sex"].isin(['F', 'M'])
la_crime = la_crime.loc[is_valid_sex, :]
# Keep only records whose victim age is above 0 and at most 90
is_valid_age = la_crime['Vict Age'].gt(0) & la_crime["Vict Age"].le(90)
la_crime = la_crime.loc[is_valid_age, :]
# Inspect which crimes are recorded against victims younger than 10
temp = la_crime.loc[la_crime["Vict Age"] < 10 , :]
temp['Crm Cd Desc'].value_counts()

In [None]:
# check values of AREA variables
#   - Area Code: use as primary key for the LA Area table, keep as foreign key in the Crimes table
#   - LA Area Name: use in the LA Area table
for heading, column in (("Area Code", "AREA"),
                        ("LA Area Name", "AREA NAME")):
    print(heading)
    print("-" * 30)
    print(la_crime[column].value_counts())
    print("*" * 74)

In [None]:
# check values of CRIME TYPE variables (20 most frequent values of each)
#   - Crime Code: use as primary key for the Crime Type table, keep as foreign key in the Crimes table
#   - Crime Code Description: use in the Crime Type table (pick only top 20 crimes)
for heading, column in (("Crime Code", "Crm Cd"),
                        ("Crime Code Description", "Crm Cd Desc")):
    top_counts = la_crime[column].value_counts().head(20)
    print(heading)
    print("-" * 30)
    print(top_counts)
    print("*" * 74)

In [None]:
# Recoding crime codes to simplify data
# code_map translates each raw 'Crm Cd Desc' text into a broader category label,
# which is stored in the new 'crime' column. Descriptions that are not listed
# here get a missing value in 'crime' (Series.map with a dict).
# NOTE(review): two labels below are misspelled - 'Violation of Reatraining Order'
# and 'Sexual Assualt or Rape'. They are left untouched because they become data
# values; confirm whether anything downstream matches on the exact text before fixing.
code_map = {'BATTERY - SIMPLE ASSAULT': 'Assault', 
'BURGLARY FROM VEHICLE': 'Burglary from Vehicle', 
'THEFT OF IDENTITY': 'Identity Theft', 
'INTIMATE PARTNER - SIMPLE ASSAULT': 'Domestic Violence', 
'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT': 'Assault', 
'THEFT PLAIN - PETTY ($950 & UNDER)': 'Theft, Robbery, or Burglary', 
'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)': 'Vandalism', 
'BURGLARY': 'Theft, Robbery, or Burglary', 
'ROBBERY': 'Theft, Robbery, or Burglary', 
'THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LIVESTK,PROD': 'Theft, Robbery, or Burglary', 
'THEFT FROM MOTOR VEHICLE - GRAND ($950.01 AND OVER)': 'Burglary from Vehicle', 
'VANDALISM - MISDEAMEANOR ($399 OR UNDER)': 'Vandalism', 
'THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)': 'Burglary from Vehicle', 
'CRIMINAL THREATS - NO WEAPON DISPLAYED': 'Criminal threat', 
'BRANDISH WEAPON': 'Brandish Weapon', 
'INTIMATE PARTNER - AGGRAVATED ASSAULT': 'Domestic Violence', 
'VIOLATION OF RESTRAINING ORDER': 'Violation of Reatraining Order', 
'BIKE - STOLEN': 'Bike Theft', 
'TRESPASSING': 'Trespassing', 
'SHOPLIFTING - PETTY THEFT ($950 & UNDER)': 'Theft, Robbery, or Burglary', 
'BUNCO, GRAND THEFT': 'Theft, Robbery, or Burglary', 
'THEFT, PERSON': 'Theft, Robbery, or Burglary', 
'BATTERY WITH SEXUAL CONTACT': 'Sexual Assualt or Rape', 
'ATTEMPTED ROBBERY': 'Theft, Robbery, or Burglary', 
'OTHER ASSAULT': 'Assault', 
'RAPE, FORCIBLE': 'Sexual Assualt or Rape', 
'CHILD ABUSE (PHYSICAL) - SIMPLE ASSAULT': 'Child Maltreatment', 
'DOCUMENT FORGERY / STOLEN FELONY': 'Forgery', 
'BURGLARY, ATTEMPTED': 'Theft, Robbery, or Burglary', 
'BUNCO, PETTY THEFT': 'Theft, Robbery, or Burglary', 
'CRM AGNST CHLD (13 OR UNDER) (14-15 & SUSP 10 YRS OLDER)': 'Child Maltreatment', 
'VEHICLE - ATTEMPT STOLEN': 'Vehicle Theft', 
'EXTORTION': 'Extortion', 
'SEXUAL PENETRATION W/FOREIGN OBJECT': 'Sexual Assualt or Rape', 
'SHOPLIFTING-GRAND THEFT ($950.01 & OVER)': 'Theft, Robbery, or Burglary', 
'SEX,UNLAWFUL(INC MUTUAL CONSENT, PENETRATION W/ FRGN OBJ': 'Sexual Assualt or Rape', 
'CHILD NEGLECT (SEE 300 W.I.C.)': 'Child Maltreatment', 
'ORAL COPULATION': 'Sexual Assualt or Rape', 
'THEFT FROM MOTOR VEHICLE - ATTEMPT': 'Burglary from Vehicle', 
'BURGLARY FROM VEHICLE, ATTEMPTED': 'Burglary from Vehicle', 
'SODOMY/SEXUAL CONTACT B/W PENIS OF ONE PERS TO ANUS OTH': 'Sexual Assualt or Rape', 
'VEHICLE, STOLEN - OTHER (MOTORIZED SCOOTERS, BIKES, ETC)': 'Vehicle Theft', 
'CHILD ABUSE (PHYSICAL) - AGGRAVATED ASSAULT': 'Child Maltreatment', 
'RAPE, ATTEMPTED': 'Sexual Assualt or Rape', 
'THEFT PLAIN - ATTEMPT': 'Theft, Robbery, or Burglary', 
'VEHICLE - STOLEN': 'Vehicle Theft', 
'ASSAULT WITH DEADLY WEAPON ON POLICE OFFICER': 'Assault', 
'LEWD/LASCIVIOUS ACTS WITH CHILD': 'Child Maltreatment', 
'CHILD ABANDONMENT': 'Child Maltreatment'}
# Add the simplified category as a new 'crime' column and show how many records fall in each
la_crime["crime"] = la_crime["Crm Cd Desc"].map(code_map)
la_crime['crime'].value_counts()

In [None]:
# Keep only the records that received one of the new crime categories
has_crime_category = la_crime['crime'].notna()
la_crime = la_crime[has_crime_category]
la_crime.info()

In [None]:
# check values of PREMISE variables
#   - Premise Code: use as primary key for the Premise table, keep as foreign key in the Crimes table
#   - Premise Description: use in the Premise table (consider recoding low counts
#     into 'other' or filtering these records out)
premise_code_counts = la_crime["Premis Cd"].value_counts()
# Only positions 91 to 119 of the description counts are shown
premise_desc_counts = la_crime["Premis Desc"].value_counts()[91:120]

print("Premise Code")
print("-" * 30)
print(premise_code_counts)
print("*" * 74)
print("Premise Description")
print("-" * 30)
print(premise_desc_counts)
print("*" * 74)

In [None]:
# Recoding premise codes to simplify data
# premise_map translates each raw 'Premis Desc' text into a broader location
# category, which is stored in the new 'premise' column. Descriptions that are
# not listed here get a missing value in 'premise' (Series.map with a dict).
premise_map = {'TRANSPORTATION FACILITY (AIRPORT)': 'Airport', 
'BANK': 'Bank/ATM', 
'AUTOMATED TELLER MACHINE (ATM)': 'Bank/ATM', 
'BEACH': 'Beach', 
'MTA BUS': 'Bus Stop/Station', 
'BUS STOP': 'Bus Stop/Station', 
'MTA - RED LINE - WESTLAKE/MACARTHUR PARK': 'Bus Stop/Station', 
'MTA - RED LINE - UNION STATION': 'Bus Stop/Station', 
'MTA - RED LINE - 7TH AND METRO CENTER': 'Bus Stop/Station', 
'LA UNION STATION (NOT LINE SPECIFIC)': 'Bus Stop/Station', 
'MUNICIPAL BUS LINE INCLUDES LADOT/DASH': 'Bus Stop/Station', 
'MTA - RED LINE - PERSHING SQUARE': 'Bus Stop/Station', 
'MTA - RED LINE - HOLLYWOOD/VINE': 'Bus Stop/Station', 
'MTA - EXPO LINE - EXPO/WESTERN': 'Bus Stop/Station', 
'MTA - RED LINE - WILSHIRE/VERMONT': 'Bus Stop/Station', 
'MTA - RED LINE - HOLLYWOOD/HIGHLAND': 'Bus Stop/Station', 
'LAUNDROMAT': 'Business', 
'BAR/COCKTAIL/NIGHTCLUB': 'Business', 
'NIGHT CLUB (OPEN EVENINGS ONLY)': 'Business', 
"COFFEE SHOP (STARBUCKS, COFFEE BEAN, PEET'S, ETC.)": 'Business', 
'MEDICAL/DENTAL OFFICES': 'Business', 
'OTHER BUSINESS': 'Business', 
'RESTAURANT/FAST FOOD': 'Business', 
'HEALTH SPA/GYM': 'Business', 
'OFFICE BUILDING/OFFICE': 'Business', 
'AUTO DEALERSHIP (CHEVY, FORD, BMW, MERCEDES, ETC.)': 'Business', 
'LIBRARY': 'Business', 
'BAR/SPORTS BAR (OPEN DAY & NIGHT)': 'Business', 
'BEAUTY/BARBER SHOP': 'Business', 
'POST OFFICE': 'Business', 
'STUDIO (FILM/PHOTOGRAPHIC/MUSIC)': 'Business', 
'NAIL SALON': 'Business', 
'MEDICAL MARIJUANA FACILITIES/BUSINESSES': 'Business', 
'DELIVERY SERVICE (FED EX, UPS, COURIERS,COURIER SERVICE)*': 'Business', 
'HARDWARE/BUILDING SUPPLY': 'Business', 
'MASSAGE PARLOR': 'Business', 
'CHURCH/CHAPEL (CHANGED 03-03 FROM CHURCH/TEMPLE)': 'Church/Temple', 
'CYBERSPACE': 'Cyberspace', 
'DRIVEWAY': 'Driveway', 
'STREET': 'Freeway/Street', 
'ALLEY': 'Freeway/Street', 
'FREEWAY': 'Freeway/Street', 
'GAS STATION': 'Gas Station', 
'HOSPITAL': 'Hospital', 
'HOTEL': 'Hotel/Motel', 
'MOTEL': 'Hotel/Motel', 
'MARKET': 'Market', 
'NURSING/CONVALESCENT/RETIREMENT HOME': 'Nursing Home', 
'PARK/PLAYGROUND': 'Park', 
'PARKING LOT': 'Parking Lot/Garage', 
'PARKING UNDERGROUND/BUILDING': 'Parking Lot/Garage', 
'SINGLE FAMILY DWELLING': 'Residence', 
'MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)': 'Residence', 
'GARAGE/CARPORT': 'Residence', 
'OTHER RESIDENCE': 'Residence', 
'PORCH, RESIDENTIAL': 'Residence', 
'GROUP HOME': 'Residence', 
"MOBILE HOME/TRAILERS/CONSTRUCTION TRAILERS/RV'S/MOTORHOME": 'Residence', 
'CONDOMINIUM/TOWNHOUSE': 'Residence', 
"SINGLE RESIDENCE OCCUPANCY (SRO'S) LOCATIONS": 'Residence', 
'BALCONY*': 'Residence', 
'APARTMENT/CONDO COMMON LAUNDRY ROOM': 'Residence', 
'HIGH SCHOOL': 'School/College', 
'JUNIOR HIGH SCHOOL': 'School/College', 
'COLLEGE/JUNIOR COLLEGE/UNIVERSITY': 'School/College', 
'ELEMENTARY SCHOOL': 'School/College', 
'SPECIALTY SCHOOL/OTHER': 'School/College', 
'FRAT HOUSE/SORORITY/DORMITORY': 'School/College', 
'PRIVATE SCHOOL/PRESCHOOL': 'School/College', 
'SIDEWALK': 'Sidewalk', 
'DEPARTMENT STORE': 'Store', 
'OTHER STORE': 'Store', 
'CLOTHING STORE': 'Store', 
'LIQUOR STORE': 'Store', 
'SHOPPING MALL (COMMON AREA)': 'Store', 
'CELL PHONE STORE': 'Store', 
'MINI-MART': 'Store', 
'DRUG STORE': 'Store', 
'DISCOUNT STORE (99 CENT,DOLLAR,ETC.': 'Store', 
'MEMBERSHIP STORE (COSTCO,SAMS CLUB)*': 'Store', 
"DIY CENTER (LOWE'S,HOME DEPOT,OSH,CONTRACTORS WAREHOUSE)": 'Store', 
'JEWELRY STORE': 'Store', 
'AUTO SUPPLY STORE*': 'Store', 
'TOBACCO SHOP': 'Store', 
'ELECTRONICS STORE (IE:RADIO SHACK, ETC.)': 'Store', 
'BEAUTY SUPPLY STORE': 'Store', 
'PHARMACY INSIDE STORE OR SUPERMARKET*': 'Store', 
'MISSIONS/SHELTERS': 'Temp Housing', 
'TRANSIENT ENCAMPMENT': 'Temp Housing', 
'TRANSITIONAL HOUSING/HALFWAY HOUSE': 'Temp Housing', 
'THEATRE/MOVIE': 'Theatre/Movie', 
'VEHICLE, PASSENGER/TRUCK': 'Vehicle', 
'YARD (RESIDENTIAL/BUSINESS)': 'Yard'}
# Add the simplified category as a new 'premise' column and show how many records fall in each
la_crime["premise"] = la_crime["Premis Desc"].map(premise_map)
la_crime['premise'].value_counts()

In [None]:
# Keep only the records that received one of the new premise categories
has_premise_category = la_crime['premise'].notna()
la_crime = la_crime[has_premise_category]
la_crime.info()

In [None]:
# check values of WEAPON variables
weapon_code_counts = la_crime["Weapon Used Cd"].value_counts()
# Only the 30 most frequent weapon descriptions are shown
weapon_desc_counts = la_crime["Weapon Desc"].value_counts().head(30)

print("Weapon Used Code")
print("-" * 30)
print(weapon_code_counts)
print("*" * 74)
print("Weapon Description")
print("-" * 30)
print(weapon_desc_counts)
print("*" * 74)

In [None]:
# Recode Ethnicity values
# The single-letter 'Vict Descent' codes are grouped by the label they receive,
# then flattened into the code -> label dictionary used for the recode.
_codes_by_ethnicity = {
    'Asian': ['A', 'C', 'F', 'J', 'K', 'V'],
    'Black': ['B'],
    'Other': ['D', 'G', 'L', 'O', 'S'],
    'Hispanic': ['H'],
    'Native American': ['I'],
    'Pacific Islander/Hawaiian': ['P', 'U'],
    'White': ['W'],
    'Asian Indian': ['Z'],
}
ethnicity_map = {code: label
                 for label, codes in _codes_by_ethnicity.items()
                 for code in codes}

la_crime["victim_ethnicity"] = la_crime["Vict Descent"].map(ethnicity_map)
la_crime["victim_ethnicity"].value_counts()

In [None]:
# Recode Sex values: spell out the 'F' / 'M' codes of 'Vict Sex' in a new
# 'victim_gender' column (any other code becomes a missing value).
# A stray `la_crime["Vict Sex"]` expression that had no effect was removed.
la_crime["victim_gender"] = la_crime["Vict Sex"].map({'F': 'Female', 'M': 'Male'})
la_crime["victim_gender"].value_counts()


In [None]:
#Recode age values
print(la_crime["Vict Age"].sort_values().unique())

# Define the age categories.
# The last bin is open-ended so that it matches its '71+' label. With the previous
# upper edge of 80, ages above 80 fell outside every bin and received a missing
# category instead of '71+'.
age_bins = [0, 10, 20, 30, 40, 50, 60, 70, float('inf')]
age_labels = ['1-10', '11-20', '21-30', '31-40', '41-50', '51-60', '61-70', '71+']

# Use the cut() function to create the Age Catg variable
# (bins are right-inclusive: (0, 10] -> '1-10', (10, 20] -> '11-20', ...)
la_crime['age_catg'] = pd.cut(la_crime['Vict Age'], bins=age_bins, labels=age_labels)
la_crime['age_catg'].value_counts()

In [None]:
# Split the parsed 'date' column into numeric 'year' and 'month' columns
occurred = la_crime['date'].dt
la_crime['year'] = occurred.year
la_crime['month'] = occurred.month
la_crime['year'].value_counts()

In [None]:
# Translate the month number (1-12) into its English name
month_names = ["January", "February", "March", "April", "May", "June", "July",
               "August", "September", "October", "November", "December"]
# enumerate(..., start=1) pairs 1 with "January", 2 with "February", and so on
month_lookup = dict(enumerate(month_names, start=1))
la_crime['month_name'] = la_crime['month'].map(month_lookup)
la_crime.head()

In [None]:
# Rename the raw source columns to the snake_case names used for the database tables
database_names = {
    'DR_NO': "incident_id",
    'AREA': "area_code",
    'AREA NAME': "area_name",
    'Crm Cd': 'crime_code',
    'Premis Cd': "premise_code",
    'LAT': "lat",
    'LON': 'lon',
}
la_crime = la_crime.rename(columns=database_names)
la_crime.columns


In [None]:
# The binned age category is exposed under the name 'victim_age'
la_crime = la_crime.rename({'age_catg': 'victim_age'}, axis='columns')
la_crime.columns

### Creating tables for database

In [None]:
# main table: one row per incident, holding the date parts, the victim
# attributes, the coordinates and the three lookup-table keys
crime_table_columns = [
    'incident_id', 'year', 'month', 'month_name',
    'victim_age', 'victim_ethnicity', 'victim_gender', 'lat', 'lon',
    'area_code', 'crime_code', 'premise_code',
]
la_crime_table = la_crime[crime_table_columns]
la_crime_table.info()

In [None]:
# Change premise_code type from float to integer
la_crime_table['premise_code'] = la_crime_table['premise_code'].astype(int)
la_crime_table.info()

In [None]:
# area table: the distinct (area_code, area_name) pairs, re-indexed from 0
area_pairs = la_crime[['area_code', 'area_name']]
la_area_table = area_pairs.drop_duplicates().reset_index(drop=True)
la_area_table

In [None]:
# crime type table: the distinct (crime_code, crime) pairs, re-indexed from 0
crime_type_pairs = la_crime[['crime_code', 'crime']]
la_crime_type_table = crime_type_pairs.drop_duplicates().reset_index(drop=True)
la_crime_type_table

In [None]:
# premise table: the distinct (premise_code, premise) pairs, re-indexed from 0,
# with premise_code converted to integer
la_premise_table = (
    la_crime[['premise_code', 'premise']]
    .drop_duplicates()
    .reset_index(drop=True)
    .astype({'premise_code': int})
)
la_premise_table

## Setting up Database Schema

In [None]:
# import dependencies
from sqlalchemy import Column, Integer, String, Float, ForeignKey, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship

In [33]:
# Create the engine and connect to the database
engine = create_engine('sqlite:///Resources/la_crime.db', echo=True)

# Define the Base class to use declarative syntax
Base = declarative_base()


# Define the Crime class. Each crime row points to exactly one area, one crime
# type and one premise (many-to-one); the other side of every relationship is
# the `crimes` collection on the referenced class, so back_populates must name
# `crimes` (it previously named `crime`, which is not a relationship on any of
# the three classes).
class Crime(Base):
    """One reported incident, linked to its area, crime type and premise."""
    __tablename__ = 'crime'
    incident_id = Column(Integer, primary_key=True)
    year = Column(Integer)
    month = Column(Integer)
    month_name = Column(String)
    victim_age = Column(String)
    victim_ethnicity = Column(String)
    victim_gender = Column(String)
    lat = Column(Float)
    lon = Column(Float)
    area_code = Column(Integer, ForeignKey('area.area_code'))
    area = relationship('Area', back_populates='crimes')
    crime_code = Column(Integer, ForeignKey('crime_type.crime_code'))
    crime_type = relationship('CrimeType', back_populates='crimes')
    premise_code = Column(Integer, ForeignKey('premise.premise_code'))
    premise = relationship('Premise', back_populates='crimes')


# Define the Area class, which has a one-to-many relationship with the Crime class
class Area(Base):
    """Lookup table of area codes and their names."""
    __tablename__ = 'area'
    area_code = Column(Integer, primary_key=True)
    area_name = Column(String, nullable=False)
    crimes = relationship("Crime", back_populates="area")


# Define the Crime Type class, which has a one-to-many relationship with the Crime class
class CrimeType(Base):
    """Lookup table of crime codes and their simplified category."""
    __tablename__ = 'crime_type'
    crime_code = Column(Integer, primary_key=True)
    crime = Column(String, nullable=False)
    crimes = relationship("Crime", back_populates="crime_type")


# Define the Premise class, which has a one-to-many relationship with the Crime class
class Premise(Base):
    """Lookup table of premise codes and their simplified category."""
    __tablename__ = 'premise'
    premise_code = Column(Integer, primary_key=True)
    premise = Column(String, nullable=False)
    crimes = relationship("Crime", back_populates="premise")


# If the tables already exist, drop them, then create them from the classes above.
# drop_all only acts on tables registered on Base.metadata, so it must run after
# the classes are defined; called right after declarative_base() it did nothing.
Base.metadata.drop_all(engine)
Base.metadata.create_all(engine)

2023-04-09 12:54:43,931 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2023-04-09 12:54:43,932 INFO sqlalchemy.engine.base.Engine ()
2023-04-09 12:54:43,934 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2023-04-09 12:54:43,935 INFO sqlalchemy.engine.base.Engine ()
2023-04-09 12:54:43,954 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("crime")
2023-04-09 12:54:43,955 INFO sqlalchemy.engine.base.Engine ()
2023-04-09 12:54:43,956 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("crime")
2023-04-09 12:54:43,958 INFO sqlalchemy.engine.base.Engine ()
2023-04-09 12:54:43,959 INFO sqlalchemy.engine.base.Engine PRAGMA main.table_info("area")
2023-04-09 12:54:43,960 INFO sqlalchemy.engine.base.Engine ()
2023-04-09 12:54:43,961 INFO sqlalchemy.engine.base.Engine PRAGMA temp.table_info("area")
2023-04-09 12:54:43,961 INFO sqlalchemy.engine.base.Engine ()
2023-04-09 12:54:43,962 INFO

### Creating SQLite database

In [34]:
import sqlite3

# Write the four frames into the SQLite file and always release the connection,
# even if one of the writes fails (the connection was previously left open).
# NOTE(review): if_exists='replace' makes pandas drop each existing table and
# recreate it from the DataFrame, so tables created beforehand through
# Base.metadata.create_all (with their primary and foreign keys) do not survive
# this step. Switching to 'append' would keep that schema but needs unique keys
# in the data - confirm which behaviour is wanted.
conn = sqlite3.connect('Resources/la_crime.db')
try:
    la_crime_table.to_sql('crime', conn, if_exists='replace', index=False)
    la_area_table.to_sql('area', conn, if_exists='replace', index=False)
    la_crime_type_table.to_sql('crime_type', conn, if_exists='replace', index=False)
    la_premise_table.to_sql('premise', conn, if_exists='replace', index=False)
    conn.commit()
finally:
    conn.close()