In [1]:
import pandas as pd
import pickle
pd.set_option('display.max_columns', None)

In [157]:
# read in data from 2010-2019
df_2019 = pd.read_csv("https://data.lacity.org/api/views/63jg-8b9z/rows.csv")

In [142]:
# read in data from 2020
df_2020 = pd.read_csv("https://data.lacity.org/api/views/2nrs-mtv8/rows.csv")

In [158]:
# check if column names are the same
df_2020.columns == df_2019.columns

array([ True,  True,  True,  True, False,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [159]:
# remove trailing space in 'AREA' so the column names match between the two dataframes
df_2019.rename(columns={"AREA ": "AREA"}, inplace = True)

In [204]:
# combine data into one dataframe
df = pd.concat([df_2019, df_2020])

In [205]:
# Map the lowering function to all column names
df.columns = map(str.lower, df.columns)

In [206]:
# Remove spaces from column names
df.columns = df.columns.str.replace(" ", "_")

In [207]:
## Convert date columns to date objects
df["date_rptd"] = pd.to_datetime(df["date_rptd"])
df["date_occ"]= pd.to_datetime(df["date_occ"])

In [208]:
df.shape

(2210104, 28)

## How many crimes were reported over the past 5 years?

In [229]:
# Check how many crimes were reported over the past 5 years
# Calculate current date and offset by 5 years
(df["date_rptd"] > (pd.datetime.now()- pd.DateOffset(years=5))).sum()

1118305

### A: 111,8305 crimes were reported in the past 5 years

## List the top five reported crimes for each year for the past 5 years.

In [230]:
# Select the last 5 years as a dataframe
df_5 = df[df["date_rptd"] > (pd.datetime.now()- pd.DateOffset(years=5))]

In [240]:
df_5["date_rptd"][0].year

2020

In [245]:
df_5.iloc[0]["date_rptd"].year

2015

In [246]:
df_5['year'] = pd.DatetimeIndex(df_5['date_rptd']).year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [248]:
df_5.head()

Unnamed: 0,dr_no,date_rptd,date_occ,time_occ,area,area_name,rpt_dist_no,part_1-2,crm_cd,crm_cd_desc,mocodes,vict_age,vict_sex,vict_descent,premis_cd,premis_desc,weapon_used_cd,weapon_desc,status,status_desc,crm_cd_1,crm_cd_2,crm_cd_3,crm_cd_4,location,cross_street,lat,lon,year
207941,150116398,2015-07-09,2010-07-09,2125,1,Central,111,2,624,BATTERY - SIMPLE ASSAULT,0416 1218 2004,47,M,H,102.0,SIDEWALK,400.0,"STRONG-ARM (HANDS, FIST, FEET OR BODILY FORCE)",AO,Adult Other,624.0,,,,ALAMEDA,ALHAMBRA,34.0601,-118.2366,2015
207952,150214866,2015-07-21,2010-03-01,1200,2,Rampart,238,1,341,"THEFT-GRAND ($950.01 & OVER)EXCPT,GUNS,FOWL,LI...",0344 0352,55,F,H,501.0,SINGLE FAMILY DWELLING,,,IC,Invest Cont,341.0,,,,1400 W TEMPLE ST,,34.0673,-118.2566,2015
207953,150215013,2015-07-24,2010-03-16,800,2,Rampart,233,2,354,THEFT OF IDENTITY,0100,34,F,K,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,,IC,Invest Cont,354.0,,,,200 S LAFAYETTE PARK PL,,34.0692,-118.2792,2015
207954,150218856,2015-10-02,2010-04-01,1000,2,Rampart,248,2,354,THEFT OF IDENTITY,0100 1822,52,M,A,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,,IC,Invest Cont,354.0,,,,300 COLUMBIA AV,,34.0594,-118.2644,2015
207955,150223156,2015-12-28,2010-01-01,900,2,Rampart,212,2,354,THEFT OF IDENTITY,0928 1822,20,M,H,502.0,"MULTI-UNIT DWELLING (APARTMENT, DUPLEX, ETC)",,,IC,Invest Cont,354.0,,,,600 N WESTMORELAND AV,,34.0818,-118.288,2015


In [262]:
df_5.groupby("year")["crm_cd_1"].agg(pd.Series.mode)

year
2015    108903
2016    226101
2017    231787
2018    230683
2019    220565
2020    100259
Name: crm_cd_1, dtype: int64

In [267]:
df_5.groupby("year")["crm_cd_1"].value_counts(ascending = False)[:5].index.tolist()

[(2015, 624.0), (2015, 510.0), (2015, 440.0), (2015, 330.0), (2015, 354.0)]

## Pickle for later use

In [191]:
# save data for later use
pickle.dump(df, open("./data/crime_data.pkl", "wb"))

In [268]:
# save data for later use
pickle.dump(df_5, open("./data/crime_data_5.pkl", "wb"))