# 3.Preprocessing and Training

### Import Libaries

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler,OneHotEncoder,LabelEncoder
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression


### 1.Load Data

In [2]:
datafilepath = "../data/interim/top_5_crime_data.csv"
df_top5_crime = pd.read_csv(datafilepath,index_col =0)

In [3]:
df_top5_crime.head(3)

Unnamed: 0_level_0,TIME_OCC,AREA,AREA_NAME,CRM_CD,CRM_CD_DESC,VICT_AGE,VICT_SEX,VICT_DESCENT,latitude,longitude,MTH_OCC,DAY_OCC,YEAR_OCC,WEEKDAY_OCC_ID,WEEKDAY_OCC,TIME_OCC_TYPE_ID,TIME_OCC_TYPE,ID
DATE_OCC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2010-01-07,2005,1,Central,330,BURGLARY FROM VEHICLE,46,M,Hispanic/Latin/Mexican,34.0389,-118.2643,1,7,2010,3,Thursday,1,PM,1
2010-01-14,1445,1,Central,624,BATTERY - SIMPLE ASSAULT,38,F,Black,34.064,-118.2375,1,14,2010,3,Thursday,1,PM,2
2010-01-29,1630,1,Central,330,BURGLARY FROM VEHICLE,25,F,White,34.0454,-118.236,1,29,2010,4,Friday,1,PM,3


In [4]:
df_top5_crime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1022993 entries, 2010-01-07 to 2022-02-08
Data columns (total 18 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   TIME_OCC          1022993 non-null  int64  
 1   AREA              1022993 non-null  int64  
 2   AREA_NAME         1022993 non-null  object 
 3   CRM_CD            1022993 non-null  int64  
 4   CRM_CD_DESC       1022993 non-null  object 
 5   VICT_AGE          1022993 non-null  int64  
 6   VICT_SEX          1022993 non-null  object 
 7   VICT_DESCENT      800204 non-null   object 
 8   latitude          1022993 non-null  float64
 9   longitude         1022993 non-null  float64
 10  MTH_OCC           1022993 non-null  int64  
 11  DAY_OCC           1022993 non-null  int64  
 12  YEAR_OCC          1022993 non-null  int64  
 13  WEEKDAY_OCC_ID    1022993 non-null  int64  
 14  WEEKDAY_OCC       1022993 non-null  object 
 15  TIME_OCC_TYPE_ID  1022993 non-null  int64 

In [5]:
df_top5_crime.describe()

Unnamed: 0,TIME_OCC,AREA,CRM_CD,VICT_AGE,latitude,longitude,MTH_OCC,DAY_OCC,YEAR_OCC,WEEKDAY_OCC_ID,TIME_OCC_TYPE_ID,ID
count,1022993.0,1022993.0,1022993.0,1022993.0,1022993.0,1022993.0,1022993.0,1022993.0,1022993.0,1022993.0,1022993.0,1022993.0
mean,1392.614,10.81163,453.0597,28.84398,34.04551,-118.252,6.554466,15.76682,2016.0,3.016785,0.8837001,511497.0
std,644.2113,5.969563,118.7708,22.3759,1.019937,3.522246,3.445166,8.812633,3.759735,1.977221,0.3205844,295312.8
min,1.0,1.0,310.0,0.0,0.0,-118.8279,1.0,1.0,2010.0,0.0,0.0,1.0
25%,945.0,6.0,330.0,0.0,34.0128,-118.4327,4.0,8.0,2013.0,1.0,1.0,255749.0
50%,1500.0,11.0,440.0,30.0,34.0611,-118.3288,7.0,16.0,2016.0,3.0,1.0,511497.0
75%,1915.0,16.0,510.0,45.0,34.1668,-118.2761,10.0,23.0,2019.0,5.0,1.0,767245.0
max,2359.0,21.0,624.0,118.0,34.7907,0.0,12.0,31.0,2022.0,6.0,1.0,1022993.0


In [6]:
df_top5_crime.shape

(1022993, 18)

### 2.Identify and convert categorical variables into dummy/indicator features.

In [7]:
columns_to_drop = ['ID']
df_top5_crime.drop(columns=columns_to_drop, axis=1, inplace=True)

In [8]:
# Identify categorical columns to convert to dummy variables
categorical_cols = df_top5_crime.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
print(categorical_cols)

['AREA_NAME', 'CRM_CD_DESC', 'VICT_SEX', 'VICT_DESCENT', 'WEEKDAY_OCC', 'TIME_OCC_TYPE']


<b>Encoding categorical variables like VICT_SEX and VICT_DESCENT since some machine learning models require numerical input.Encode categorical variables to be used in some types of modeling later.</b>

In [9]:
#Encode categorical variables
encoder = LabelEncoder()

df_top5_crime['VICT_SEX_Encoded'] = encoder.fit_transform(df_top5_crime['VICT_SEX'])
df_top5_crime['VICT_DESCENT_Encoded'] = encoder.fit_transform(df_top5_crime['VICT_DESCENT'])

Categorical columns are converted to string type for consistency before applying one-hot encoding.

In [10]:
# Convert all categorical columns to strings to ensure consistency
df_top5_crime[categorical_cols] = df_top5_crime[categorical_cols].astype(str)

In [11]:
df_top5_crime.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1022993 entries, 2010-01-07 to 2022-02-08
Data columns (total 19 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   TIME_OCC              1022993 non-null  int64  
 1   AREA                  1022993 non-null  int64  
 2   AREA_NAME             1022993 non-null  object 
 3   CRM_CD                1022993 non-null  int64  
 4   CRM_CD_DESC           1022993 non-null  object 
 5   VICT_AGE              1022993 non-null  int64  
 6   VICT_SEX              1022993 non-null  object 
 7   VICT_DESCENT          1022993 non-null  object 
 8   latitude              1022993 non-null  float64
 9   longitude             1022993 non-null  float64
 10  MTH_OCC               1022993 non-null  int64  
 11  DAY_OCC               1022993 non-null  int64  
 12  YEAR_OCC              1022993 non-null  int64  
 13  WEEKDAY_OCC_ID        1022993 non-null  int64  
 14  WEEKDAY_OCC           10229

<p>Check if there is any duplicate records after removing raw data unique IDs .</p>

In [12]:
#Remove dupliate and keep as new cleaned dataframes
df_top5_crime= df_top5_crime.drop_duplicates(keep='last')

In [13]:
df_top5_crime.shape

(1022993, 19)

One-hot encoding is applied, transforming categorical variables into a set of binary columns, each representing a category's presence or absence.

In [14]:
# Apply one-hot encoding to categorical variables
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
df_dummies = pd.DataFrame(ohe.fit_transform(df_top5_crime[categorical_cols]), columns=ohe.get_feature_names(categorical_cols))

In [15]:
df_dummies.head(5)

Unnamed: 0,AREA_NAME_77th Street,AREA_NAME_Central,AREA_NAME_Devonshire,AREA_NAME_Foothill,AREA_NAME_Harbor,AREA_NAME_Hollenbeck,AREA_NAME_Hollywood,AREA_NAME_Mission,AREA_NAME_N Hollywood,AREA_NAME_Newton,...,VICT_DESCENT_nan,WEEKDAY_OCC_Friday,WEEKDAY_OCC_Monday,WEEKDAY_OCC_Saturday,WEEKDAY_OCC_Sunday,WEEKDAY_OCC_Thursday,WEEKDAY_OCC_Tuesday,WEEKDAY_OCC_Wednesday,TIME_OCC_TYPE_AM,TIME_OCC_TYPE_PM
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [16]:
df_dummies.shape

(1022993, 58)

In [17]:
df_top5_crime = df_top5_crime.reset_index(drop=True)
df_dummies = df_dummies.reset_index(drop=True)

# concatenate the new dummy variables
df_top5_crime = pd.concat([df_top5_crime, df_dummies], axis=1)

In [18]:
df_top5_crime.shape

(1022993, 77)

In [19]:
df_top5_crime.head(3)

Unnamed: 0,TIME_OCC,AREA,AREA_NAME,CRM_CD,CRM_CD_DESC,VICT_AGE,VICT_SEX,VICT_DESCENT,latitude,longitude,...,VICT_DESCENT_nan,WEEKDAY_OCC_Friday,WEEKDAY_OCC_Monday,WEEKDAY_OCC_Saturday,WEEKDAY_OCC_Sunday,WEEKDAY_OCC_Thursday,WEEKDAY_OCC_Tuesday,WEEKDAY_OCC_Wednesday,TIME_OCC_TYPE_AM,TIME_OCC_TYPE_PM
0,2005,1,Central,330,BURGLARY FROM VEHICLE,46,M,Hispanic/Latin/Mexican,34.0389,-118.2643,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1445,1,Central,624,BATTERY - SIMPLE ASSAULT,38,F,Black,34.064,-118.2375,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1630,1,Central,330,BURGLARY FROM VEHICLE,25,F,White,34.0454,-118.236,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


### 3.Standardize the magnitude of numeric features using a scaler.

In [20]:
# Identify numerical columns for scaling
numerical_cols = df_top5_crime.select_dtypes(include=['int64', 'float64']).columns.tolist()
print (numerical_cols)

['TIME_OCC', 'AREA', 'CRM_CD', 'VICT_AGE', 'latitude', 'longitude', 'MTH_OCC', 'DAY_OCC', 'YEAR_OCC', 'WEEKDAY_OCC_ID', 'TIME_OCC_TYPE_ID', 'VICT_SEX_Encoded', 'VICT_DESCENT_Encoded', 'AREA_NAME_77th Street', 'AREA_NAME_Central', 'AREA_NAME_Devonshire', 'AREA_NAME_Foothill', 'AREA_NAME_Harbor', 'AREA_NAME_Hollenbeck', 'AREA_NAME_Hollywood', 'AREA_NAME_Mission', 'AREA_NAME_N Hollywood', 'AREA_NAME_Newton', 'AREA_NAME_Northeast', 'AREA_NAME_Olympic', 'AREA_NAME_Pacific', 'AREA_NAME_Rampart', 'AREA_NAME_Southeast', 'AREA_NAME_Southwest', 'AREA_NAME_Topanga', 'AREA_NAME_Van Nuys', 'AREA_NAME_West LA', 'AREA_NAME_West Valley', 'AREA_NAME_Wilshire', 'CRM_CD_DESC_BATTERY - SIMPLE ASSAULT', 'CRM_CD_DESC_BURGLARY', 'CRM_CD_DESC_BURGLARY FROM VEHICLE', 'CRM_CD_DESC_THEFT PLAIN - PETTY ($950 & UNDER)', 'CRM_CD_DESC_VEHICLE - STOLEN', 'VICT_SEX_F', 'VICT_SEX_M', 'VICT_SEX_Unknown', 'VICT_DESCENT_American Indian/Alaskan Native', 'VICT_DESCENT_Asian Indian', 'VICT_DESCENT_Black', 'VICT_DESCENT_Cambo

Numerical columns are standardized using StandardScaler to ensure uniformity in their magnitude, which is crucial for many machine learning algorithms. But, It's worth noting that some machine learning algorithms, such as decision trees and random forests, are not sensitive to the scale of the input features, so scaling may not be necessary in all cases. However, for algorithms like Support Vector Machines, k-Nearest Neighbors, and neural networks, scaling is usually recommended.

In [21]:
# Apply standard scaling to numerical features

scaler = StandardScaler()
#!df_top5_crime[numerical_cols] = scaler.fit_transform(df_top5_crime[numerical_cols])

scaled_data = scaler.fit_transform(df_top5_crime[numerical_cols])
new_column_names = [col + '_sc' for col in numerical_cols]
df_scaled = pd.DataFrame(scaled_data, columns=new_column_names)


df_top5_crime = pd.concat([df_top5_crime, df_scaled], axis=1)

df_top5_crime.head()

Unnamed: 0,TIME_OCC,AREA,AREA_NAME,CRM_CD,CRM_CD_DESC,VICT_AGE,VICT_SEX,VICT_DESCENT,latitude,longitude,...,VICT_DESCENT_nan_sc,WEEKDAY_OCC_Friday_sc,WEEKDAY_OCC_Monday_sc,WEEKDAY_OCC_Saturday_sc,WEEKDAY_OCC_Sunday_sc,WEEKDAY_OCC_Thursday_sc,WEEKDAY_OCC_Tuesday_sc,WEEKDAY_OCC_Wednesday_sc,TIME_OCC_TYPE_AM_sc,TIME_OCC_TYPE_PM_sc
0,2005,1,Central,330,BURGLARY FROM VEHICLE,46,M,Hispanic/Latin/Mexican,34.0389,-118.2643,...,-0.527651,-0.429537,-0.401668,-0.413561,-0.395942,2.455598,-0.402462,-0.40697,-0.362775,0.362775
1,1445,1,Central,624,BATTERY - SIMPLE ASSAULT,38,F,Black,34.064,-118.2375,...,-0.527651,-0.429537,-0.401668,-0.413561,-0.395942,2.455598,-0.402462,-0.40697,-0.362775,0.362775
2,1630,1,Central,330,BURGLARY FROM VEHICLE,25,F,White,34.0454,-118.236,...,-0.527651,2.328087,-0.401668,-0.413561,-0.395942,-0.407233,-0.402462,-0.40697,-0.362775,0.362775
3,130,1,Central,440,THEFT PLAIN - PETTY ($950 & UNDER),31,M,Hispanic/Latin/Mexican,34.0472,-118.2371,...,-0.527651,-0.429537,-0.401668,-0.413561,2.525623,-0.407233,-0.402462,-0.40697,-0.362775,0.362775
4,1000,1,Central,624,BATTERY - SIMPLE ASSAULT,36,F,Hispanic/Latin/Mexican,34.0382,-118.2665,...,-0.527651,-0.429537,-0.401668,-0.413561,2.525623,-0.407233,-0.402462,-0.40697,2.75653,-2.75653


In [22]:
# Combine with numeric data
df_numeric = df_top5_crime[numerical_cols]
df_processed = pd.concat([df_numeric, df_dummies], axis=1)

In [23]:
df_numeric.head(3)

Unnamed: 0,TIME_OCC,AREA,CRM_CD,VICT_AGE,latitude,longitude,MTH_OCC,DAY_OCC,YEAR_OCC,WEEKDAY_OCC_ID,...,VICT_DESCENT_nan,WEEKDAY_OCC_Friday,WEEKDAY_OCC_Monday,WEEKDAY_OCC_Saturday,WEEKDAY_OCC_Sunday,WEEKDAY_OCC_Thursday,WEEKDAY_OCC_Tuesday,WEEKDAY_OCC_Wednesday,TIME_OCC_TYPE_AM,TIME_OCC_TYPE_PM
0,2005,1,330,46,34.0389,-118.2643,1,7,2010,3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1445,1,624,38,34.064,-118.2375,1,14,2010,3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1630,1,330,25,34.0454,-118.236,1,29,2010,4,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [24]:
df_processed.head(3)

Unnamed: 0,TIME_OCC,AREA,CRM_CD,VICT_AGE,latitude,longitude,MTH_OCC,DAY_OCC,YEAR_OCC,WEEKDAY_OCC_ID,...,VICT_DESCENT_nan,WEEKDAY_OCC_Friday,WEEKDAY_OCC_Monday,WEEKDAY_OCC_Saturday,WEEKDAY_OCC_Sunday,WEEKDAY_OCC_Thursday,WEEKDAY_OCC_Tuesday,WEEKDAY_OCC_Wednesday,TIME_OCC_TYPE_AM,TIME_OCC_TYPE_PM
0,2005,1,330,46,34.0389,-118.2643,1,7,2010,3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1445,1,624,38,34.064,-118.2375,1,14,2010,3,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1630,1,330,25,34.0454,-118.236,1,29,2010,4,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [25]:
# Apply standard scaling to the numerical features
scaler = StandardScaler()
df_processed[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])

In [26]:
df_processed.head(3)

Unnamed: 0,TIME_OCC,AREA,CRM_CD,VICT_AGE,latitude,longitude,MTH_OCC,DAY_OCC,YEAR_OCC,WEEKDAY_OCC_ID,...,VICT_DESCENT_nan,WEEKDAY_OCC_Friday,WEEKDAY_OCC_Monday,WEEKDAY_OCC_Saturday,WEEKDAY_OCC_Sunday,WEEKDAY_OCC_Thursday,WEEKDAY_OCC_Tuesday,WEEKDAY_OCC_Wednesday,TIME_OCC_TYPE_AM,TIME_OCC_TYPE_PM
0,0.950598,-1.64361,-1.036111,0.766719,-0.006485,-0.003488,-1.61225,-0.994802,-1.59587,-0.008489,...,-0.527651,-0.429537,-0.401668,-0.413561,-0.395942,2.455598,-0.402462,-0.40697,-0.362775,0.362775
1,0.081318,-1.64361,1.439246,0.409191,0.018125,0.004121,-1.61225,-0.200487,-1.59587,-0.008489,...,-0.527651,-0.429537,-0.401668,-0.413561,-0.395942,2.455598,-0.402462,-0.40697,-0.362775,0.362775
2,0.368491,-1.64361,-1.036111,-0.171791,-0.000112,0.004547,-1.61225,1.501616,-1.59587,0.497271,...,-0.527651,2.328087,-0.401668,-0.413561,-0.395942,-0.407233,-0.402462,-0.40697,-0.362775,0.362775


In [27]:
df_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1022993 entries, 0 to 1022992
Columns: 129 entries, TIME_OCC to TIME_OCC_TYPE_PM
dtypes: float64(129)
memory usage: 1006.8 MB


df_processed is the dataset prepared for general use. However, due to time constraints, not all data will be utilized in the modeling process. Consequently, other aggregated datasets will be prepared, focusing specifically on Crime Category prediction using numerical columns.</br>
1. prepare dataset with total count of crimes with feature columns </br>
2. prepare dataset with total count of crimes pivoted by crime type </br>

In [28]:
df_crimetotal_byvictarea = df_top5_crime.groupby(['AREA', 'VICT_AGE', 'VICT_DESCENT_Encoded', 'VICT_SEX_Encoded', 'WEEKDAY_OCC_ID']).size().reset_index(name='CRIME_Total')
df_crimetotal_byvictarea = df_crimetotal_byvictarea.sort_values(by='CRIME_Total', ascending=False)
df_crimetotal_byvictarea.head()

Unnamed: 0,AREA,VICT_AGE,VICT_DESCENT_Encoded,VICT_SEX_Encoded,WEEKDAY_OCC_ID,CRIME_Total
53954,12,0,19,2,5,2766
53953,12,0,19,2,4,2689
53955,12,0,19,2,6,2564
53949,12,0,19,2,0,2528
57998,13,0,19,2,5,2527


In [34]:
df_crimetotal_bytypevictarea = df_top5_crime.groupby(['CRM_CD','AREA', 'VICT_AGE', 'VICT_DESCENT_Encoded', 'VICT_SEX_Encoded', 'WEEKDAY_OCC_ID']).size().reset_index(name='CRIME_Total')
df_crimetotal_bytypevictarea = df_crimetotal_bytypevictarea.sort_values(by='CRIME_Total', ascending=False)
df_crimetotal_bytypevictarea.head()

Unnamed: 0,CRM_CD,AREA,VICT_AGE,VICT_DESCENT_Encoded,VICT_SEX_Encoded,WEEKDAY_OCC_ID,CRIME_Total
175815,510,12,0,19,2,5,2765
175814,510,12,0,19,2,4,2689
175816,510,12,0,19,2,6,2562
175926,510,13,0,19,2,5,2527
175810,510,12,0,19,2,0,2525


In [35]:
df_crimetotal_bytypevictarea.shape

(240797, 7)

In [36]:
df_top5crimedata_num = df_top5_crime.groupby(['AREA', 'VICT_AGE', 'VICT_DESCENT_Encoded', 'VICT_SEX_Encoded', 'WEEKDAY_OCC_ID', 'MTH_OCC', 'DAY_OCC', 'YEAR_OCC', 'CRM_CD']).size().reset_index(name='Count')
df_top5crimedata_num = df_top5crimedata_num.sort_values(by='Count', ascending=False)


print(df_top5crimedata_num.shape)
df_top5crimedata_num.head(5)



(871089, 10)


Unnamed: 0,AREA,VICT_AGE,VICT_DESCENT_Encoded,VICT_SEX_Encoded,WEEKDAY_OCC_ID,MTH_OCC,DAY_OCC,YEAR_OCC,CRM_CD,Count
211469,6,0,16,2,5,5,30,2020,310,17
2283,1,0,18,1,5,5,30,2020,310,16
520752,13,0,19,2,6,6,14,2020,510,16
519497,13,0,19,2,4,7,17,2020,510,15
556377,14,0,19,2,1,10,12,2021,510,14


In [37]:
df_top5crimedata_pv = df_top5crimedata_num.pivot(index=['AREA', 'VICT_AGE', 'VICT_DESCENT_Encoded', 'VICT_SEX_Encoded', 'WEEKDAY_OCC_ID','MTH_OCC','DAY_OCC','YEAR_OCC'], columns='CRM_CD', values='Count')


print(df_top5crimedata_pv.shape)
df_top5crimedata_pv.head()


(858283, 5)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,CRM_CD,310,330,440,510,624
AREA,VICT_AGE,VICT_DESCENT_Encoded,VICT_SEX_Encoded,WEEKDAY_OCC_ID,MTH_OCC,DAY_OCC,YEAR_OCC,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,0,1,1,2,10,30,2013,,,1.0,,
1,0,2,0,0,3,10,2014,,,,,1.0
1,0,2,0,0,6,11,2012,,,,,1.0
1,0,2,0,0,9,14,2015,,,,,1.0
1,0,2,0,0,9,27,2021,,,,,1.0


<b>Next Step is splitting the dataset into features (X) and the target variable (y), which in this case is CRM_CD_DESC. The dataset is split into feature (X) and target (y) sets.</b>

In [38]:
###Inital try

#! numerical_cols = ['AREA','VICT_AGE','VICT_DESCENT_Encoded','MTH_OCC','DAY_OCC','YEAR_OCC','latitue','longtitude']
#!categorical_cols = ['WEEKDAY_OCC','TIME_OCC_TYPE','AREA_NAME','VICT_SEX','VICT_DESCENT']

#!target_var_cols =['AREA','WEEKDAY_OCC_ID','VICT_AGE','VICT_SEX_Encoded','VICT_DESCENT_Encoded']

#! target_var_cols =['AREA','WEEKDAY_OCC_ID']
#!X = df_top5_crime[numerical_cols]
#! y = df_top5_crime['CRM_CD_DESC']

In [39]:
### 1st - Count prediction/forecast try

numerical_cols = df_crimetotal_byvictarea.select_dtypes(include=['int64', 'float64']).columns.tolist()
print (numerical_cols)

# Splitting the dataset into features and target variable
X = df_crimetotal_byvictarea.drop(columns='CRIME_Total', axis=1)
y = df_crimetotal_byvictarea['CRIME_Total']


['AREA', 'VICT_AGE', 'VICT_DESCENT_Encoded', 'VICT_SEX_Encoded', 'WEEKDAY_OCC_ID', 'CRIME_Total']


In [40]:
### 2nd - Count prediction/forecast try

numerical_cols = df_crimetotal_bytypevictarea.select_dtypes(include=['int64', 'float64']).columns.tolist()
print (numerical_cols)

# Splitting the dataset into features and target variable
X = df_crimetotal_bytypevictarea.drop(columns='CRIME_Total', axis=1)
y = df_crimetotal_bytypevictarea['CRIME_Total']


['CRM_CD', 'AREA', 'VICT_AGE', 'VICT_DESCENT_Encoded', 'VICT_SEX_Encoded', 'WEEKDAY_OCC_ID', 'CRIME_Total']


In [41]:
print (X)

        CRM_CD  AREA  VICT_AGE  VICT_DESCENT_Encoded  VICT_SEX_Encoded  \
175815     510    12         0                    19                 2   
175814     510    12         0                    19                 2   
175816     510    12         0                    19                 2   
175926     510    13         0                    19                 2   
175810     510    12         0                    19                 2   
...        ...   ...       ...                   ...               ...   
114727     330    21        77                    18                 0   
114726     330    21        77                    18                 0   
114725     330    21        77                    18                 0   
114724     330    21        77                    12                 1   
240796     624    21        99                    13                 1   

        WEEKDAY_OCC_ID  
175815               5  
175814               4  
175816               6  
175926     

In [42]:
print (y)

175815    2765
175814    2689
175816    2562
175926    2527
175810    2525
          ... 
114727       1
114726       1
114725       1
114724       1
240796       1
Name: CRIME_Total, Length: 240797, dtype: int64


### 4.Split the dataset into training and testing datasets.

The dataset is further split into training and testing sets, following the common practice of holding out a portion of the data for model evaluation.

In [43]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

In [44]:
# Output the shapes of the resulting dataframes
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((192637, 6), (48160, 6), (192637,), (48160,))

Save the data for next step , Modeling.

In [45]:
##### saving as Pickle
import pickle

save_datafilepath = "../data/interim/train_test_split.pkl"
with open(save_datafilepath, 'wb') as file:
    pickle.dump([X,y,X_train, X_test, y_train, y_test], file)

In [47]:
filepath = "../data/interim/top5_crime_pre.csv"
df_top5_crime.to_csv(filepath, index=False) 