# Pre-Process London Crime datasets (Street, Outcome and London-Boroughs)
Street dataset pre-process before sending to Datawarehouse
1. London Crime Street Dataset
2. London Crime Outcome Dataset
***

## Common activities
### Import libraries 

In [1]:
# import all the libraries that require for project
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os

### Declare globals

In [3]:
# define paths
INPUT_PATH = "C://SHU/ADMP/Assessment_02/github-repos/admp-csvs/prod/"
OUTPUT_PATH = "C://SHU/ADMP/Assessment_02/github-repos/admp-csvs/stage/"

# create variables to store the datasets name you want to use
STREET_FILENAME = "street-london.csv"
OUTCOME_FILENAME = OUTPUT_PATH+"outcome-london.csv"
BOROUGH_FILENAME = "boroughs-london.csv"

### Declare and define functions 

In [4]:
class FILESIZE:
    SMALL = 1
    LARGE = 2
    
def read_csv_file(fileType, fileName):
    if (fileType == FILESIZE.SMALL):
        df_ret = pd.read_csv(fileName)
    else:
        # read the large csv file with specified chunksize of 10 million records
        df_chunk = pd.read_csv(fileName, chunksize=1000000)

        # append each chunk df here
        chunk_list = []   

        # each chunk is in df format
        for chunk in df_chunk:    
            # once the data filtering is done, append the chunk to list
            chunk_list.append(chunk)

        # concat the list into dataframe 
        df_ret = pd.concat(chunk_list)
        
    return df_ret

def test_duplicate():
    # check what is the differences appearing inside the duplicate record by passsing one Crime ID
    test_crime_id1 = '077db29e732f8fbfe793724d63483d23bef1990d3059ed42ed17d8cadcdc8eed'
    test_crime_id2 = '459a973a05867f80e03e312219add2c2d68b13c962b6b0aff6a788114d046c9e'
    test_crime_id3 = '584697dec6313c3f687d97c2714f80a0ab2454119816ec95fbc5d92366985aa7'
    test_crime_id4 = 'e22dfff770cd32f54941621e0b4a1000168dfa2fbfbd6afc42ac1304a18535ce'
    test_crime_id5 = '7a99fcf8b97b8db7f7cee3661dc335bd2d1a41643adc5e91405e2b7233e06d85'

    # outcome_df[street_df.Crime_ID.str.contains('3b60aed0ce6c29f63a00e44822492dcdc419b68a0974e5', na=True)]
    rec1_dup_cnt = street_df[street_df.Crime_ID == test_crime_id1].shape[0]
    rec2_dup_cnt = street_df[street_df.Crime_ID == test_crime_id2].shape[0]
    rec3_dup_cnt = street_df[street_df.Crime_ID == test_crime_id3].shape[0]
    rec4_dup_cnt = street_df[street_df.Crime_ID == test_crime_id4].shape[0]
    rec5_dup_cnt = street_df[street_df.Crime_ID == test_crime_id5].shape[0]

    # return counts
    print('First crime id duplicate record count: ', rec1_dup_cnt)
    print('Second crime id duplicate record count: ', rec2_dup_cnt)
    print('Third crime id duplicate record count: ', rec3_dup_cnt)
    print('Fourth crime id duplicate record count: ', rec4_dup_cnt)
    print('Fifth crime id duplicate record count: ', rec5_dup_cnt)

def get_total_records_cnt():
    return street_df.shape[0]

def get_dataset_cnt_summary(): 
    # Check total unique crimes available in the dataset 
    total_records = len(street_df.Crime_ID)
    total_unique_records = len(street_df.Crime_ID.unique())
    print("Total number of incidents confirmed as a crime (having duplicates): "+str(total_records))
    print("Total unique number of incidents confirmed as a crime: "+str(total_unique_records))
    print("Total number of duplicate incidents removed from the dataset: "+str(total_records-total_unique_records))


### Read CSV files and store them in respective data frames

In [5]:
# set a working directory to the location where input dataset csv file exist
os.chdir(INPUT_PATH)

# load dataframe with street data from CSV file 
street_df = read_csv_file(FILESIZE.LARGE, STREET_FILENAME)

# load dataframe with outcome data from CSV file 
outcome_df = read_csv_file(FILESIZE.LARGE, OUTCOME_FILENAME)

# store london borough data into dataframe
london_borough_df = read_csv_file(FILESIZE.SMALL, BOROUGH_FILENAME)

## Read ASB (Anti social behaviour) records then delete it
These are the records which are having a missing Null values 

In [14]:
street_df.isnull().sum()
# street_df.Crime_ID.value_counts(dropna=False)

Crime_ID                  689612
Month                          0
Reported_by                    0
Falls_within                   0
Longitude                  40456
Latitude                   40456
Location                       0
LSOA_code                  40456
LSOA_name                  40456
Crime_type                     0
Last_outcome_category     689612
Context                  3229932
dtype: int64

In [15]:
street_df.info()

# Check all columns if have values and in range
# street_df.isna().sum()

# check for valid Crime_ID and get all other records based on this id

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3229932 entries, 0 to 3229931
Data columns (total 12 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Crime_ID               object 
 1   Month                  object 
 2   Reported_by            object 
 3   Falls_within           object 
 4   Longitude              float64
 5   Latitude               float64
 6   Location               object 
 7   LSOA_code              object 
 8   LSOA_name              object 
 9   Crime_type             object 
 10  Last_outcome_category  object 
 11  Context                float64
dtypes: float64(3), object(9)
memory usage: 295.7+ MB


## Intial dataset state/count of the dataset

In [31]:
get_dataset_cnt_summary()

Total number of incidents confirmed as a crime (having duplicates): 3229932
Total unique number of incidents confirmed as a crime: 2411837
Total number of duplicate incidents removed from the dataset: 818095


## Start Pre-Processing for London Crime Street Dataset

### Handle NULL/Missing Values

In [38]:
# 01. Crime_ID: Remove null records from the street data frame.
street_df = street_df[street_df.Crime_ID.isnull() == False]

# 02. Longitude, Latitude: Fill missing values with 0
street_df.Longitude.fillna(0, inplace=True)
street_df.Latitude.fillna(0, inplace=True)

# 03. LSOA_code, LSOA_name: Fill missing values with 'Not Available' text
street_df.LSOA_code.fillna('Not Available', inplace=True)
street_df.LSOA_name.fillna('Not Available', inplace=True)

# 04. Last_outcome_category: Fill missing values with 'Not Available' text
street_df.Last_outcome_category.fillna('Not Available', inplace=True)

# 05. Context: Drop it as all fields are empty here. Hence, no significance
street_df = street_df.drop(['Context'], axis=1)

### Dataset state - after null Crime IDs removal 

In [33]:
get_dataset_cnt_summary()

Total number of incidents confirmed as a crime (having duplicates): 2540320
Total unique number of incidents confirmed as a crime: 2411836
Total number of duplicate incidents removed from the dataset: 128484


### Delete columns which are of not much significance
'Reported_by' column is not useful and hence drop it

In [39]:
# drop Reported_by column from the dataset 
street_df = street_df.drop(['Reported_by'], axis=1)

# add empty column to the pandas for storeing the Crime Status (Open or Close)
street_df['Crime_status'] = street_df.apply(lambda _: '', axis=1)

In [40]:
street_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2540320 entries, 0 to 3229931
Data columns (total 11 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Crime_ID               object 
 1   Month                  object 
 2   Falls_within           object 
 3   Longitude              float64
 4   Latitude               float64
 5   Location               object 
 6   LSOA_code              object 
 7   LSOA_name              object 
 8   Crime_type             object 
 9   Last_outcome_category  object 
 10  Crime_status           object 
dtypes: float64(2), object(9)
memory usage: 232.6+ MB


### Derive new column for Borough_name from LSOA_name

In [41]:
# remove last 5 characters from the LSOA_name (4 code and 1 space)
street_df['Borough_name'] = street_df.LSOA_name[(street_df.LSOA_name.isnull()==False)].str[:-5]

# Remove duplication for Crime_ID feature
***

### Check the counts of the records in the dataset post duplicate removal

In [10]:
get_dataset_cnt_summary()

Total number of incidents confirmed as a crime (having duplicates): 2540320
Total unique number of incidents confirmed as a crime: 2411836
Total number of duplicate incidents removed from the dataset: 128484


#### Do testing before duplication removal

In [42]:
test_duplicate()

First crime id duplicate record count:  32
Second crime id duplicate record count:  32
Third crime id duplicate record count:  30
Fourth crime id duplicate record count:  29
Fifth crime id duplicate record count:  25


#### Do sorting in descending order month basis

In [43]:
street_df = street_df.sort_values(by='Month', ascending=False)

In [44]:
# Remove all the duplicate records except first instance of it
street_df = street_df.drop_duplicates(subset='Crime_ID', keep='first')

In [48]:
test_duplicate()
# street_df.isna().sum()

First crime id duplicate record count:  1
Second crime id duplicate record count:  1
Third crime id duplicate record count:  1
Fourth crime id duplicate record count:  1
Fifth crime id duplicate record count:  1


#### Check the counts of the records in the dataset post duplicate removal

In [49]:
get_dataset_cnt_summary()

Total number of incidents confirmed as a crime (having duplicates): 2411836
Total unique number of incidents confirmed as a crime: 2411836
Total number of duplicate incidents removed from the dataset: 0


### Split columns from one to two

Split Month column to two columns named, Year and Month.

In [50]:
# Rename the Outcome_type to Latest_outcome_type
street_df.rename(columns = {'Month':'Crime_month'}, inplace = True)
    
# create Year column inside street dataframe and populate year data from the Month field
street_df['Crime_year'] = street_df['Crime_month'].str[: 4]

# update Month column with just a Month number
street_df['Crime_month'] = street_df['Crime_month'].str[: 4]+street_df['Crime_month'].str[-2: ]


In [51]:
street_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2411836 entries, 0 to 86417
Data columns (total 13 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Crime_ID               object 
 1   Crime_month            object 
 2   Falls_within           object 
 3   Longitude              float64
 4   Latitude               float64
 5   Location               object 
 6   LSOA_code              object 
 7   LSOA_name              object 
 8   Crime_type             object 
 9   Last_outcome_category  object 
 10  Crime_status           object 
 11  Borough_name           object 
 12  Crime_year             object 
dtypes: float64(2), object(11)
memory usage: 257.6+ MB


In [11]:
# STAGING_PATH = "C://SHU/ADMP/Assessment_02/github-repos/admp-csvs/staging/"
# STAGE_FILENAME = "street-staging.csv"
# street_df.to_csv(STAGING_PATH+STAGE_FILENAME, sep=',', encoding='utf-8', index=None, header = True)

## Integrate "London Borough CSV" with "Street Crime CSV"
***

### Check how many unique boroughs are present in total inside dataset

In [15]:
# Get unique london-boroghs numbers
street_df.Borough_name.unique().size

331

### Integration key used to merge between two datasets is 'Borough_name'

In [52]:
# Merge london_borough with the street data to get relevant records only
street_df = pd.merge(left=street_df, right=london_borough_df, left_on='Borough_name', right_on='Name')

# Remove un-necessary columns after merging action
street_df = street_df.drop(['Id', 'Name'], axis=1)

### Check if 33 boroughs present or not in the dataset

In [17]:
# Get unique london-boroghs numbers
street_df.Borough_name.unique().size

33

### Check dataset state - post removal of non-relevant boroughs from the dataset

In [18]:
get_dataset_cnt_summary()

Total number of incidents confirmed as a crime (having duplicates): 2368939
Total unique number of incidents confirmed as a crime: 2368939
Total number of duplicate incidents removed from the dataset: 0


## Start Pre-Processing for London Crime Outcome Dataset

In [19]:
outcome_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 977637 entries, 0 to 977636
Data columns (total 3 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   Crime_ID             977637 non-null  object
 1   Outcome_month        977637 non-null  object
 2   Latest_Outcome_type  977637 non-null  object
dtypes: object(3)
memory usage: 22.4+ MB


## Integrate "London Borough CSV" with "Outcome CSV"
***
### Integration key used to merge between two dataset is 'Crime_ID'

#### get total record count of street dataset before merging it with outcome

In [194]:
# Get unique street numbers
print('Total dataset record count (before merging with outcome dataset):',get_total_records_cnt())

# Get the total number of features before merge
print('Total feature count (before merging with outcome dataset):',street_df.shape[1])

# Column list
[street_df.columns]

Total dataset record count (before merging with outcome dataset): 2368947
Total feature count (before merging with outcome dataset): 14


[Index(['Crime_ID', 'Month', 'Falls_within', 'Longitude', 'Latitude',
        'Location', 'LSOA_code', 'LSOA_name', 'Crime_type',
        'Last_outcome_category', 'Crime_status', 'Borough_name', 'Crime_year',
        'Crime_month'],
       dtype='object')]

### Do the merging now

In [53]:
# Do the left outer joining so that all street data will be retained along with the joined outcome data
street_df = pd.merge(left=street_df, right=outcome_df, on='Crime_ID', how='left')

### Get total record count of street dataset after merging it with outcome

In [21]:
# Get unique street numbers
print('Total dataset record count (before merging with outcome dataset):',get_total_records_cnt())

# Get the total number of features before merge
print('Total feature count (before merging with outcome dataset):',street_df.shape[1])

# Column list
[street_df.columns]

Total dataset record count (before merging with outcome dataset): 2368939
Total feature count (before merging with outcome dataset): 16


[Index(['Crime_ID', 'Crime_month', 'Falls_within', 'Longitude', 'Latitude',
        'Location', 'LSOA_code', 'LSOA_name', 'Crime_type',
        'Last_outcome_category', 'Context', 'Crime_status', 'Borough_name',
        'Crime_year', 'Outcome_month', 'Latest_Outcome_type'],
       dtype='object')]

### Check last outcome type possible values for all records that do not considered as crime so far
Means these records having a NULL Latest_Outcome_type. These are the records for which outcome dataset do not have the Crime_IDs. When apply left outer join, these are the left records.

In [26]:
street_df['Last_outcome_category'][street_df.Latest_Outcome_type.isnull() == True].value_counts()

Status update unavailable                              1086671
Under investigation                                     304526
Investigation complete; no suspect identified               84
Court result unavailable                                    21
Unable to prosecute suspect                                  5
Further investigation is not in the public interest          2
Formal action is not in the public interest                  1
Name: Last_outcome_category, dtype: int64

In [27]:
# Check all the status possible 
street_df['Last_outcome_category'].value_counts()

Status update unavailable                              1119062
Investigation complete; no suspect identified           693686
Under investigation                                     305643
Local resolution                                         48660
Court result unavailable                                 47148
Offender given a caution                                 28855
Awaiting court outcome                                   24295
Offender sent to prison                                  14141
Offender given penalty notice                            12904
Defendant found not guilty                               12526
Offender given community sentence                        11899
Offender fined                                            9844
Offender given suspended prison sentence                  6152
Court case unable to proceed                              6065
Offender given conditional discharge                      4313
Unable to prosecute suspect                            

In [28]:
# check all the records with empty Latest_Outcome_type
street_df[['Crime_month','Outcome_month','Last_outcome_category', 'Latest_Outcome_type']][street_df.Latest_Outcome_type.isnull() == True]

Unnamed: 0,Crime_month,Outcome_month,Last_outcome_category,Latest_Outcome_type
0,2019-12,,Under investigation,
1,2019-12,,Under investigation,
2,2019-12,,Under investigation,
3,2019-12,,Under investigation,
4,2019-12,,Under investigation,
...,...,...,...,...
2368937,2017-01,,Status update unavailable,
2368938,2017-01,,Status update unavailable,
2368939,2017-01,,Status update unavailable,
2368940,2017-01,,Status update unavailable,


In [54]:
# check all the records with non-empty Latest_Outcome_type. Compare outcome types
# street_df[['Crime_month','Outcome_month','Last_outcome_category', 'Latest_Outcome_type']][street_df.Latest_Outcome_type.isnull() == False]

# Fill Crime status data based on the merged outcome data. Crime id which are present in the outcome dataset are the closed cases.
street_df.loc[street_df.Latest_Outcome_type.isnull() == False, 'Crime_status'] = "Close"
street_df.loc[street_df.Latest_Outcome_type.isnull() == True, 'Crime_status'] = "Open"
street_df.Crime_status.value_counts()

Open     1391310
Close     977637
Name: Crime_status, dtype: int64

In [104]:
# Check dataset before changes
street_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2368947 entries, 0 to 2368946
Data columns (total 16 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   Crime_ID               object 
 1   Crime_month            object 
 2   Falls_within           object 
 3   Longitude              float64
 4   Latitude               float64
 5   Location               object 
 6   LSOA_code              object 
 7   LSOA_name              object 
 8   Crime_type             object 
 9   Last_outcome_category  object 
 10  Crime_status           object 
 11  Borough_name           object 
 12  Year                   object 
 13  Month                  object 
 14  Outcome_month          object 
 15  Latest_Outcome_type    object 
dtypes: float64(2), object(14)
memory usage: 307.3+ MB


In [55]:
# Replace all nulls inside Latest_Outcome_type with the Last_outcome_category
street_df.Latest_Outcome_type.fillna(street_df['Last_outcome_category'], inplace=True)

# Rename Latest_Outcome_type to Crime_outcome
street_df.rename(columns = {'Latest_Outcome_type':'Crime_outcome'}, inplace = True)

# Now remove Last_outcome_category feature as it is duplicate entry for crime outcome which gives inconsistent result
street_df = street_df.drop(['Last_outcome_category'], axis=1)

In [25]:
# Check dataset if changes happened or not
street_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2368939 entries, 0 to 2368938
Data columns (total 15 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Crime_ID       object 
 1   Crime_month    object 
 2   Falls_within   object 
 3   Longitude      float64
 4   Latitude       float64
 5   Location       object 
 6   LSOA_code      object 
 7   LSOA_name      object 
 8   Crime_type     object 
 9   Context        float64
 10  Crime_status   object 
 11  Borough_name   object 
 12  Crime_year     object 
 13  Outcome_month  object 
 14  Crime_outcome  object 
dtypes: float64(3), object(12)
memory usage: 289.2+ MB


## Create and update new column called Justice_duration

In [230]:
# Rename Latest_Outcome_type to Crime_outcome
street_df.rename(columns = {'Outcome_month':'Month_from_out'}, inplace = True)

number_of_null_months = street_df.Month_from_out [street_df.Month_from_out.isnull() == True].size
#  Split  Month column to Year and Month only if their is no null value present inside Month
if(number_of_null_months == 0):   
    # create Year column inside street dataframe and populate year data from the Month field
    street_df['Outcome_year'] = street_df['Month_from_out'].str[: 4]
    
    # update Month column with just a Month number
    street_df['Outcome_month'] = street_df['Month_from_out'].str[-2: ]
    
street_df.Month_from_out.value_counts()

-1         1391310
2018-10      60139
2019-01      45500
2017-07      41330
2018-07      40504
2017-08      38444
2017-05      38439
2018-08      38126
2019-02      37729
2017-06      37578
2017-09      37337
2017-10      37279
2018-11      36466
2018-01      35668
2017-03      34216
2017-04      33946
2019-03      32610
2017-11      31877
2018-04      31091
2017-12      29574
2018-03      28724
2018-12      28116
2018-02      27972
2018-09      25677
2018-05      24944
2017-02      22374
2017-01      14983
2018-06      14638
2019-05       9763
2019-04       8484
2019-07       8284
2019-06       7966
2019-10       7907
2019-08       7770
2019-11       7679
2019-09       7530
2019-12       6973
Name: Month_from_out, dtype: int64

In [232]:
import numpy as np

# Create new empty column named 'Outcome_duration' to store duration when crime gets the outcome
street_df['Outcome_duration'] = street_df.apply(lambda _: '', axis=1)

# Update 'Outcome_duration' with the appropriate values 
street_df['Month'] = street_df['Month'].str[: 4]+street_df['Month'].str[-2: ]
street_df['Month_from_out'] = street_df['Month_from_out'].str[: 4]+street_df['Month_from_out'].str[-2: ]
street_df['Month'] = pd.to_datetime(street_df['Month'], format = '%Y%m') - pd.offsets.MonthBegin()
street_df['Month_from_out'] = pd.to_datetime(street_df['Month_from_out'], format='%Y%m')
street_df.loc[street_df.Crime_status == "Close", 'Outcome_duration'] = round((street_df['Month_from_out'] - street_df['Month'])/np.timedelta64(1, 'M'), 0)
street_df.loc[street_df.Crime_status == "Open", 'Outcome_duration'] = -1

## Drop 'Outcome_month' and 'Crime_month' feature

In [56]:
# Now remove 'Month'
# street_df = street_df.drop(['Month_from_out'], axis=1)
# street_df = street_df.drop(['Month'], axis=1)

street_df = street_df.drop(['Outcome_month'], axis=1)



In [57]:
# street_df.Outcome_duration.value_counts()
street_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2368947 entries, 0 to 2368946
Data columns (total 13 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Crime_ID       object 
 1   Crime_month    object 
 2   Falls_within   object 
 3   Longitude      float64
 4   Latitude       float64
 5   Location       object 
 6   LSOA_code      object 
 7   LSOA_name      object 
 8   Crime_type     object 
 9   Crime_status   object 
 10  Borough_name   object 
 11  Crime_year     object 
 12  Crime_outcome  object 
dtypes: float64(2), object(11)
memory usage: 253.0+ MB


In [58]:
# Test if there is no record is there with null value
street_df.isna().sum()

# street_df.Outcome_month.value_counts()

Crime_ID         0
Crime_month      0
Falls_within     0
Longitude        0
Latitude         0
Location         0
LSOA_code        0
LSOA_name        0
Crime_type       0
Crime_status     0
Borough_name     0
Crime_year       0
Crime_outcome    0
dtype: int64

In [236]:
# replace null values with -1 for 'Outcome_year' 
street_df.Outcome_year.fillna('-1', inplace=True)

# replace null values with -1 for 'Outcome_month' 
street_df.Outcome_month.fillna('-1', inplace=True)  

In [237]:
# Test if there is no record is there with null value
street_df.isna().sum()

Crime_ID            0
Falls_within        0
Longitude           0
Latitude            0
Location            0
LSOA_code           0
LSOA_name           0
Crime_type          0
Crime_status        0
Borough_name        0
Crime_year          0
Crime_month         0
Crime_outcome       0
Outcome_year        0
Outcome_month       0
Outcome_duration    0
dtype: int64

In [65]:
street_df.Crime_month.value_counts()

201903    73572
201910    71541
201905    70383
201906    69735
201811    69703
201810    69388
201807    69301
201907    69256
201911    68773
201707    68641
201710    68234
201904    67977
201901    67773
201711    67536
201812    66729
201705    66427
201703    66045
201805    66002
201912    65918
201902    65877
201708    65724
201706    65423
201806    65393
201801    64679
201808    64528
201809    64271
201908    63891
201709    63812
201909    63324
201803    62816
201804    62269
201712    61936
201704    58845
201802    58613
201701    58139
201702    56473
Name: Crime_month, dtype: int64

## Save the clean file now to as a staging csv for street, outcome and boroughs

In [66]:
street_df.to_csv(OUTPUT_PATH+STREET_FILENAME, sep=',', encoding='utf-8', index=None, header = True)
print('File', STREET_FILENAME, 'saved now to location',OUTPUT_PATH,'!')

File street-london.csv saved now to location C://SHU/ADMP/Assessment_02/github-repos/admp-csvs/stage/ !


In [156]:
# import numpy as np
# temp_df = pd.DataFrame()
# temp_df['Crime_month'] = street_df['Crime_month'].str[: 4]+street_df['Crime_month'].str[-2: ]
# temp_df['Outcome_month'] = street_df['Outcome_month'].str[: 4]+street_df['Outcome_month'].str[-2: ]
# temp_df['Crime_month'] = pd.to_datetime(temp_df['Crime_month'], format = '%Y%m') - pd.offsets.MonthBegin()
# temp_df['Outcome_month'] = pd.to_datetime(temp_df['Outcome_month'], format='%Y%m')
# temp_df['Processed_time'] = round((temp_df['Outcome_month'] - temp_df['Crime_month'])/np.timedelta64(1, 'M'), 0)
# # temp_df['Processed_time'] = street_df[street_df.Crime_status]
# temp_df[['Processed_time']]

In [157]:
# temp_df[['Processed_time']][temp_df.Processed_time.isnull() == False].sort_values(by='Processed_time', ascending=False)

In [72]:
temp_df.Processed_time.value_counts()

1.0     287109
2.0     239055
3.0     130299
4.0      62278
1.9      58582
         ...  
34.0         3
31.0         3
30.9         3
31.9         2
36.0         1
Name: Processed_time, Length: 64, dtype: int64

In [34]:
street_df['Crime_month'].str[: 4]+street_df['Crime_month'].str[-2: ]

0          201912
1          201912
2          201912
3          201912
4          201912
            ...  
2368942    201701
2368943    201701
2368944    201701
2368945    201701
2368946    201701
Name: Crime_month, Length: 2368947, dtype: object

In [160]:
# temp_df.Crime_month.isna().sum() 
# temp_df.Outcome_month.isna().sum() 

### Replace all empty/null 'Latest_outcome_type' with the 'Last_outcome_category' values

In [16]:
# replace null values with 'Last_outcome_category' text
street_df.Latest_Outcome_type.fillna(street_df['Last_outcome_category'], inplace=True)

### Remove "Last_outcome_category" and retain the "Latest outcome category" only

In [17]:
# Drop Last_outcome_category
street_df = street_df.drop(['Last_outcome_category'], axis=1)

### Rename the "Crime_ID" to "Crime_hashcode"

In [10]:
street_df.rename(columns = {'Crime_ID':'Crime_hashcode'}, inplace = True)

## Save processed dataframe to csv file in the output path location

In [19]:
street_df.to_csv(OUTPUT_PATH+"street-staging-data.csv", sep=',', encoding='utf-8', index=None, header = True)

# TEST OPERATIONS
***

In [53]:
test_crime_id1 = '077db29e732f8fbfe793724d63483d23bef1990d3059ed42ed17d8cadcdc8eed'
test_crime_id2 = '459a973a05867f80e03e312219add2c2d68b13c962b6b0aff6a788114d046c9e'
test_crime_id3 = '584697dec6313c3f687d97c2714f80a0ab2454119816ec95fbc5d92366985aa7'
test_crime_id4 = 'e22dfff770cd32f54941621e0b4a1000168dfa2fbfbd6afc42ac1304a18535ce'
test_crime_id5 = '7a99fcf8b97b8db7f7cee3661dc335bd2d1a41643adc5e91405e2b7233e06d85'

sub_df = pd.DataFrame()
sub_df = street_df.sort_values(by='Month', ascending=False)
sub_df(sub_df.Crime_ID == test_crime_id1)
# sub_df = street_df.drop_duplicates(subset='Crime_ID', keep='last')

TypeError: 'DataFrame' object is not callable