# Street dataset pre-process before sending to Datawarehouse
***

## Import files 

In [31]:
# import all the libraries that require for project
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os

## GLOBALS

In [32]:
# define paths
INPUT_PATH = "C://SHU/ADMP/Assessment_02/londoncrimedw_project/input_csv/"
OUTPUT_PATH = "C://SHU/ADMP/Assessment_02/londoncrimedw_project/output_csv/"

# create variables to store the datasets name you want to use
STREET_FILENAME = "street_london.csv"
LONDON_BOROUGH_FILENAME = "london_boroughs.csv"

## Functions 

In [33]:
class FILESIZE:
    SMALL = 1
    LARGE = 2
    
def read_csv_file(fileType, fileName):
    if (fileType == FILESIZE.SMALL):
        df_ret = pd.read_csv(fileName)
    else:
        # read the large csv file with specified chunksize of 10 million records
        df_chunk = pd.read_csv(fileName, chunksize=1000000)

        # append each chunk df here
        chunk_list = []   

        # each chunk is in df format
        for chunk in df_chunk:    
            # once the data filtering is done, append the chunk to list
            chunk_list.append(chunk)

        # concat the list into dataframe 
        df_ret = pd.concat(chunk_list)
        
    return df_ret

## Read CSV file

In [34]:
# set a working directory to the location where input dataset csv file exist
os.chdir(INPUT_PATH)

# load dataframe with street data from CSV file 
street_df = read_csv_file(FILESIZE.LARGE, STREET_FILENAME)

# store london borough data into dataframe
# london_borough_df = read_csv_file(FILESIZE.SMALL, LONDON_BOROUGH_FILENAME)

FileNotFoundError: [Errno 2] File london_boroughs.csv does not exist: 'london_boroughs.csv'

## Handle NULL/Missing Values

In [36]:
# 01. Crime_ID: Remove null records from the street data frame.
street_df = street_df[street_df.Crime_ID.isnull() == False]

# 02. Longitude, Latitude: Fill missing values with 0
street_df.Longitude.fillna(0, inplace=True)
street_df.Latitude.fillna(0, inplace=True)

# 03. LSOA_code, LSOA_name: Fill missing values with 'Not Available' text
street_df.LSOA_code.fillna('Not Available', inplace=True)
street_df.LSOA_name.fillna('Not Available', inplace=True)

# 04. Last_outcome_category: Fill missing values with 'Not Available' text
street_df.Last_outcome_category.fillna('Not Available', inplace=True)

# 05. Context: Drop it as all fields are empty here. Hence, no significance
street_df = street_df.drop(['Context'], axis=1)

## Delete columns which are of not much significance
'Reported_by' column is not useful and hence drop it

In [37]:
# drop Reported_by column from the dataset 
street_df = street_df.drop(['Reported_by'], axis=1)

## Derive new column for Borough_name from LSOA_name

In [38]:
# remove last 5 characters from the LSOA_name (4 code and 1 space)
street_df['Borough_name'] = street_df.LSOA_name[(street_df.LSOA_name.isnull()==False)].str[:-5]

## Remove duplication for Crime_ID feature

In [39]:
# Remove all the duplicate records except first instance of it
street_df = street_df.drop_duplicates(subset='Crime_ID', keep='first')

## Split columns from one to two

Split Month column to two columns named, Year and Month.

In [40]:
# Check if the month column have any null value 
number_of_null_months = street_df.Month[street_df.Month.isnull() == True].size

#  Split  Month column to Year and Month only if their is no null value present inside Month
if(number_of_null_months == 0):
    # create Year column inside street dataframe and populate year data from the Month field
    street_df['Year'] = street_df['Month'].str[: 4]
    
    # update Month column with just a Month number
    street_df['Month'] = street_df['Month'].str[-2: ]
    
street_df.head(10)

Unnamed: 0,Crime_ID,Month,Falls_within,Longitude,Latitude,Location,LSOA_code,LSOA_name,Crime_type,Last_outcome_category,Borough_name,Year
0,04f04430a130fa6aa5a5c6ef1021a942c16cca642529dc...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
1,e1bc0b282acfa132e77dadc63efed5a0d22ea969f39e72...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
2,c55afa311916b68a3a3b8ac25ba37d4fcbfbb8ceb7a315...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
3,95cb59cbe07660643794f3286e3e177dd1c99e907aae55...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
4,03f070e00ef7b4e299c95a5f4e2c956587b2748e3380a3...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
5,daabf74598d41568679de0222cd9c9b3a8b2550b83b18a...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
6,4a130ebf01558600886bac2a26d01e76239e45d5b85c29...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
7,d83a8f832de935c83562db8f662830f5d639da7630e23a...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
8,ca109136525fb40a61c7a5d4398ac343baad02fa7c9c6d...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019
9,0bf227c136793c7cfcd55eb4f2eedd49679d4f99027a1e...,12,City of London Police,0.0,0.0,No Location,Not Available,Not Available,Violence and sexual offences,Under investigation,Not Avai,2019


## Save processed dataframe to csv file in the output path location

In [42]:
street_df.to_csv(OUTPUT_PATH+"street-staging-data.csv", sep=',', encoding='utf-8', index=None, header = True)