# Outcome dataset pre-processing before loading into the data warehouse
***

## Import files 

In [3]:
# Import all the libraries required for the project
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os

## Globals

In [4]:
# Directories holding the input CSV files and receiving the processed CSV files.
# OUTPUT_PATH is concatenated directly with a file name when saving, so it must
# keep its trailing slash.
INPUT_PATH = "C://SHU/ADMP/Assessment_02/londoncrimedw_project/input_csv/"
OUTPUT_PATH = "C://SHU/ADMP/Assessment_02/londoncrimedw_project/output_csv/"

# File names of the source datasets; OUTCOME_FILENAME is read relative to
# INPUT_PATH after the working directory has been switched to it.
OUTCOME_FILENAME = "outcome_london.csv"
LONDON_BOROUGH_FILENAME = "london_boroughs.csv"

## Functions

In [5]:
class FILESIZE:
    """Size category of a CSV file, used to pick how read_csv_file loads it."""

    # SMALL: read the file in a single call; LARGE: read the file in chunks
    SMALL, LARGE = 1, 2
    
def read_csv_file(fileType, fileName, chunksize=1000000):
    """Load a CSV file into a single pandas DataFrame.

    Parameters
    ----------
    fileType : int
        One of the FILESIZE constants. FILESIZE.SMALL reads the file with a
        single pandas call; any other value reads it chunk by chunk.
    fileName : str
        Path of the CSV file, passed unchanged to pandas.read_csv.
    chunksize : int, optional
        Number of rows per chunk when reading chunk by chunk. Defaults to
        1,000,000 rows. Not used for FILESIZE.SMALL.

    Returns
    -------
    pandas.DataFrame
        A data frame holding every row of the file.
    """
    if fileType == FILESIZE.SMALL:
        return pd.read_csv(fileName)

    # With chunksize set, read_csv returns an iterator that yields one
    # DataFrame per chunk of `chunksize` rows
    chunk_reader = pd.read_csv(fileName, chunksize=chunksize)

    # concat consumes the iterator and joins all chunks into one DataFrame
    return pd.concat(chunk_reader)

## Read CSV file

In [6]:
# Switch the working directory to the input folder so that the bare file name
# below resolves inside it
os.chdir(INPUT_PATH)

# Read the outcome CSV into a data frame, treating it as a large file
outcome_df = read_csv_file(fileName=OUTCOME_FILENAME, fileType=FILESIZE.LARGE)

## Do sorting on month basis
Sorting the outcome dataset by month is required to bring the latest outcome for each crime ID to the top, with its earlier outcomes beneath it. This is an important step because only the latest record for each crime ID is retained; all previous (historical) outcome updates are deleted.

In [7]:
# Sort by Month, newest first, so the most recent entry of each crime id comes
# first. A stable algorithm (mergesort) is requested explicitly: the default
# quicksort gives no ordering guarantee for rows that share the same Month,
# whereas a stable sort keeps such rows in their existing relative order.
outcome_df = outcome_df.sort_values(by='Month', ascending=False, kind='mergesort')

# Spot check: show the two top-most rows of one known crime id. Boolean
# filtering keeps the sorted row order, so no further sorting is needed here.
outcome_df[outcome_df.Crime_ID == ('3b60aed0ce6c29f63a00e44822492dcdc419b68a0974e53e6884359dc2aec1aa')].head(2)

Unnamed: 0,Crime_ID,Month,Reported_by,Falls_within,Longitude,Latitude,Location,LSOA_code,LSOA_name,Outcome_type
645183,3b60aed0ce6c29f63a00e44822492dcdc419b68a0974e5...,2018-03,Metropolitan Police Service,Metropolitan Police Service,-0.030748,51.367789,On or near SHRUBLANDS AVENUE,E01001128,Croydon 025B,Defendant found not guilty
648225,3b60aed0ce6c29f63a00e44822492dcdc419b68a0974e5...,2018-03,Metropolitan Police Service,Metropolitan Police Service,-0.030748,51.367789,On or near SHRUBLANDS AVENUE,E01001128,Croydon 025B,Offender given community sentence


## Remove duplicate Crime IDs so that only the most recent outcome update remains in the dataset

In [8]:
# For every Crime_ID keep the first row encountered and discard the repeats
outcome_df = outcome_df[~outcome_df.duplicated(subset='Crime_ID', keep='first')]

## Rename 'Outcome_type' column to 'Latest_Outcome_type' for better readability

In [9]:
# Give the Outcome_type column its new name, Latest_Outcome_type
outcome_df = outcome_df.rename(columns={'Outcome_type': 'Latest_Outcome_type'})

## Extract just the required columns from the outcome dataset

In [11]:
# Only the crime id and its latest outcome are needed; drop every other column
outcome_df = outcome_df.loc[:, ['Crime_ID', 'Latest_Outcome_type']]

## Save processed dataframe to csv file in the output path location

In [12]:
# Write the processed data frame to the output folder as a comma-separated
# UTF-8 file with a header row and without the index column
outcome_df.to_csv(
    OUTPUT_PATH + "outcome-staging-data.csv",
    index=False,
    header=True,
    sep=',',
    encoding='utf-8',
)