# Cleaning Sales Data

### 1-1. Data cleaning and preparation

The cleaning code had to be adjusted so that it works for both the 2017_2018 files and the 2019_2021 files, as there were minor differences between them (for example, some column names).

In [1]:
import glob

import pandas as pd
import numpy as np 
import seaborn as sns
import re
%matplotlib inline

The approach for extracting the numbers from a file name is adapted from https://stackoverflow.com/questions/14008440/how-to-extract-numbers-from-filename-in-python

In [3]:
def salesCleanFn(dataFolderString, osString):
    """Load, combine and clean every sales workbook found in one data folder.

    Each ``*.xlsx`` file under ``Files/<dataFolderString>`` is read from its
    "Postcode" sheet. The header row is located by trying rows 0-7 until the
    first column label is 'Postcode'. Every frame gets four extra columns
    derived from the file path: ``key``, ``year``, ``quarter`` and
    ``time_period``. The frames are concatenated, columns are renamed, the
    quartile price columns are dropped and missing values are imputed.

    Parameters
    ----------
    dataFolderString : str
        Folder below ``Files`` that holds the workbooks, with or without a
        trailing path separator.
    osString : str
        'windows' joins paths with a backslash; any other value uses '/'.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(masterDF, total, strata, nstrata)``: the cleaned combined frame and
        the rows whose ``dwelling_type`` is 'Total', 'Strata' and
        'Non Strata' respectively.

    Raises
    ------
    ValueError
        If no workbook with a usable "Postcode" sheet is found, or if a file
        path contains none of the month abbreviations mar/jun/sep/dec.
    IndexError
        If a file path contains fewer than four runs of digits.
    """
    #__________________________________________________________
    # Build the glob pattern with the separator that matches the OS flag.
    separator = '\\' if osString == 'windows' else '/'
    dataDir = "Files" + separator
    folder = dataFolderString
    if not folder.endswith(separator):
        # the folder string has no trailing separator, so add one
        folder += separator
    searchPattern = dataDir + folder + '*.xlsx'
    fileNames = glob.glob(searchPattern)

    #__________________________________________________________
    # Looping through the fileNames to read the excel sheets.
    numberPattern = re.compile(r'\d+')  # finds every run of digits in a string
    # Month abbreviation (lower-case or capitalised) found anywhere in the
    # path -> quarter label. The first match wins, in this order.
    quarterTokens = (('mar', 'Mar', 'Q1'),
                     ('jun', 'Jun', 'Q2'),
                     ('sep', 'Sep', 'Q3'),
                     ('dec', 'Dec', 'Q4'))
    # Some files use the long column names; map them to the short ones.
    newColumns = {'Quarterly change in Median Sales Price': 'Qtly change in Median',
                  'Annual change in Median Sales Price': 'Annual change in Median',
                  'Quarterly change in Count': 'Qtly change in Count'}

    frames = []
    for fileString in fileNames:
        for j in range(0, 8):
            df = pd.read_excel(fileString, sheet_name="Postcode", na_values='-', header=j)

            if len(df.columns) == 0 or df.columns[0] != 'Postcode':
                continue  # wrong 'header' value, try the next row

            # _________________________________________________________________
            # adding additional columns
            # Only works if the file path format is consistent: the third and
            # fourth runs of digits in the path are used as key and year.
            # NOTE(review): with a folder such as 'Sales/2017_2018' the first
            # two runs come from the folder name - confirm for new folders.
            fileNumbers = numberPattern.findall(fileString)
            df['key'] = 's' + fileNumbers[2]  # fileNumbers entries are strings
            year = fileNumbers[3]

            quarter = None
            for lowerToken, capitalToken, label in quarterTokens:
                if lowerToken in fileString or capitalToken in fileString:
                    quarter = label
                    break
            if quarter is None:
                # Previously this either raised NameError or silently reused
                # the quarter of the previous file.
                raise ValueError(
                    "Cannot determine the quarter: no month abbreviation "
                    "(mar/jun/sep/dec) in file path " + repr(fileString))

            df['time_period'] = year + ' ' + quarter
            df['year'] = year
            df['quarter'] = quarter

            # Keys missing from the frame are ignored by rename, so this is
            # safe for files that already use the short names.
            df = df.rename(columns=newColumns)

            frames.append(df)
            break  # header found; no need to re-read this workbook

    if not frames:
        raise ValueError(
            "No workbook with a 'Postcode' header row was found for pattern "
            + repr(searchPattern))

    # _____________________________________________________________________________________
    # putting all the DFs (frames) together to get a master DF
    masterDF = pd.concat(frames)

    # General cleaning: rename the columns for easier referencing
    rename_cols = {'Postcode': 'postcode',
                   'Dwelling Type': 'dwelling_type',
                   "First Quartile Sales Price\n$'000s": '25%_price',
                   "Median Sales Price\n$'000s": 'median_price',
                   "Third Quartile Sales Price\n'000s": '75%_price',
                   "Mean Sales Price\n$'000s": 'mean_price',
                   'Sales\nNo.': 'sales_no',
                   'Qtly change in Median': 'Qdelta_median',
                   'Annual change in Median': 'Adelta_median',
                   'Qtly change in Count': 'Qdelta_count',
                   'Annual change in Count': 'Adelta_count'}
    masterDF = masterDF.rename(columns=rename_cols)

    # dropping unwanted columns
    masterDF = masterDF.drop(columns=['25%_price', '75%_price'])

    # Imputing NaN sales counts ('-' cells are read as NaN). 5 is the median
    # of 0 and 10, the range for null values in the dataset.
    masterDF.loc[masterDF['sales_no'].isnull(), 'sales_no'] = 5.0

    # Fixing the NaN values in the median and mean price columns: within each
    # (key, dwelling type) group a missing price is replaced by the median of
    # the non-missing values of that same column in that group.
    for k in masterDF['key'].unique():
        inKey = masterDF['key'] == k
        for dwelling in ('Total', 'Strata', 'Non Strata'):
            inGroup = inKey & (masterDF['dwelling_type'] == dwelling)
            for priceCol in ('median_price', 'mean_price'):
                missing = masterDF[priceCol].isnull()
                fillValue = masterDF.loc[inGroup & ~missing, priceCol].median()
                masterDF.loc[inGroup & missing, priceCol] = fillValue

    # Replace 's' with the median of 10 and 30 since there are quite a few,
    # then cast the column to float.
    masterDF.loc[masterDF['sales_no'] == 's', 'sales_no'] = 20.0
    masterDF['sales_no'] = masterDF['sales_no'].astype(float)

    # Separate dwelling types
    total = masterDF.loc[masterDF['dwelling_type'] == 'Total']
    strata = masterDF.loc[masterDF['dwelling_type'] == 'Strata']
    nstrata = masterDF.loc[masterDF['dwelling_type'] == 'Non Strata']

    return masterDF, total, strata, nstrata

In [4]:
# Clean the 2017-2018 sales workbooks; unpack the combined frame and the
# 'Total', 'Strata' and 'Non Strata' subsets returned by salesCleanFn.
(salesOld,
 salesOld_total,
 salesOld_strata,
 salesOld_nStrata) = salesCleanFn(dataFolderString='Sales/2017_2018', osString='windows')

In [5]:
# Write the combined 2017-2018 sales frame as CSV (the file name has no extension).
salesOld.to_csv(path_or_buf='Files/sales_2017_2018')

In [6]:
# Write the 'Total' dwelling-type rows for 2017-2018 as CSV.
salesOld_total.to_csv(path_or_buf='Files/salesTotal_2017_2018')

In [7]:
# Write the 'Strata' dwelling-type rows for 2017-2018 as CSV.
salesOld_strata.to_csv(path_or_buf='Files/salesStrata_2017_2018')

In [8]:
# Write the 'Non Strata' dwelling-type rows for 2017-2018 as CSV.
salesOld_nStrata.to_csv(path_or_buf='Files/salesNstrata_2017_2018')

In [9]:
# Clean the 2019-2021 sales workbooks; unpack the combined frame and the
# 'Total', 'Strata' and 'Non Strata' subsets returned by salesCleanFn.
(salesNew,
 salesNew_total,
 salesNew_strata,
 salesNew_nStrata) = salesCleanFn(dataFolderString='Sales/2019_2021', osString='windows')

In [10]:
# Write the combined 2019-2021 sales frame as CSV (the file name has no extension).
salesNew.to_csv(path_or_buf='Files/sales_2019_2021')

In [11]:
# Write the 'Total' dwelling-type rows for 2019-2021 as CSV.
salesNew_total.to_csv(path_or_buf='Files/salesTotal_2019_2021')

In [12]:
# Write the 'Strata' dwelling-type rows for 2019-2021 as CSV.
salesNew_strata.to_csv(path_or_buf='Files/salesStrata_2019_2021')

In [13]:
# Write the 'Non Strata' dwelling-type rows for 2019-2021 as CSV.
salesNew_nStrata.to_csv(path_or_buf='Files/salesNstrata_2019_2021')

# Cleaning Rent Data

### Cleaning function

In [19]:
def rentCleanFn(dataFolderString, osString):
    """Load, combine and clean every rent workbook found in one data folder.

    Each ``*.xlsx`` file under ``Files/<dataFolderString>`` is read from its
    "Postcode" sheet. The header row is located by trying rows 0-7 until the
    first column label is 'Postcode'. The frames are concatenated, columns are
    renamed, and only the rows where both the bedroom number and the dwelling
    type are 'Total' are kept.

    Parameters
    ----------
    dataFolderString : str
        Folder below ``Files`` that holds the workbooks, with or without a
        trailing path separator.
    osString : str
        'windows' joins paths with a backslash; any other value uses '/'.

    Returns
    -------
    pandas.DataFrame
        The aggregated rows indexed by ``postcode``, with the bond counts cast
        to float and missing values imputed.

    Raises
    ------
    ValueError
        If no workbook with a usable "Postcode" sheet is found.
    IndexError
        If a file path contains fewer than three runs of digits.
    """
    #__________________________________________________________
    # Build the glob pattern with the separator that matches the OS flag.
    separator = '\\' if osString == 'windows' else '/'
    dataDir = "Files" + separator
    folder = dataFolderString
    if not folder.endswith(separator):
        # the folder string has no trailing separator, so add one
        folder += separator
    searchPattern = dataDir + folder + '*.xlsx'
    fileNames = glob.glob(searchPattern)

    #__________________________________________________________
    # Looping through the fileNames to read the excel sheets.
    numberPattern = re.compile(r'\d+')  # finds every run of digits in a string
    # some of the columns in the files are not the same, so we fix them here
    newColumns = {'Bedroom Numbers': 'Number of Bedrooms'}

    frames = []
    for fileString in fileNames:
        for j in range(0, 8):
            df = pd.read_excel(fileString, sheet_name="Postcode", na_values='-', header=j)

            if len(df.columns) == 0 or df.columns[0] != 'Postcode':
                continue  # wrong 'header' value, try the next row

            # adding additional columns: the third run of digits in the file
            # path becomes the key.
            # NOTE(review): with a folder such as 'Rent/2019_2021' the first
            # two runs come from the folder name - confirm for new folders.
            fileNumbers = numberPattern.findall(fileString)
            df['key'] = 'r' + fileNumbers[2]  # fileNumbers entries are strings

            df = df.rename(columns=newColumns)

            frames.append(df)  # putting the DF into a list, frames
            break  # header found; no need to re-read this workbook

    if not frames:
        raise ValueError(
            "No workbook with a 'Postcode' header row was found for pattern "
            + repr(searchPattern))

    masterDF = pd.concat(frames)

    # Dropping this column as we've confirmed there's an issue with the raw file.
    if 'Unnamed: 10' in masterDF.columns:
        masterDF = masterDF.drop(columns='Unnamed: 10')

    # Drop unwanted columns
    masterDF = masterDF.drop(columns=['First Quartile Weekly Rent for New Bonds\n$',
                                      'Third Quartile Weekly Rent for New Bonds\n$'])

    # Rename columns
    rename_cols = {'Postcode': 'postcode',
                   'Dwelling Types': 'dwelling_type',
                   'Number of Bedrooms': 'bed_number',
                   'Median Weekly Rent for New Bonds\n$': 'median_rent_newb',
                   'New Bonds Lodged\nNo.': 'new_bonds_no',
                   'Total Bonds Held\nNo.': 'total_bonds_no',
                   'Quarterly change in Median Weekly Rent': 'Qdelta_median_rent',
                   'Annual change in Median Weekly Rent': 'Adelta_median_rent',
                   'Quarterly change in New Bonds Lodged': 'Qdelta_new_bonds',
                   'Annual change in New Bonds Lodged': 'Adelta_new_bonds'}
    masterDF = masterDF.rename(columns=rename_cols)

    # Keep only the rows that aggregate over all bedroom numbers and all
    # dwelling types; the two filter columns are then constant and dropped.
    isAggregate = (masterDF['bed_number'] == 'Total') & (masterDF['dwelling_type'] == 'Total')
    masterDF_ag = masterDF.loc[isAggregate].drop(columns=['bed_number', 'dwelling_type'])

    for countCol in ('new_bonds_no', 'total_bonds_no'):
        # Impute 's' with 20 and NaN with 5, then cast as float (was object)
        masterDF_ag.loc[masterDF_ag[countCol] == 's', countCol] = 20.0
        masterDF_ag.loc[masterDF_ag[countCol].isnull(), countCol] = 5.0
        masterDF_ag[countCol] = masterDF_ag[countCol].astype(float)

    # Impute NaN in 'median_rent_newb' with the median of the column. Assign
    # the result back instead of calling fillna(inplace=True) on the selected
    # column: the chained in-place form does not update the frame when pandas
    # Copy-on-Write is active.
    rentMedian = masterDF_ag['median_rent_newb'].median()
    masterDF_ag['median_rent_newb'] = masterDF_ag['median_rent_newb'].fillna(rentMedian)

    # Set postcode as index
    return masterDF_ag.set_index('postcode')

In [20]:
# Clean the 2019-2021 rent workbooks into one postcode-indexed frame.
rentNew = rentCleanFn(dataFolderString='Rent/2019_2021', osString='windows')


time taken to run the function: 158.8437283039093


In [21]:
# Clean the 2017-2018 rent workbooks into one postcode-indexed frame.
rentOld = rentCleanFn(dataFolderString='Rent/2017_2018', osString='windows')

102.35191702842712


In [22]:
# Write the cleaned 2017-2018 rent frame as CSV (the file name has no extension).
rentOld.to_csv(path_or_buf='Files/rent_2017_2018')

In [23]:
# Write the cleaned 2019-2021 rent frame as CSV (the file name has no extension).
rentNew.to_csv(path_or_buf='Files/rent_2019_2021')