#  Customer Address

# 1.0 Library

In [1]:
import pandas as pd

# 2.0 Load Data
The data is loaded from the Excel file 'Raw_data.xlsx' with the sheet name 'CustomerAddress'.

In [2]:
def main(file_path='Raw_data.xlsx', sheet_name='CustomerAddress'):
    """Load the customer-address sheet and return it as a DataFrame.

    Args:
        file_path: Path to the Excel workbook. Defaults to 'Raw_data.xlsx'.
        sheet_name: Name of the sheet to read. Defaults to 'CustomerAddress'.

    Returns:
        Whatever ``load_data`` returns for the given workbook and sheet.
    """
    # The defaults reproduce the previously hard-coded values, so a bare
    # main() call behaves exactly as before.
    return load_data(file_path, sheet_name)

In [3]:
def load_data(file_path, sheet_name):
    """Read one sheet of an Excel workbook with pandas.

    Args:
        file_path: Location of the workbook, passed to ``pd.read_excel``.
        sheet_name: Sheet selector, passed to ``pd.read_excel``.

    Returns:
        The object produced by ``pd.read_excel`` for that sheet.
    """
    sheet = pd.read_excel(file_path, sheet_name=sheet_name)
    return sheet

# 3.0 Initial Data Information
Displays basic information about the dataset, including:
■ Number of rows and columns
■ Data types of each column
■ Memory usage

In [4]:
# Load the raw address data, then print pandas' structural summary of it.
cust_address = main()
print("\nInitial Information of Customer Address Data:")
print("\n")
# DataFrame.info() writes the summary to stdout and returns None, so its
# result is deliberately not bound to a variable.
cust_address.info()


Initial Information of Customer Address Data:


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3999 entries, 0 to 3998
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   customer_id         3999 non-null   int64 
 1   address             3999 non-null   object
 2   postcode            3999 non-null   int64 
 3   state               3999 non-null   object
 4   country             3999 non-null   object
 5   property_valuation  3999 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 187.6+ KB


# 4.0 Clean State Column
Replaces full state names with abbreviations for consistency ("New South Wales" to "NSW", "Victoria" to "VIC").

In [5]:
def replace_state_names(state_name):
    """Map a full state name to its abbreviation.

    'New South Wales' becomes 'NSW' and 'Victoria' becomes 'VIC'. The
    comparison is an exact equality check; any other value is returned
    unchanged.

    Args:
        state_name: The value to normalise.

    Returns:
        The abbreviation for a recognised full name, otherwise ``state_name``.
    """
    full_to_short = (
        ('New South Wales', 'NSW'),
        ('Victoria', 'VIC'),
    )
    for full_name, short_code in full_to_short:
        if state_name == full_name:
            return short_code
    return state_name

In [6]:
def clean_state_column(df):
    """Abbreviate the values of the 'state' column.

    Every value in ``df['state']`` is passed through ``replace_state_names``
    and the column is overwritten with the results.

    Args:
        df: DataFrame with a 'state' column. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    abbreviated_states = df['state'].apply(replace_state_names)
    df['state'] = abbreviated_states
    return df

In [7]:
# Normalise the 'state' column; clean_state_column overwrites the column on the
# frame it receives and returns that same frame.
cust_address = clean_state_column(cust_address)

# 5.0 Missing Values Check
Check for missing values in the dataset.

In [8]:
def check_missing_values(df):
    """Count missing values per column.

    Args:
        df: DataFrame to inspect.

    Returns:
        A Series indexed by column name holding the number of null entries
        in each column.
    """
    null_mask = df.isna()
    return null_mask.sum()

In [9]:
# Count the null entries in each column of the address data and print the
# per-column totals under a heading.
missing_values_result = check_missing_values(cust_address)
print("Missing Values Check:")
print(missing_values_result)

Missing Values Check:
customer_id           0
address               0
postcode              0
state                 0
country               0
property_valuation    0
dtype: int64


# 6.0 Consistency Check
Placeholder for consistency checks; no checks are implemented yet.

In [10]:
def consistency_check(df):
    """Placeholder for dataset consistency checks.

    No checks are performed yet: the function only prints a fixed message
    and does not read ``df``.

    Args:
        df: DataFrame intended to be checked (currently unused).

    Returns:
        None.
    """
    # TODO: replace this message with real consistency checks.
    placeholder_message = "Consistency check result: Placeholder for implementation."
    print(placeholder_message)

In [11]:
# Print a heading, then run the consistency check on the address data
# (consistency_check currently only prints a placeholder message).
print("Consistency Check:")
consistency_check(cust_address)

Consistency Check:
Consistency check result: Placeholder for implementation.


# 7.0 Duplication Check
Check for duplicated rows based on the 'customer_id' column.

In [12]:
def duplication_check(df, primary_key='customer_id'):
    """Return every row whose key value occurs more than once.

    Args:
        df: DataFrame to inspect.
        primary_key: Column label(s) passed as ``subset`` to
            ``DataFrame.duplicated``. Defaults to 'customer_id'.

    Returns:
        A DataFrame containing all rows that share their key with at least
        one other row (``keep=False`` flags every occurrence, not just the
        repeats). Empty when the key is unique.
    """
    is_repeated = df.duplicated(subset=primary_key, keep=False)
    return df.loc[is_repeated]

In [13]:
# Collect every row whose customer_id (the default primary key) appears more
# than once, then print the result under a heading.
duplicated_rows_result = duplication_check(cust_address)
print("Duplication Check:")
print(duplicated_rows_result)

Duplication Check:
Empty DataFrame
Columns: [customer_id, address, postcode, state, country, property_valuation]
Index: []


# 8.0 Export Cleaned Data
Export the cleaned dataset to 'Cleaned_Customer_Address_Dataset.csv'.

In [14]:
def export_data(df, filename='output.csv'):
    """Write a DataFrame to CSV and report where it went.

    The index is not written. After the file is saved, a line of five spaces
    and a confirmation message are printed.

    Args:
        df: DataFrame to export.
        filename: Destination path for the CSV. Defaults to 'output.csv'.

    Returns:
        None.
    """
    df.to_csv(path_or_buf=filename, index=False)
    # One call, two lines: the spacer line first, then the confirmation.
    print("     ", f"Data exported to {filename}", sep="\n")

In [15]:
# Save the cleaned address data as a CSV file.
export_data(cust_address, filename='Cleaned_Customer_Address_Dataset.csv')

     
Data exported to Cleaned_Customer_Address_Dataset.csv
