# Customer Demographic Data Cleaning and Analysis

# 1.0 Import Libraries
■ Pandas for data manipulation

■ NumPy for numerical operations

■ datetime for date-related operations

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, date

# 2.0 Load Data

Loads customer demographic data from the Excel file 'Raw_data.xlsx', sheet 'CustomerDemographic'

In [2]:
def load_data(file_path, sheet_name):
    """Read one worksheet of an Excel workbook.

    Args:
        file_path: Location of the Excel workbook.
        sheet_name: Worksheet to read; forwarded to ``pd.read_excel``.

    Returns:
        The object produced by ``pd.read_excel`` for the requested sheet.
    """
    worksheet = pd.read_excel(io=file_path, sheet_name=sheet_name)
    return worksheet

In [3]:
# Workbook and worksheet that hold the customer demographic records.
file_path = 'Raw_data.xlsx'
sheet_name = 'CustomerDemographic'
# Read the worksheet into the DataFrame that the following cells operate on.
customer_data = load_data(file_path, sheet_name)

# 3.0 Initial Data Exploration

Displays basic information about the dataset, including:

■ Number of rows

■  Column names and data types

■  Number of missing values in each column

■  Summary statistics for numerical columns

In [4]:
def display_basic_info(dataframe):
    """Print a quick overview of the dataset.

    The overview consists of the first five rows, the ``DataFrame.info()``
    report (columns, non-null counts, dtypes) and the ``describe()`` summary
    statistics.

    Args:
        dataframe: The pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    print("Initial data overview:")
    print(dataframe.head(5))
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    dataframe.info()
    print(dataframe.describe())

In [5]:
# Print the head, info() report and describe() summary of the raw data.
display_basic_info(customer_data)

Initial data overview:
   customer_id      first_name  last_name  gender  \
0            1         Laraine  Medendorp       F   
1            2             Eli    Bockman    Male   
2            3           Arlin     Dearle    Male   
3            4          Talbot        NaN    Male   
4            5  Sheila-kathryn     Calton  Female   

   past_3_years_bike_related_purchases                  DOB  \
0                                   93  1953-10-12 00:00:00   
1                                   81  1980-12-16 00:00:00   
2                                   61  1954-01-20 00:00:00   
3                                   33  1961-10-03 00:00:00   
4                                   56  1977-05-13 00:00:00   

                job_title job_industry_category     wealth_segment  \
0     Executive Secretary                Health      Mass Customer   
1  Administrative Officer    Financial Services      Mass Customer   
2      Recruiting Manager              Property      Mass Customer   

# 4.0 Data Cleaning

■ Drops the irrelevant 'default' column

■ Fills missing values in 'last_name' with 'None'

■ Removes records with missing Date of Birth (DOB)

■ Calculates 'Age' based on DOB

■ Fills missing job-related information with 'Missing'

■ Handles inconsistencies in the 'gender' column (e.g., 'M' to 'Male', 'F' to 'Female')

In [6]:
# Drop Irrelevant Column
def drop_irrelevant_column(dataframe, column_name):
    """Remove a single column from ``dataframe`` in place, if it exists.

    A message reporting the outcome is printed in either case.

    Args:
        dataframe: The pandas DataFrame to modify.
        column_name: Label of the column to remove.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    # Guard clause: nothing to drop when the column is absent.
    if column_name not in dataframe.columns:
        print(f"Column '{column_name}' not found in the DataFrame.")
        return

    dataframe.drop(columns=[column_name], inplace=True)
    print(f"Column '{column_name}' dropped successfully.")

In [7]:
# Remove the 'default' column from customer_data (modified in place).
drop_irrelevant_column(customer_data, 'default')

Column 'default' dropped successfully.


In [8]:
#Missing Values Check
def missing_values_check(dataframe):
    """Print the count and the percentage of missing values per column.

    Args:
        dataframe: The pandas DataFrame to inspect.

    Returns:
        None. Both reports are written to standard output.
    """
    # Build the boolean null mask once and derive both reports from it.
    null_mask = dataframe.isnull()
    missing_counts = null_mask.sum()
    missing_percent = null_mask.mean() * 100

    print("Total number of missing values:")
    print(missing_counts)
    print("\nPercentage of missing values:")
    print(missing_percent)

In [9]:
# Report per-column missing-value counts and percentages for customer_data
missing_values_check(customer_data)

Total number of missing values:
customer_id                              0
first_name                               0
last_name                              125
gender                                   0
past_3_years_bike_related_purchases      0
DOB                                     87
job_title                              506
job_industry_category                  656
wealth_segment                           0
deceased_indicator                       0
owns_car                                 0
tenure                                  87
dtype: int64

Percentage of missing values:
customer_id                             0.000
first_name                              0.000
last_name                               3.125
gender                                  0.000
past_3_years_bike_related_purchases     0.000
DOB                                     2.175
job_title                              12.650
job_industry_category                  16.400
wealth_segment                          

In [10]:
# Fill Missing Last Name
def fill_missing_last_name(dataframe):
    """Replace missing ``last_name`` values with the string ``'None'``.

    Args:
        dataframe: A pandas DataFrame containing a ``last_name`` column.

    Returns:
        None. The ``last_name`` column of ``dataframe`` is updated.
    """
    # Assign the filled column back instead of calling
    # ``dataframe['last_name'].fillna(..., inplace=True)``: the in-place call
    # works on a temporary Series, which raises a FutureWarning on recent
    # pandas releases and leaves the frame untouched under Copy-on-Write.
    dataframe['last_name'] = dataframe['last_name'].fillna('None')

In [11]:
# Fill the gaps in customer_data['last_name'] with the placeholder 'None'.
fill_missing_last_name(customer_data)

In [12]:
# Remove Records with Missing Date of Birth
def remove_records_with_missing_dob(dataframe):
    """Drop, in place, every row whose ``DOB`` value is missing.

    Args:
        dataframe: A pandas DataFrame containing a ``DOB`` column.

    Returns:
        None. ``dataframe`` is modified in place.
    """
    required_columns = ['DOB']
    dataframe.dropna(axis=0, how='any', subset=required_columns, inplace=True)

In [13]:
# Drop the customer_data rows that have no DOB (modified in place).
remove_records_with_missing_dob(customer_data)

In [14]:
#  Calculate Age
def calculate_age(dataframe):
    """Create an ``Age`` column holding each customer's age in whole years.

    ``DOB`` is first converted to datetime (unparseable values become NaT)
    and written back to the frame. Age is the number of birthdays that have
    passed as of today's date, so leap years are accounted for; dividing the
    elapsed days by 365 would raise the age some days before the birthday.

    Args:
        dataframe: A pandas DataFrame containing a ``DOB`` column.

    Returns:
        None. ``dataframe`` gains/overwrites the ``Age`` column and its
        ``DOB`` column is converted to datetime. Rows whose ``DOB`` could not
        be parsed get a missing ``Age``.
    """
    today = pd.Timestamp(date.today())
    dob = pd.to_datetime(dataframe['DOB'], errors='coerce')
    dataframe['DOB'] = dob

    # True where this year's birthday is today or already behind us.
    birthday_passed = (dob.dt.month < today.month) | (
        (dob.dt.month == today.month) & (dob.dt.day <= today.day)
    )
    # Year difference, minus one when the birthday is still to come this year.
    # NaT rows propagate as NaN through the year subtraction.
    age_in_years = today.year - dob.dt.year - (~birthday_passed).astype(int)
    dataframe['Age'] = age_in_years

In [15]:
# Add the 'Age' column to customer_data, derived from 'DOB'.
calculate_age(customer_data)

In [16]:
# Fill Missing Job Information
def fill_missing_job_info(dataframe, column_name, fill_value='Missing'):
    """Replace missing values in one column with ``fill_value``.

    Args:
        dataframe: The pandas DataFrame to update.
        column_name: Label of the column whose missing values are filled.
        fill_value: Replacement for missing entries. Defaults to 'Missing'.

    Returns:
        None. The named column of ``dataframe`` is updated.
    """
    # Assign the result back rather than using ``inplace=True`` on the
    # selected column: that form operates on a temporary Series, triggers a
    # FutureWarning on recent pandas and is a no-op under Copy-on-Write.
    dataframe[column_name] = dataframe[column_name].fillna(fill_value)

In [17]:
# Fill missing job titles with the default placeholder 'Missing'.
fill_missing_job_info(customer_data, 'job_title')

In [18]:
# Fill missing industry categories with the default placeholder 'Missing'.
fill_missing_job_info(customer_data, 'job_industry_category')

In [19]:
# Handle Gender Inconsistencies
def handle_gender_inconsistencies(dataframe):
    """Normalise the spelling variants found in the ``gender`` column.

    'M' becomes 'Male'; 'F' and 'Femal' become 'Female'. Any other value
    (including already-canonical labels and codes not listed here) is kept
    unchanged.

    Args:
        dataframe: A pandas DataFrame containing a ``gender`` column.

    Returns:
        None. The ``gender`` column of ``dataframe`` is updated.
    """
    gender_mapping = {'M': 'Male', 'F': 'Female', 'Femal': 'Female'}
    # Use replace() rather than map(): map() turns every value that is not a
    # key of the dictionary into NaN, silently discarding unexpected labels.
    dataframe['gender'] = dataframe['gender'].replace(gender_mapping)

In [20]:
# Normalise gender labels such as 'M', 'F' and 'Femal' to 'Male' / 'Female'.
handle_gender_inconsistencies(customer_data)

# 5.0 Duplicate Checks:

■ Removes duplicate records, i.e. rows that are identical in every column other than 'customer_id'

■ Checks for inconsistencies in other categorical columns ('wealth_segment', 'deceased_indicator', 'owns_car')

In [21]:
def drop_duplicates(dataframe, primary_key_column):
    """Return a copy of ``dataframe`` without duplicate records.

    Two rows count as duplicates when they agree on every column other than
    ``primary_key_column``; the first occurrence is kept.

    Args:
        dataframe: The pandas DataFrame to de-duplicate.
        primary_key_column: Column left out of the duplicate comparison.

    Returns:
        A new DataFrame with the duplicate rows removed. The input frame is
        not modified.
    """
    # Compare rows on everything except the primary key.
    comparison_columns = dataframe.columns.difference([primary_key_column])
    return dataframe.drop_duplicates(subset=comparison_columns)

In [22]:
# Build the de-duplicated frame and report how many records remain
customer_data_deduped = drop_duplicates(dataframe=customer_data, primary_key_column='customer_id')
print("Number of records after removing duplicates: {}".format(len(customer_data_deduped)))

Number of records after removing duplicates: 3913


In [23]:
# Check for inconsistencies in other categorical columns.
# Inspect the de-duplicated frame, since that is the one exported below.
print(customer_data_deduped['wealth_segment'].value_counts())

wealth_segment
Mass Customer        1954
High Net Worth        996
Affluent Customer     963
Name: count, dtype: int64


In [24]:
# Inspect the de-duplicated frame, since that is the one exported below.
print(customer_data_deduped['deceased_indicator'].value_counts())

deceased_indicator
N    3911
Y       2
Name: count, dtype: int64


In [25]:
# Inspect the de-duplicated frame, since that is the one exported below.
print(customer_data_deduped['owns_car'].value_counts())

owns_car
Yes    1974
No     1939
Name: count, dtype: int64


# 6.0 Export Cleaned Data:

Exports the cleaned dataset to a CSV file named 'Cleaned_Customer_Demographic_Dataset.csv'

In [26]:
def export_cleaned_data_to_csv(dataframe, file_name):
    """Write ``dataframe`` to a CSV file, leaving out the index column.

    Args:
        dataframe: The pandas DataFrame to export.
        file_name: Destination path of the CSV file.

    Returns:
        None.
    """
    include_index = False
    dataframe.to_csv(path_or_buf=file_name, index=include_index)

In [27]:
# Export the cleaned, de-duplicated data to CSV (index column omitted)
export_cleaned_data_to_csv(customer_data_deduped, 'Cleaned_Customer_Demographic_Dataset.csv')