In [1]:
# Import pandas and NumPy

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Import dataset

In [4]:
# Load the raw customer file from the working directory into `df`.
df = pd.read_csv(filepath_or_buffer='customer_demographics_contaminated.csv')

In [5]:
# Count initial duplicates

In [6]:
# duplicated() flags each row that repeats an earlier row (the first occurrence
# is not flagged); summing the boolean flags counts the redundant rows.
initial_duplicate_count = df.duplicated(keep='first').sum()
print(f'Initial duplicate count: {initial_duplicate_count}')

Initial duplicate count: 177


In [7]:
# Drop duplicate rows

In [8]:
# Keep the first occurrence of every fully identical row. The result is bound to
# a new name, so the raw frame `df` stays available unchanged.
df_cleaned = df.drop_duplicates(subset=None, keep='first')

In [9]:
# Verify duplicate count after dropping

In [10]:
# Re-run the duplicate check on the de-duplicated frame.
duplicate_count_after_dropping = df_cleaned.duplicated(keep='first').sum()
print(f'Duplicate count after dropping: {duplicate_count_after_dropping}')

Duplicate count after dropping: 0


In [11]:
# Count null values in each column before cleaning

In [12]:
# Per-column count of missing values before any rows are removed for nulls.
null_counts_before = df_cleaned.isna().sum()
print('Null values before cleaning:', null_counts_before, sep='\n')

Null values before cleaning:
CustomerID       0
Age            276
Gender           0
Location         0
IncomeLevel    285
SignupDate       0
dtype: int64


In [13]:
# Drop rows where 'Age' is blank, NaN, or 'Unknown'

In [14]:
# Remove rows whose Age is missing or is only a placeholder.
# The text is normalised (trimmed, lower-cased) before matching so that padded or
# differently-cased placeholders such as ' ' or 'unknown ' are dropped together
# with the exact 'Unknown' / '' values; otherwise they would survive this step
# and later be turned into NaN by the numeric conversion.
# Real missing values are detected on the original column, because astype(str)
# turns NaN into the literal text 'nan'.
age_text = df_cleaned['Age'].astype(str).str.strip().str.lower()
age_is_missing = df_cleaned['Age'].isnull() | age_text.isin(['unknown', ''])
df_cleaned = df_cleaned.loc[~age_is_missing]

In [15]:
# Count null values in 'Age' column after cleaning

In [16]:
# Confirm that no missing Age values remain after the filter.
null_age_count = df_cleaned.loc[:, 'Age'].isna().sum()
print(f'Null values in Age after cleaning: {null_age_count}')

Null values in Age after cleaning: 0


In [17]:
# Drop rows where 'IncomeLevel' is null

In [18]:
# Discard every row that has no IncomeLevel; other columns are not considered.
df_cleaned = df_cleaned.dropna(axis=0, how='any', subset=['IncomeLevel'])

In [19]:
# Count null values in all columns after cleaning

In [20]:
# Per-column count of missing values once the Age and IncomeLevel rows are removed.
null_counts_after = df_cleaned.isna().sum()
print('Null values after cleaning:', null_counts_after, sep='\n')

Null values after cleaning:
CustomerID     0
Age            0
Gender         0
Location       0
IncomeLevel    0
SignupDate     0
dtype: int64


In [21]:
# Check current data types of the DataFrame

In [22]:
# Show the dtype pandas currently holds for each column.
print('Current data types:', df_cleaned.dtypes, sep='\n')

Current data types:
CustomerID     object
Age            object
Gender         object
Location       object
IncomeLevel    object
SignupDate     object
dtype: object


In [23]:
# Convert data types

In [24]:
# Remember which values were present before conversion, so that anything
# errors='coerce' turns into NaN/NaT can be reported instead of passing silently
# (the null checks earlier in the notebook ran before this conversion).
age_present = df_cleaned['Age'].notna()
signup_date_present = df_cleaned['SignupDate'].notna()

# Build a new frame with assign()/astype() rather than setting columns one by one
# on a frame that came out of row filtering; that pattern can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
df_cleaned = df_cleaned.assign(
    CustomerID=df_cleaned['CustomerID'].astype(str),
    # Non-numeric ages become NaN.
    Age=pd.to_numeric(df_cleaned['Age'], errors='coerce'),
    # Dates are expected as day/month/year; anything else becomes NaT.
    SignupDate=pd.to_datetime(df_cleaned['SignupDate'], format='%d/%m/%Y', errors='coerce'),
).astype({'Gender': 'category', 'Location': 'category', 'IncomeLevel': 'category'})

# Count values that existed before conversion but are null afterwards. The rows
# are kept; this only makes the data loss visible before the frame is saved.
coerced_to_null = pd.Series({
    'Age': (age_present & df_cleaned['Age'].isna()).sum(),
    'SignupDate': (signup_date_present & df_cleaned['SignupDate'].isna()).sum(),
})
print('Values that failed conversion and became null:')
print(coerced_to_null)

In [25]:
# Confirm that each column now carries its converted dtype.
print('Data types after conversion:', df_cleaned.dtypes, sep='\n')

Data types after conversion:
CustomerID             object
Age                   float64
Gender               category
Location             category
IncomeLevel          category
SignupDate     datetime64[ns]
dtype: object


In [26]:
# Save the cleaned DataFrame to a new CSV file

In [27]:
# Write the cleaned frame next to the input file; index=False leaves the row
# index out of the CSV.
df_cleaned.to_csv(
    path_or_buf='customer_demographics_contaminated_cleaned.csv',
    index=False,
)