In [1]:
import pandas as pd

In [2]:
# Load the raw customer export into the notebook-wide working frame.
raw_customers_path = '../data/raw/unclean_customers.csv'
existing_customers = pd.read_csv(raw_customers_path)

# Re-type the age column as 'Int64' (pandas' nullable integer dtype).
age_as_nullable_int = existing_customers['age'].astype('Int64')
existing_customers['age'] = age_as_nullable_int

In [None]:
# Examine: print pandas' summary report for the loaded customers frame.

existing_customers.info()

In [None]:
# Clean

# Count rows that are exact copies of an earlier row.
print(existing_customers.duplicated().sum())

# Drop those exact copies. The result is reassigned rather than mutated
# with inplace=True.
existing_customers = existing_customers.drop_duplicates()

# Sanity check: the exact-duplicate count should now be zero.
print(existing_customers.duplicated().sum())
# info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
existing_customers.info()

# Sort on customer_id. A stable algorithm is requested so that rows sharing
# a customer_id keep their relative order, which makes the keep='first'
# step below retain the earliest occurrence (the default quicksort gives no
# such guarantee for ties).
existing_customers = existing_customers.sort_values(by='customer_id', kind='mergesort')

# Remove the second (and any later) row of each repeated customer_id.
existing_customers = existing_customers.drop_duplicates(subset='customer_id', keep='first')

existing_customers.info()


# write intermediate data to csv (currently disabled)

# existing_customers.to_csv('../data/processed/interim_customers.csv', index=False)

In [None]:
# Randomly remove 2.86% of rows (i.e. keep a 97.14% sample).
# A fixed random_state makes the retained subset identical on every run;
# without it each execution keeps a different set of rows.

existing_customers = existing_customers.sample(frac=0.9714, random_state=42)

# Now we have how many rows?
# info() prints its report itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line.

existing_customers.info()

In [None]:
# To make the data 5200 rows again we need to duplicate 149 rows
# (this presumes the frame holds 5051 rows at this point; not verified here).
# A fixed random_state makes the choice of duplicated rows reproducible.

existing_customers = pd.concat(
    [existing_customers, existing_customers.sample(n=149, random_state=42)]
)

# info() prints its report itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line.
existing_customers.info()

In [7]:
# Write the current frame, without its index, to both output files in turn.
# NOTE(review): the second path is the same file the load cell reads with
# pd.read_csv, so running this cell replaces the notebook's own input.
for output_path in (
    '../data/processed/interim_customers_with_duplicates.csv',  # interim copy
    '../data/raw/unclean_customers.csv',  # final copy
):
    existing_customers.to_csv(output_path, index=False)

In [None]:
# Duplicate check, part 1: rows that match an earlier row in every column.
exact_duplicate_count = existing_customers.duplicated().sum()
print(exact_duplicate_count)

# Duplicate check, part 2: rows whose customer_id already appeared earlier.
repeated_id_count = existing_customers.duplicated(subset='customer_id').sum()
print(repeated_id_count)

In [None]:
# Count the rows whose customer_id is missing.
missing_customer_id_count = existing_customers['customer_id'].isna().sum()
print(missing_customer_id_count)

In [None]:
# Count the rows whose age is missing.
missing_age_count = existing_customers['age'].isna().sum()
print(missing_age_count)

In [None]:
# Count the rows whose country is missing.
missing_country_count = existing_customers['country'].isna().sum()
print(missing_country_count)

In [None]:
# Count the rows whose is_active is missing.
missing_is_active_count = existing_customers['is_active'].isna().sum()
print(missing_is_active_count)

In [None]:
# Rows whose customer_id already appeared earlier in the frame (the first
# occurrence of each id is not flagged).
repeated_id_mask = existing_customers.duplicated(subset='customer_id', keep='first')
duplicates = existing_customers[repeated_id_mask]

# Of those, keep the rows lacking a country, an is_active value, or both.
lacks_country = duplicates['country'].isnull()
lacks_is_active = duplicates['is_active'].isnull()
missing_country_or_is_active = duplicates[lacks_country | lacks_is_active]

# Report how many such rows there are.
print(len(missing_country_or_is_active))

In [3]:
# Keep only rows where both country and is_active are present.
required_columns = ['country', 'is_active']
existing_customers = existing_customers.dropna(subset=required_columns)

# Report the number of rows that remain.
print(len(existing_customers))


4594


In [4]:
# Upper-case every country value and write the result back to the column.
uppercased_countries = existing_customers['country'].str.upper()
existing_customers['country'] = uppercased_countries

In [5]:
# Transform the is_active column to boolean

def standardise_is_active(is_active_value: object) -> bool:
    """Convert a raw ``is_active`` entry into a boolean.

    Args:
        is_active_value: The raw entry. Booleans are returned unchanged;
            anything else is converted with ``str()`` and compared
            case-insensitively.

    Returns:
        True if the entry is the boolean True, or if its text form is
        ``'active'`` (in any letter case) or ``'1'``; False otherwise.
    """
    # Already a boolean: pass it through so applying the function to
    # previously standardised values does not flip True to False.
    if isinstance(is_active_value, bool):
        return is_active_value

    # Convert to text *before* lower-casing. Calling .lower() on the raw
    # value raised AttributeError for non-string entries such as the int 1.
    return str(is_active_value).lower() in ('active', '1')

# Run standardise_is_active over every is_active entry, then write the
# results back into the same column.
is_active_flags = existing_customers['is_active'].apply(standardise_is_active)
existing_customers.loc[:, 'is_active'] = is_active_flags

In [6]:
# Drop rows that are exact copies of an earlier row.
deduplicated_customers = existing_customers.drop_duplicates()
existing_customers = deduplicated_customers

# Report the number of rows that remain.
print(len(existing_customers))

4457


In [15]:
# Write the cleaned frame, without its index, to the test-data CSV.
expected_results_path = '../tests/test_data/expected_customer_clean_results.csv'
existing_customers.to_csv(expected_results_path, index=False)