# Client Data Cleaning — Template with Visualizations
This notebook is a client-ready, reproducible pipeline that:
- Loads raw data
- Profiles & documents issues
- Cleans data (types, missing, duplicates, outliers)
- Visualizes key checks (missingness, distributions, outliers)
- Exports cleaned data

---


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Location of the untouched source file.
RAW_PATH = 'data/messy_customers.csv'

# df_raw is kept as read from disk; every later step works on the
# independent deep copy `df`.
df_raw = pd.read_csv(filepath_or_buffer=RAW_PATH)
df = df_raw.copy(deep=True)

# Preview the first five rows.
df.head(5)

## 1. Initial profiling

In [None]:
# Quick profile of the working frame: dimensions, then one printed summary
# per heading (column dtypes, null counts per column).
print('Rows,Cols:', df.shape)

profile_sections = (
    ('\nData types:', df.dtypes),
    ('\nMissing values per column:', df.isna().sum()),
)
for heading, summary in profile_sections:
    print(heading)
    print(summary)

## 2. Visualize missingness and distributions
We use matplotlib to show missing counts and distributions. (No seaborn.)

In [None]:
# Null count per column, largest first, drawn as a bar chart.
missing = df.isna().sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 4))
missing.plot(kind='bar', ax=ax)
ax.set_title('Missing values per column')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()

In [None]:
# Distribution of amount_spent: keep the original strings, parse a numeric
# version, then plot a histogram of the parsed values.
#
# Both columns are derived from the untouched df_raw (aligned on the index),
# so re-running this cell never overwrites the preserved raw strings with
# already-parsed numbers.
df['amount_spent_raw'] = df_raw['amount_spent']

# Strip '$' and ',' before conversion. The pattern is a raw string so the
# backslash is not treated as an (invalid) Python string escape.
amount_text = df_raw['amount_spent'].replace(r'[\$,]', '', regex=True)

# errors='coerce' turns anything that still is not a number into NaN.
df['amount_spent'] = pd.to_numeric(amount_text, errors='coerce')

plt.figure(figsize=(8,4))
plt.hist(df['amount_spent'].dropna(), bins=10)
plt.title('Distribution of amount_spent (raw)')
plt.xlabel('amount_spent')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

## 3. Cleaning steps (types, text, dates, missing, duplicates)

In [None]:
# Trim surrounding whitespace in text columns.
# Missing values are deliberately left as NaN: a blanket astype(str) would
# turn them into the literal string 'nan', which hides them from isna() and
# makes the *_was_missing flags (and the missingness chart) report no gaps.
text_cols = df.select_dtypes(include='object').columns.tolist()
for col in text_cols:
    present = df[col].notna()
    df[col] = df[col].astype(str).str.strip().where(present)

# Normalize country to uppercase and collapse common spelling variants.
if 'country' in df.columns:
    df['country'] = df['country'].str.upper().replace({
        'UNITED STATES':'US','U.S.':'US','U.S.A.':'US','USA':'US','BD':'BD','BD.':'BD'
    })

# Parse dates; unparseable values become NaT.
if 'signup_date' in df.columns:
    df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')

# amount_spent was already converted to numeric in the distribution cell.

# Age: coerce to numeric and blank out values outside 0..120.
if 'age' in df.columns:
    df['age'] = pd.to_numeric(df['age'], errors='coerce')
    df.loc[(df['age'] < 0) | (df['age'] > 120), 'age'] = np.nan

# Email: simple validity flag (contains both '@' and '.').
# regex=False matches the characters literally; na=False makes a missing
# email count as invalid and keeps the flag a plain boolean column.
if 'email' in df.columns:
    has_at = df['email'].str.contains('@', regex=False, na=False)
    has_dot = df['email'].str.contains('.', regex=False, na=False)
    df['email_valid'] = has_at & has_dot

# Record which values were missing before any imputation happens.
for col in ['email','age','amount_spent','signup_date']:
    if col in df.columns:
        df[col + '_was_missing'] = df[col].isna()

df.head()

## 4. Visual checks after initial cleaning

In [None]:
# Null count per column once types and dates have been parsed, largest first.
missing_after = df.isna().sum().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(8, 4))
missing_after.plot(kind='bar', ax=ax)
ax.set_title('Missing values per column (after parsing)')
fig.tight_layout()
plt.show()

In [None]:
# Horizontal boxplot of the parsed amounts (nulls dropped) to spot outliers.
amounts_present = df['amount_spent'].dropna()

fig, ax = plt.subplots(figsize=(6, 4))
ax.boxplot(amounts_present, vert=False)
ax.set_title('Boxplot of amount_spent (after parsing)')
ax.set_xlabel('amount_spent')
fig.tight_layout()
plt.show()

## 5. Handling missing values
Numeric imputation with median; categorical imputation with 'Unknown'.

In [None]:
# Numeric columns: fill gaps with the column median.
for col in ['amount_spent','age']:
    if col in df.columns:
        median = df[col].median()
        df[col] = df[col].fillna(median)

# Text columns: treat the placeholder strings 'nan' / 'None' as missing,
# then label every missing value 'Unknown'.
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].replace({'nan': np.nan, 'None': np.nan})
    df[col] = df[col].fillna('Unknown')

# Recompute email_valid now that missing emails read 'Unknown'.
# Guarded like the loops above so a dataset without an email column does not
# raise; regex=False matches '@' and '.' literally (no escape needed) and
# na=False keeps the result strictly boolean.
if 'email' in df.columns:
    df['email_valid'] = (
        df['email'].str.contains('@', regex=False, na=False)
        & df['email'].str.contains('.', regex=False, na=False)
    )
df.head()

## 6. Duplicates handling
We will deduplicate by customer_id keeping the most recent signup_date.

In [None]:
# Keep the most recent row per customer_id; rows whose id is missing are
# kept as they are because they cannot be matched to one another.
if 'customer_id' in df.columns and 'signup_date' in df.columns:
    # na_position='first' puts rows without a signup_date at the start, so
    # keep='last' can never prefer an undated duplicate over a dated one.
    # kind='stable' makes ties on signup_date resolve by original row order.
    ordered = df.sort_values('signup_date', kind='stable', na_position='first')

    # An id counts as missing when it is NaN or one of the placeholder
    # strings used for missing text elsewhere in this notebook.
    ids = ordered['customer_id']
    id_missing = ids.isna() | ids.astype(str).str.strip().isin(
        ['', 'nan', 'None', 'Unknown']
    )

    # A row is dropped only if a later row carries the same real id.
    superseded = ordered.duplicated(subset='customer_id', keep='last') & ~id_missing
    df = ordered.loc[~superseded]

df.shape

## 7. Final checks & Export

In [None]:
# Destination for the cleaned dataset.
OUTPUT = 'data/cleaned_customers.csv'

# Last sanity report: dimensions and any nulls still present per column.
remaining_nulls = df.isna().sum()
print('Final shape:', df.shape)
print('\nMissing values:', remaining_nulls, sep='\n')

# Write the result without the index column and confirm where it went.
df.to_csv(OUTPUT, index=False)
print('\nSaved cleaned data to', OUTPUT)