In [2]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import PowerTransformer

# -----------------------------
# 1) Load dataset
# -----------------------------
# Read the weekly retail-sales CSV into the working frame.
df = pd.read_csv("D:/datasets/dpp/retail_sales_week4.csv")

# Columns examined for skewness and log-transformed in the cells below.
cols = ["annual_income", "monthly_spend", "avg_basket_value", "marketing_spend"]

# Share of missing entries per analysed column, as a percentage (2 decimals).
missing_pct = df[cols].isna().mean().mul(100).round(2)

print("Shape:", df.shape)
print("\nMissing %:")
print(missing_pct)

Shape: (1000, 16)

Missing %:
annual_income       5.9
monthly_spend       0.0
avg_basket_value    0.0
marketing_spend     3.0
dtype: float64


In [3]:
# -----------------------------
# 2) Quick skewness + mean/median check (raw)
# -----------------------------
# Work on the analysed columns only; one statistic per output column.
raw_subset = df[cols]
raw_stats = {
    "skew_raw": raw_subset.skew(numeric_only=True),
    "mean_raw": raw_subset.mean(numeric_only=True),
    "median_raw": raw_subset.median(numeric_only=True),
}

# One row per analysed column, rounded to 3 decimals for display.
summary = pd.DataFrame(raw_stats).round(3)

print("\nRaw summary (skew / mean / median):")
print(summary)


Raw summary (skew / mean / median):
                  skew_raw    mean_raw  median_raw
annual_income        8.777  794433.333  601706.220
monthly_spend       10.098    7290.160    5127.585
avg_basket_value    14.950    2355.347    1425.635
marketing_spend      7.341    3013.435    1470.815


In [6]:
# -----------------------------
# 3) Data cleaning BEFORE log
#    - annual_income: values <= 0 are treated as missing (0 as a missing-code)
#    - marketing_spend: negative values are treated as missing (entry error)
#    - every analysed column: remaining gaps are filled with the column median
# -----------------------------
df_clean = df.copy()

# Blank out implausible values so they are imputed together with the real gaps.
income = df_clean["annual_income"]
df_clean["annual_income"] = income.mask(income <= 0)

marketing = df_clean["marketing_spend"]
df_clean["marketing_spend"] = marketing.mask(marketing < 0)

# Medians are taken after the masking above, so the invalid values do not
# influence the fill value; fillna with a Series fills column by column.
column_medians = df_clean[cols].median()
df_clean[cols] = df_clean[cols].fillna(column_medians)

remaining_missing_pct = df_clean[cols].isna().mean().mul(100).round(2)
print("\nMissing %:")
print(remaining_missing_pct)


Missing %:
annual_income       0.0
monthly_spend       0.0
avg_basket_value    0.0
marketing_spend     0.0
dtype: float64


In [8]:
# -----------------------------
# program 4-2-1
# 4) Apply log transform safely, then compare skew before and after
#    log1p(x) = log(1 + x): maps 0 to 0 and is only defined for x > -1
# -----------------------------
def summarize_skew(frame: pd.DataFrame, columns: list, suffix: str) -> pd.DataFrame:
    """Return skewness, mean and median of the given columns.

    Parameters
    ----------
    frame : pd.DataFrame
        Data to summarise.
    columns : list
        Names of the columns of ``frame`` to include.
    suffix : str
        Tag appended to each statistic's column name (e.g. ``"log"`` gives
        ``skew_log`` / ``mean_log`` / ``median_log``) so that tables for
        different processing stages cannot be confused with each other.

    Returns
    -------
    pd.DataFrame
        One row per entry of ``columns``, three statistic columns,
        rounded to 3 decimals.
    """
    subset = frame[columns]
    return pd.DataFrame({
        f"skew_{suffix}": subset.skew(numeric_only=True),
        f"mean_{suffix}": subset.mean(numeric_only=True),
        f"median_{suffix}": subset.median(numeric_only=True),
    }).round(3)


# The cleaning cell only range-checks annual_income and marketing_spend, so
# positivity of the other columns is not guaranteed. np.log1p returns NaN
# (x < -1) or -inf (x == -1) with just a RuntimeWarning, which would silently
# distort the statistics below -- fail loudly instead.
invalid_cols = [c for c in cols if (df_clean[c] <= -1).any()]
if invalid_cols:
    raise ValueError(
        f"log1p is undefined for values <= -1; clean these columns first: {invalid_cols}"
    )

df_log = df_clean.copy()
df_log[cols] = np.log1p(df_clean[cols])

# Before: cleaned + imputed data on the original scale.
summary_clean = summarize_skew(df_clean, cols, "clean")
print("\nCleaned summary, before log (skew / mean / median):")
print(summary_clean)

# After: the same columns on the log1p scale. `summary` keeps pointing at the
# log-scale table, as it did at the end of this cell before.
summary = summarize_skew(df_log, cols, "log")
print("\nLog transformed summary (skew / mean / median):")
print(summary)



Raw summary (skew / mean / median):
                  skew_raw    mean_raw  median_raw
annual_income        9.082  786261.452  604682.580
monthly_spend       10.098    7290.160    5127.585
avg_basket_value    14.950    2355.347    1425.635
marketing_spend      7.570    2999.704    1490.025

 Log Tranformed summary (skew / mean / median):
                  skew_raw  mean_raw  median_raw
annual_income        0.667    13.303      13.312
monthly_spend        0.273     8.531       8.543
avg_basket_value     0.235     7.273       7.263
marketing_spend      0.174     7.350       7.307


In [None]:
#program 4-2-2
