In [None]:
# -------------------------------
# Internship Task 1: Data Cleaning
# Dataset: Customer Personality Analysis
# -------------------------------

# Step 1: Import the library
import pandas as pd

# Step 2: Load the dataset
# The file uses tabs (not commas) between fields, so the separator is set explicitly.
dataset_path = "dataset/marketing_campaign.csv"
field_separator = "\t"
df = pd.read_csv(dataset_path, sep=field_separator)

# Step 3: Look at the dataset
# Each entry is (label, value); they are printed in order as "<label> <value>".
overview_items = (
    ("Rows and columns:", df.shape),                 # (row count, column count)
    ("\nFirst 5 rows:\n", df.head()),                # head() returns the first 5 rows
    ("\nColumn names:\n", df.columns.tolist()),      # plain list of column labels
)
for overview_label, overview_value in overview_items:
    print(overview_label, overview_value)

# -------------------------------
# Step 4: Check for problems
# -------------------------------

# Count missing values per column
missing_per_column = df.isnull().sum()
print("\nMissing values:\n", missing_per_column)

# Count rows that are exact copies of an earlier row
duplicate_row_count = df.duplicated().sum()
print("\nNumber of duplicate rows:", duplicate_row_count)

# Show the dtype pandas assigned to every column
column_dtypes = df.dtypes
print("\nData types:\n", column_dtypes)

# List the distinct labels in the two categorical columns cleaned later
education_labels = df['Education'].unique()
marital_labels = df['Marital_Status'].unique()
print("\nUnique values in Education:", education_labels)
print("\nUnique values in Marital_Status:", marital_labels)

# -------------------------------
# Step 5: Cleaning the dataset
# -------------------------------

# 1. Drop ID column (not useful for analysis)
# NOTE(review): the duplicate check in Step 4 runs while ID is still present.
# Once ID is gone, rows that differ only by ID become identical and are
# removed by drop_duplicates() in step 3 below - confirm this is intended.
df = df.drop(columns=['ID'])

# 2. Fill missing values in Income with median value
df['Income'] = df['Income'].fillna(df['Income'].median())

# 3. Remove duplicate rows (if any)
df = df.drop_duplicates()

# 4. Fix Education values (make them simple and uniform)
# 'Basic' already carries its final label, so it needs no mapping entry.
df['Education'] = df['Education'].replace({
    '2n Cycle': 'Undergraduate',
    'Graduation': 'Graduate',
    'PhD': 'Doctorate',
    'Master': 'Masters',
})

# 5. Fix Marital_Status values (make them clean and uniform)
df['Marital_Status'] = df['Marital_Status'].replace({
    'Alone': 'Single',
    'Absurd': 'Single',
    'YOLO': 'Single'
})

# 6. Convert Dt_Customer column to date format
# An explicit format is strict and behaves the same on every pandas version;
# relying on dayfirst=True lets pandas guess and may silently swap day/month.
# The file is expected to store dates as DD-MM-YYYY (e.g. 04-09-2012) -
# adjust the format string if your copy of the dataset differs.
raw_customer_dates = df['Dt_Customer']
parsed_customer_dates = pd.to_datetime(
    raw_customer_dates, format="%d-%m-%Y", errors='coerce'
)
# errors='coerce' turns unparseable values into NaT; report them instead of
# losing them silently (values that were already missing are not counted).
unparsed_date_count = int(
    (parsed_customer_dates.isna() & raw_customer_dates.notna()).sum()
)
if unparsed_date_count:
    print(
        f"\nWarning: {unparsed_date_count} Dt_Customer value(s) "
        "could not be parsed and were set to NaT"
    )
df['Dt_Customer'] = parsed_customer_dates

# 7. Rename columns (make them lowercase and replace spaces with _ )
df.columns = [col.lower().replace(" ", "_") for col in df.columns]

# 8. Fix data types (make sure year_birth is int, income is float)
df['year_birth'] = df['year_birth'].astype(int)
df['income'] = df['income'].astype(float)

# -------------------------------
# Step 6: Check again after cleaning
# -------------------------------
# Repeat the missing-value and dtype checks, then list the distinct labels
# of the two categorical columns (addressed by their lowercase names).
missing_after_cleaning = df.isnull().sum()
dtypes_after_cleaning = df.dtypes
print("\nMissing values now:\n", missing_after_cleaning)
print("\nData types now:\n", dtypes_after_cleaning)

education_after_cleaning = df['education'].unique()
marital_after_cleaning = df['marital_status'].unique()
print("\nEducation values after cleaning:", education_after_cleaning)
print("\nMarital status values after cleaning:", marital_after_cleaning)

# -------------------------------
# Step 7: Save the clean dataset
# -------------------------------
# index=False keeps the DataFrame's row index out of the output file.
cleaned_output_path = "cleaned_dataset.csv"
df.to_csv(cleaned_output_path, index=False)
print("\n✅ Cleaned dataset saved as 'cleaned_dataset.csv'")
