In [3]:
# Import necessary libraries
import sys
from pathlib import Path

import numpy as np # Often useful for data manipulation
import pandas as pd

# Add the 'src' folder to our path so we can import our functions
sys.path.append('src')

# Import our custom cleaning functions
from cleaning import fill_missing_median, drop_missing, normalize_data

# Load the raw dataset.
# The CSV is expected at 'data/raw/sample_data.csv', relative to the
# directory the notebook kernel was started in.
try:
    df_raw = pd.read_csv('data/raw/sample_data.csv')
except FileNotFoundError:
    # Print a friendly hint, then re-raise so execution stops here.
    # Swallowing the error would leave `df_raw` undefined and the cell would
    # fail a few lines later with a confusing NameError instead.
    print("Error: Make sure 'sample_data.csv' is in the 'data/raw/' folder.")
    raise
else:
    print("Dataset loaded successfully!")

# Display the first few rows to see what it looks like
print("Original Data Head:")
df_raw.head()

Dataset loaded successfully!
Original Data Head:


Unnamed: 0,id,name,value,is_active
0,1,Alice,10.5,True
1,2,Bob,20.2,False
2,3,Charlie,30.7,True
3,4,David,40.1,False


In [21]:
# --- Step 1: drop sparsely populated columns ---
# Assumption: a column that is more than 50% empty does not hold enough
# data to be useful, so it is removed before any further processing.
df_without_sparse = drop_missing(df_raw, threshold=0.5)

# --- Step 2: fill remaining gaps with the median ---
# Assumption: the median of the existing values is a reasonable stand-in for
# a missing value. Median is chosen over mean so that extreme outliers do not
# skew the fill value.
# NOTE(review): this step currently emits a pandas FutureWarning about
# `df[col].fillna(..., inplace=True)`; it appears to originate inside
# `fill_missing_median`, so the fix belongs in the `cleaning` module.
columns_to_fill = ['value']  # columns of this dataset to impute; adjust for other data
df_filled = fill_missing_median(df_without_sparse, columns=columns_to_fill)

# --- Step 3: rescale to a common 0-to-1 range ---
# Assumption: features should share one scale for future machine learning
# models, so that no single feature dominates the others.
columns_to_normalize = ['value']  # columns of this dataset to rescale; adjust for other data
df_cleaned = normalize_data(df_filled, columns=columns_to_normalize)

print("\nCleaning process complete!")

No columns exceeded the missing value threshold.
Filled missing values in columns: ['value'] using the median.
Normalized columns: ['value']

Cleaning process complete!


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned[col].fillna(median_val, inplace=True)


In [23]:
# Print summary statistics for the raw and the cleaned frame back to back so
# the effect of the cleaning steps can be compared directly.
description_sections = [
    ("--- Original Data Description ---", df_raw),
    ("\n--- Cleaned Data Description ---", df_cleaned),
]
for heading, frame in description_sections:
    print(heading)
    print(frame.describe())

# The final expression is left bare so the notebook renders it as a table.
print("\n--- Cleaned Data Head ---")
df_cleaned.head()

--- Original Data Description ---
             id      value
count  4.000000   4.000000
mean   2.500000  25.375000
std    1.290994  12.822214
min    1.000000  10.500000
25%    1.750000  17.775000
50%    2.500000  25.450000
75%    3.250000  33.050000
max    4.000000  40.100000

--- Cleaned Data Description ---
             id     value
count  4.000000  4.000000
mean   2.500000  0.502534
std    1.290994  0.433183
min    1.000000  0.000000
25%    1.750000  0.245777
50%    2.500000  0.505068
75%    3.250000  0.761824
max    4.000000  1.000000

--- Cleaned Data Head ---


Unnamed: 0,id,name,value,is_active
0,1,Alice,0.0,True
1,2,Bob,0.327703,False
2,3,Charlie,0.682432,True
3,4,David,1.0,False


In [25]:
# Define the output path (relative to the kernel's working directory)
output_path = 'data/processed/cleaned_data.csv'

# Make sure the destination folder exists; `to_csv` does not create missing
# directories and would otherwise raise OSError on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

# Save the cleaned DataFrame to a new CSV file.
# index=False keeps the DataFrame's row index out of the file.
df_cleaned.to_csv(output_path, index=False)

print(f"Cleaned dataset saved to: {output_path}")

Cleaned dataset saved to: data/processed/cleaned_data.csv
