In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

## Data Cleaning:

In [2]:
# Two small datasets with some missing values and messy entries

customers = pd.DataFrame({
    'CustomerID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', "Basi", 'David', None], 
    'Age': [25, np.nan, 30, 22, 28]
})

orders = pd.DataFrame({
    'OrderID': [101, 102, 103, 104],
    'CustomerID': [1, 2, 3,None],
    'Amount': ["200","350",None,"400"]
})

print("\nOriginal Customers Data: ")

print(orders)
print(customers)



Original Customers Data: 
   OrderID  CustomerID Amount
0      101         1.0    200
1      102         2.0    350
2      103         3.0   None
3      104         NaN    400
   CustomerID   Name   Age
0           1  Alice  25.0
1           2    Bob   NaN
2           3   Basi  30.0
3           4  David  22.0
4           5   None  28.0


In [3]:
# Cleaning Steps:

# Removing null values

customers_clean = customers.dropna(subset=['Name'])  # Drop rows where Name is missing

customers_clean 

Unnamed: 0,CustomerID,Name,Age
0,1,Alice,25.0
1,2,Bob,
2,3,Basi,30.0
3,4,David,22.0


In [4]:
# Filling missing Age with mean age
customers_clean['Age'] = customers_clean['Age'].fillna(customers_clean['Age'].mean())  # Fill missing Age with mean age
customers_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  customers_clean['Age'] = customers_clean['Age'].fillna(customers_clean['Age'].mean())  # Fill missing Age with mean age


Unnamed: 0,CustomerID,Name,Age
0,1,Alice,25.0
1,2,Bob,25.666667
2,3,Basi,30.0
3,4,David,22.0


In [5]:
# Convert amoutn  (string)  to numeric and fill missing with 0

orders["amount"]= pd.to_numeric(orders["Amount"], errors='coerce').fillna(0)

print("\nCleaned Orders Data: ")
print(orders)


Cleaned Orders Data: 
   OrderID  CustomerID Amount  amount
0      101         1.0    200   200.0
1      102         2.0    350   350.0
2      103         3.0   None     0.0
3      104         NaN    400   400.0


In [6]:
# Data Integration [Merging Mltiple Sources

merged_data = pd.merge(customers_clean, orders, on='CustomerID', how='left')

print("\nMerged Data: ")
print(merged_data)


Merged Data: 
   CustomerID   Name        Age  OrderID Amount  amount
0           1  Alice  25.000000    101.0    200   200.0
1           2    Bob  25.666667    102.0    350   350.0
2           3   Basi  30.000000    103.0   None     0.0
3           4  David  22.000000      NaN    NaN     NaN


In [7]:
# Data Transformation:

#1. Create a new feature amount in thousands 

merged_data["amount_k"] = merged_data["amount"]/1000
merged_data

Unnamed: 0,CustomerID,Name,Age,OrderID,Amount,amount,amount_k
0,1,Alice,25.0,101.0,200.0,200.0,0.2
1,2,Bob,25.666667,102.0,350.0,350.0,0.35
2,3,Basi,30.0,103.0,,0.0,0.0
3,4,David,22.0,,,,


In [8]:
# Normalize age(0 to 1)

merged_data["age_normalized"] = ((merged_data["Age"] - merged_data["Age"].min())/(merged_data["Age"].max() - merged_data["Age"].min()))

merged_data

Unnamed: 0,CustomerID,Name,Age,OrderID,Amount,age_normalized
0,1,Alice,25.0,101.0,200.0,0.375
1,2,Bob,25.666667,102.0,350.0,0.458333
2,3,Basi,30.0,103.0,,1.0
3,4,David,22.0,,,0.0


In [8]:
# Categorize age groups

merged_data["age_group"] = pd.cut(merged_data["Age"], bins=[0,27, 31], labels=["Young", "Adult"])
merged_data

Unnamed: 0,CustomerID,Name,Age,OrderID,Amount,amount,amount_k,age_group
0,1,Alice,25.0,101.0,200.0,200.0,0.2,Young
1,2,Bob,25.666667,102.0,350.0,350.0,0.35,Young
2,3,Basi,30.0,103.0,,0.0,0.0,Adult
3,4,David,22.0,,,,,Young
