# **Data Cleaning Project**
This project was created for learning purposes. It gives you a practical understanding of data cleaning.

## 1. Import Libraries & Load Data

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Load the dataset from disk
file_path = "../data/data.csv"
df = pd.read_csv(file_path)
# Keep an untouched snapshot of the loaded data. copy() must be *called*:
# without the parentheses df_raw would be the bound method, not a DataFrame.
df_raw = df.copy()

## 2. Initial Profiling & Inspection

In [17]:
# Report the dataset dimensions (rows x columns)
n_rows, n_cols = df.shape
print(f"{n_rows} rows and {n_cols} columns.")

90 rows and 8 columns.


In [16]:
# Preview the first five rows to get a feel for the columns and their formatting
df.head(n=5)

Unnamed: 0,OrderID,Date,CustomerName,Product,Quantity,Price,Total,Region
0,1001,1/5/2021,John Smith,Laptop,1.0,"$1,200",1200,North
1,1002,1/7/2021,Jane Doe,Mobile,2.0,300,600,South
2,1003,1/8/2021,Michael Brown,Laptop,,$1,200,
3,1004,2021-13-09,Sarah Miller,Tablet,3.0,$250,750,West
4,1005,1/10/2021,John Smith,Laptop,1.0,1200,1200,North


In [15]:
# Show each column's dtype and non-null count, plus the frame's memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   OrderID       90 non-null     int64 
 1   Date          87 non-null     object
 2   CustomerName  85 non-null     object
 3   Product       87 non-null     object
 4   Quantity      85 non-null     object
 5   Price         89 non-null     object
 6   Total         85 non-null     object
 7   Region        88 non-null     object
dtypes: int64(1), object(7)
memory usage: 5.8+ KB


In [14]:
# Initial summary statistics for every column; include="all" reports the
# non-numeric columns (count / unique / top / freq) alongside the numeric ones
df.describe(include="all")

Unnamed: 0,OrderID,Date,CustomerName,Product,Quantity,Price,Total,Region
count,90.0,87,85,87,85.0,89,85.0,88
unique,,86,10,3,12.0,7,16.0,7
top,,1/12/2021,Adam Lee,Laptop,1.0,"$1,200",300.0,West
freq,,2,17,32,36.0,22,23.0,32
mean,1045.5,,,,,,,
std,26.124701,,,,,,,
min,1001.0,,,,,,,
25%,1023.25,,,,,,,
50%,1045.5,,,,,,,
75%,1067.75,,,,,,,


In [21]:
# Count missing values per column, listing the most affected columns first
print("\nNumber of missing values in each column")
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)


Number of missing values in each column


CustomerName    5
Quantity        5
Total           5
Date            3
Product         3
Region          2
Price           1
OrderID         0
dtype: int64

In [27]:
# Count rows that are exact duplicates of an earlier row
n_duplicated_rows = df.duplicated().sum()
print(f"Number of duplicated rows is {n_duplicated_rows}")

Number of duplicated rows is 0


## 3. Fixing Data Types

In [29]:
# Parse the "Date" column into datetime values.
# errors="coerce" turns entries that cannot be parsed into NaT instead of raising.
date_text = df["Date"]
df["Date"] = pd.to_datetime(date_text, errors="coerce")

In [44]:
# Strip currency formatting ("$" and thousands separators) from "Price".
# regex=False makes both patterns literal: "$" is a regex end-of-string anchor,
# and the default value of `regex` differs between pandas versions.
# NOTE: this cell expects "Price" to still hold strings; it must run before the
# numeric conversion below.
df["Price"] = (
    df["Price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)

In [48]:
# Convert the "Price", "Quantity" and "Total" columns to numeric dtypes.
# errors="coerce" replaces values that cannot be parsed with NaN.
lists = ["Price", "Quantity", "Total"]
df[lists] = df[lists].apply(pd.to_numeric, errors="coerce")

In [53]:
# Store "OrderID" with object dtype instead of int64
# (presumably so it is treated as a label rather than a number - confirm intent)
order_ids = df["OrderID"]
df["OrderID"] = order_ids.astype("object")

## 4. Text Normalization / Categorical Cleaning

In [60]:
# Print the frequency of each distinct value, column by column, to spot
# inconsistent spellings and implausible values
for column_name, column_values in df.items():
    print(column_values.value_counts())

OrderID
1001    1
1002    1
1003    1
1004    1
1005    1
       ..
1086    1
1087    1
1088    1
1089    1
1090    1
Name: count, Length: 90, dtype: int64
Date
2021-01-12    2
2021-01-07    1
2021-01-08    1
2021-01-10    1
2021-01-05    1
             ..
2021-04-02    1
2021-04-03    1
2021-04-04    1
2021-04-05    1
2021-04-06    1
Name: count, Length: 85, dtype: int64
CustomerName
Adam Lee           17
Michael Brown      17
Jane Doe           16
Sarah Miller       15
John Smith         15
  John Smith        1
  John  SMITH       1
Jane   Doe          1
John SMITH          1
JANE DOE            1
Name: count, dtype: int64
Product
Laptop    32
Mobile    30
Tablet    25
Name: count, dtype: int64
Quantity
 1.0      36
 2.0      19
 3.0      18
 5.0       2
 4.0       2
 100.0     1
 0.0       1
-3.0       1
 10.0      1
 200.0     1
Name: count, dtype: int64
Price
300.0     31
1200.0    27
250.0     25
1.0        6
Name: count, dtype: int64
Total
 300.0      23
 750.0      16
 1200.0 