Cleaning the CSV file

In [4]:
import numpy as np
import pandas as pd

# Load the raw store export. `data` is never modified below, so the original
# frame stays available for comparison with the cleaned one.
data = pd.read_csv('StoreUS-2015.csv')

# Missing values are imputed rather than dropped, so no rows are lost at this
# step. Numeric columns receive their column mean (NaNs ignored by
# np.nanmean); 'Customer Name' receives the most frequent name.
numeric_fill_columns = ['Discount', 'Unit Price', 'Shipping Cost']
fill_values = {column: np.nanmean(data[column]) for column in numeric_fill_columns}
fill_values['Customer Name'] = data['Customer Name'].mode()[0]
data_cleaned = data.fillna(fill_values)

# Parse the date columns; errors='coerce' turns unparseable entries into NaT
# instead of raising.
for date_column in ('Order Date', 'Ship Date'):
    data_cleaned[date_column] = pd.to_datetime(data_cleaned[date_column], errors='coerce')

# Remove rows that are exact duplicates across every remaining column.
# NOTE(review): this runs while 'Row ID' is still present; if 'Row ID' is
# unique per row, no row can match another here -- confirm whether
# de-duplication should instead happen after the identifier columns are dropped.
data_cleaned = data_cleaned.drop_duplicates()

# Drop identifier columns that are not needed for the analysis.
data_cleaned = data_cleaned.drop(columns=['Row ID', 'Postal Code', 'Order ID'])

# Simple outlier handling: keep only rows whose 'Sales' AND 'Profit' both lie
# within OUTLIER_STD_LIMIT standard deviations of their column mean.
# np.std is called with its default ddof=0 (population standard deviation),
# which differs from the pandas Series.std default of ddof=1.
OUTLIER_STD_LIMIT = 3

sales_mean = np.mean(data_cleaned['Sales'])
sales_std = np.std(data_cleaned['Sales'])

profit_mean = np.mean(data_cleaned['Profit'])
profit_std = np.std(data_cleaned['Profit'])

# Both statistics are computed on the same pre-filter frame, so applying the
# two conditions as one combined mask selects exactly the rows that pass both.
# Rows with NaN in either column compare as False and are therefore dropped.
sales_in_range = np.abs(data_cleaned['Sales'] - sales_mean) <= (OUTLIER_STD_LIMIT * sales_std)
profit_in_range = np.abs(data_cleaned['Profit'] - profit_mean) <= (OUTLIER_STD_LIMIT * profit_std)
data_cleaned = data_cleaned[sales_in_range & profit_in_range]

data_cleaned.head()


Unnamed: 0,Order Priority,Discount,Unit Price,Shipping Cost,Customer ID,Customer Name,Ship Mode,Customer Segment,Product Category,Product Sub-Category,...,Product Base Margin,Country,Region,State or Province,City,Order Date,Ship Date,Profit,Quantity ordered new,Sales
0,High,0.01,2.84,0.93,3,Bonnie Potter,Express Air,Corporate,Office Supplies,Pens & Art Supplies,...,0.54,United States,West,Washington,Anacortes,2015-01-07,2015-01-08,4.56,4,13.01
2,Critical,0.06,9.48,7.29,11,Marcus Dunlap,Regular Air,Home Office,Furniture,Office Furnishings,...,0.45,United States,East,New Jersey,Roselle,2015-02-15,2015-02-17,-53.8096,22,211.15
3,Medium,0.09,78.69,19.99,14,Gwendolyn F Tyson,Regular Air,Small Business,Furniture,Office Furnishings,...,0.43,United States,Central,Minnesota,Prior Lake,2015-05-12,2015-05-14,803.4705,16,1164.45
4,Medium,0.08,3.28,2.31,14,Gwendolyn F Tyson,Regular Air,Small Business,Office Supplies,Pens & Art Supplies,...,0.56,United States,Central,Minnesota,Prior Lake,2015-05-12,2015-05-13,-24.03,7,22.23
5,Medium,0.05,3.28,4.2,14,Gwendolyn F Tyson,Regular Air,Small Business,Office Supplies,Pens & Art Supplies,...,0.56,United States,Central,Minnesota,Prior Lake,2015-05-12,2015-05-13,-37.03,4,13.99
