In [1]:
import pandas as pd

# Tukey whisker length: a value counts as an outlier when it lies more than
# this many IQRs below Q1 or above Q3.
IQR_WHISKER = 1.5


def iqr_inlier_mask(frame, whisker=IQR_WHISKER):
    """Flag the cells of ``frame`` that lie inside the per-column IQR fences.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns all support ``quantile`` (numeric columns).
    whisker : float, default 1.5
        Multiple of the IQR subtracted from Q1 / added to Q3 to build the fences.

    Returns
    -------
    pandas.DataFrame
        Boolean frame with the same index and columns as ``frame``. A cell is
        ``True`` when ``Q1 - whisker * IQR <= value <= Q3 + whisker * IQR``
        for its column (both fences inclusive). Missing values compare as
        ``False``, so they are reported as lying outside the fences.
    """
    lower_quartile = frame.quantile(0.25)
    upper_quartile = frame.quantile(0.75)
    spread = upper_quartile - lower_quartile
    lower_fence = lower_quartile - whisker * spread
    upper_fence = upper_quartile + whisker * spread
    # The fence Series align on column labels, so every column is compared
    # against its own pair of fences.
    return (frame >= lower_fence) & (frame <= upper_fence)


# ``__name__`` is "__main__" inside a notebook kernel (and when run as a
# script), so this cell behaves as before; the guard only stops the CSV work
# from running when the helper above is imported from elsewhere.
if __name__ == "__main__":
    # Load the cleaned Walmart dataset.
    df = pd.read_csv("Walmart_Cleaned.csv")

    # Keep only the columns that take part in the IQR rule. 'Date' is not
    # numeric; 'Holiday_Flag' is binary and may distort the IQR.
    # NOTE(review): 'Store' goes through the same filter as the measurements;
    # it looks like an identifier rather than a measurement - confirm intended.
    numeric_df = df.drop(columns=["Date", "Holiday_Flag"])

    # Per-cell verdicts, then one verdict per row: a row is kept only when
    # every one of its columns is inside the fences.
    filter_condition = iqr_inlier_mask(numeric_df)
    is_inlier_row = filter_condition.all(axis=1)

    numeric_df_no_outliers = numeric_df[is_inlier_row]

    print(f"Original shape: {numeric_df.shape}")
    print(f"Shape after removing outliers: {numeric_df_no_outliers.shape}")

    # NOTE(review): the saved file holds only the columns of numeric_df, so
    # 'Date' and 'Holiday_Flag' are not carried over - confirm this is wanted.
    numeric_df_no_outliers.to_csv("Walmart_Cleaned_No_Outliers.csv", index=False)

    # Rows rejected by the filter (at least one column outside its fences).
    removed_rows = numeric_df[~is_inlier_row]

    # Show a sample of the rejected rows, then their summary statistics.
    print(f"Number of removed rows: {removed_rows.shape[0]}")
    print(removed_rows.head())
    removed_rows_summary = removed_rows.describe()

    print("Summary statistics of the removed rows (outliers):")
    print(removed_rows_summary)



Original shape: (6435, 6)
Shape after removing outliers: (5917, 6)
Number of removed rows: 518
     Store  Weekly_Sales  Temperature  Fuel_Price         CPI  Unemployment
189      2    3436007.68        49.97       2.886  211.064660         8.163
241      2    3224369.80        46.66       3.112  218.999550         7.441
471      4    2789469.45        48.08       2.752  126.669267         7.127
474      4    2740057.14        46.57       2.884  126.879484         7.127
475      4    3526713.39        43.21       2.887  126.983581         7.127
Summary statistics of the removed rows (outliers):
           Store  Weekly_Sales  Temperature  Fuel_Price         CPI  \
count  518.00000  5.180000e+02   518.000000  518.000000  518.000000   
mean    25.27027  1.134370e+06    63.295309    3.564948  132.229724   
std     11.48055  6.851813e+05    18.920199    0.416317   12.806478   
min      2.00000  3.039088e+05    -2.060000    2.752000  126.064000   
25%     12.00000  8.044205e+05    50.430000