In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [19]:
# Read the Global Superstore CSV into the `data` DataFrame.
# The ISO-8859-1 codec is requested explicitly (presumably the file is not valid UTF-8 — confirm).
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy of the file.
data = pd.read_csv(
    filepath_or_buffer="D:\\MainFlow\\Task2\\Global_Superstore2.csv",
    encoding='ISO-8859-1',
)

In [20]:
# Preview the top of the table (the first five rows) to check column names and value formats
print(data.head(n=5))

   Row ID         Order ID  Order Date   Ship Date     Ship Mode Customer ID  \
0   32298   CA-2012-124891  31-07-2012  31-07-2012      Same Day    RH-19495   
1   26341    IN-2013-77878  05-02-2013  07-02-2013  Second Class    JR-16210   
2   25330    IN-2013-71249  17-10-2013  18-10-2013   First Class    CR-12730   
3   13524  ES-2013-1579342  28-01-2013  30-01-2013   First Class    KM-16375   
4   47221     SG-2013-4320  05-11-2013  06-11-2013      Same Day     RH-9495   

      Customer Name      Segment           City            State  ...  \
0       Rick Hansen     Consumer  New York City         New York  ...   
1     Justin Ritter    Corporate     Wollongong  New South Wales  ...   
2      Craig Reiter     Consumer       Brisbane       Queensland  ...   
3  Katherine Murray  Home Office         Berlin           Berlin  ...   
4       Rick Hansen     Consumer          Dakar            Dakar  ...   

         Product ID    Category Sub-Category  \
0   TEC-AC-10003033  Technology 

In [21]:
# Impute missing values; numeric and non-numeric columns are handled separately.

# Numeric columns: every NaN is replaced by the median of its own column.
numeric_cols = data.select_dtypes(include=[np.number]).columns
numeric_part = data[numeric_cols]
data[numeric_cols] = numeric_part.fillna(numeric_part.median())

# All remaining columns: NaN becomes the placeholder 'Unknown'
# (a per-column mode or another strategy could be substituted here).
non_numeric_cols = data.select_dtypes(exclude=[np.number]).columns
data[non_numeric_cols] = data[non_numeric_cols].fillna(value='Unknown')


In [22]:
# Drop fully duplicated rows from `data` in place, keeping the first occurrence of each
data.drop_duplicates(keep='first', inplace=True)

In [23]:
# Handle outliers
## Z-score method: a row is treated as an outlier when ANY of its numeric values lies
## more than `z_threshold` standard deviations from that column's mean.
# The earlier `.all(axis=1)` flagged a row only when every numeric column was extreme
# at the same time, so in practice no row was ever removed.
# NOTE(review): every numeric column takes part, including identifier-like ones such as
# 'Row ID' — consider restricting this to genuine measure columns.
z_threshold = 3  # number of standard deviations beyond which a value counts as extreme
z_scores = np.abs(stats.zscore(data.select_dtypes(include=[np.number])))  # Only numeric columns
outliers = (z_scores > z_threshold).any(axis=1)  # True for rows with at least one extreme value
data_clean = data[~outliers]  # keep only the rows that were not flagged

In [None]:
# Statistical Analysis
## Mean, Median, Standard Deviation, and Variance
# `data_clean` still contains text columns, and these reductions raise TypeError on
# them in pandas >= 2.0, so the statistics are computed on the numeric columns only.
numeric_clean = data_clean.select_dtypes(include=[np.number])
print("Mean values:\n", numeric_clean.mean())
print("Median values:\n", numeric_clean.median())
print("Standard Deviation:\n", numeric_clean.std())
print("Variance:\n", numeric_clean.var())

In [None]:
## Correlation matrix
# Pairwise correlation is computed on numeric columns only; calling `.corr()` on a
# frame that still holds text columns raises an error in pandas >= 2.0.
correlation_matrix = data_clean.select_dtypes(include=[np.number]).corr()
print("Correlation Matrix:\n", correlation_matrix)

In [None]:
# Data Visualization
## Distribution of every numeric column, one histogram panel per column
numerical_cols = data_clean.select_dtypes(include=[np.number]).columns
numeric_frame = data_clean[numerical_cols]
numeric_frame.hist(figsize=(15, 10), bins=15)
plt.suptitle('Histograms of Numerical Columns')
plt.show()

In [None]:
## Boxplots for outlier detection
# One boxplot per numeric column. The grid is sized from the number of columns in
# `numerical_cols`, so it no longer fails once there are more than six of them
# (the fixed 2x3 `plt.subplot` grid raised ValueError on the seventh panel).
n_grid_cols = 3  # panels per row
n_plots = len(numerical_cols)
if n_plots:  # nothing to draw when there are no numeric columns
    n_grid_rows = (n_plots + n_grid_cols - 1) // n_grid_cols  # ceiling division
    fig, axes = plt.subplots(
        n_grid_rows,
        n_grid_cols,
        figsize=(12, 4 * n_grid_rows),  # 12x8 for two rows, grows with extra rows
        squeeze=False,                  # always a 2-D array of axes, even for a single row
    )
    flat_axes = axes.ravel()
    for ax, col in zip(flat_axes, numerical_cols):
        sns.boxplot(x=data_clean[col], ax=ax)
        ax.set_title(f'Boxplot for {col}')
    # Hide the panels left over when the grid has more cells than columns
    for ax in flat_axes[n_plots:]:
        ax.set_visible(False)
    fig.tight_layout()
    plt.show()

In [None]:
## Annotated heatmap of the correlation matrix (diverging 'coolwarm' palette, two decimals per cell)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()