In [None]:
# code for data cleaning
import pandas as pd
import numpy as np

# --- Configuration ---
# Define the input CSV file name
input_csv_file = 'Global_Superstore2.xlsx - Sheet1.csv'
# Define the output CSV file name for the cleaned data
output_csv_file = 'Global_Superstore_Cleaned.csv'

# --- 1. Load the Dataset ---
# Only the read itself is guarded, so a problem in the reporting code below is
# not misreported as a loading failure.
print(f"Loading data from '{input_csv_file}'...")
try:
    df = pd.read_csv(input_csv_file)
except FileNotFoundError:
    print(f"Error: The file '{input_csv_file}' was not found. Please ensure it's in the same directory.")
    # Stop with a non-zero status. `exit()` is an interactive helper injected
    # by the `site` module and signals success (status 0) when called bare.
    raise SystemExit(1)
except Exception as e:
    # Top-level boundary: report any other read/parse problem and stop.
    print(f"An error occurred while loading the CSV: {e}")
    raise SystemExit(1)
else:
    print("Data loaded successfully. Initial shape:", df.shape)
    print("\nFirst 5 rows of the dataset:")
    print(df.head())
    print("\nColumn information and data types:")
    df.info()

# Make a copy to preserve the original DataFrame if needed for comparison
df_cleaned = df.copy()

# --- 2. Handle Missing Values ---
print("\n--- Handling Missing Values ---")
print("Missing values before handling:")
# Count missing values per column once and show only the affected columns.
missing_before = df_cleaned.isnull().sum()
print(missing_before[missing_before > 0])

# Separate columns by data type for specific handling
numerical_cols = df_cleaned.select_dtypes(include=np.number).columns
categorical_cols = df_cleaned.select_dtypes(include='object').columns

# Strategy 1: Impute numerical columns with the mean
# You could also use .median() for skewed data
# NOTE(review): this applies to every numeric column, including any
# identifier-like numeric columns the dataset may contain -- confirm that a
# mean is meaningful for each of them.
for col in numerical_cols:
    if df_cleaned[col].isnull().any():
        mean_val = df_cleaned[col].mean()
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on `df_cleaned[col]` is chained assignment:
        # it acts on an intermediate object and is not guaranteed to update
        # `df_cleaned` (it does not under pandas Copy-on-Write).
        df_cleaned[col] = df_cleaned[col].fillna(mean_val)
        print(f"Filled missing values in numerical column '{col}' with mean: {mean_val:.2f}")

# Strategy 2: Impute categorical columns with a placeholder 'Unknown'
# Alternatively, you could use df_cleaned[col].mode()[0] to fill with the most frequent value
for col in categorical_cols:
    if df_cleaned[col].isnull().any():
        df_cleaned[col] = df_cleaned[col].fillna('Unknown')
        print(f"Filled missing values in categorical column '{col}' with 'Unknown'")

print("\nMissing values after handling:")
missing_after = df_cleaned.isnull().sum()
print(missing_after[missing_after > 0])
if missing_after.sum() == 0:
    print("No missing values remaining.")

# --- 3. Remove Duplicates ---
print("\n--- Removing Duplicates ---")
# Remember the row count so the number of dropped rows can be reported.
initial_rows = len(df_cleaned)
# Drop duplicated rows in place; 'first' is the default and keeps the first
# occurrence of each duplicated row.
df_cleaned.drop_duplicates(keep='first', inplace=True)
rows_after_duplicates = len(df_cleaned)
duplicates_removed = initial_rows - rows_after_duplicates
print(f"Removed {duplicates_removed} duplicate rows.")
print(f"Dataset shape after removing duplicates: {df_cleaned.shape}")

# --- 4. Detect and Handle Outliers (using IQR method) ---
print("\n--- Detecting and Handling Outliers (IQR Method) ---")

# Outlier handling is applied to the numerical columns only.
# NOTE(review): every numerical column is treated as a candidate, including any
# identifier-like numeric column (for example a numeric 'Row ID', if present).
# Confirm whether such columns should be excluded before capping.
outlier_columns = []
for col in numerical_cols:
    # Only consider columns with more than 2 unique values
    if df_cleaned[col].nunique() <= 2:
        continue

    Q1 = df_cleaned[col].quantile(0.25)
    Q3 = df_cleaned[col].quantile(0.75)
    IQR = Q3 - Q1

    # With a zero IQR both bounds equal the quartile value, so capping would
    # overwrite every other value and flatten the column to a constant.
    if IQR == 0:
        print(f"Column '{col}': IQR is 0; skipping outlier handling.")
        continue

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # Identify outliers with a boolean mask; only the count is needed.
    outlier_mask = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)
    outlier_count = int(outlier_mask.sum())

    if outlier_count == 0:
        print(f"Column '{col}': No significant outliers detected.")
        continue

    outlier_columns.append(col)
    print(f"Column '{col}': Found {outlier_count} outliers.")
    print(f"  Lower Bound: {lower_bound:.2f}, Upper Bound: {upper_bound:.2f}")

    # Strategy for handling outliers: Capping/Flooring
    # Values below lower_bound are set to lower_bound
    # Values above upper_bound are set to upper_bound
    df_cleaned[col] = np.where(df_cleaned[col] < lower_bound, lower_bound, df_cleaned[col])
    df_cleaned[col] = np.where(df_cleaned[col] > upper_bound, upper_bound, df_cleaned[col])
    print(f"  Outliers in '{col}' have been capped/floored.")

if not outlier_columns:
    print("No columns with significant outliers were found or handled.")
else:
    print(f"\nOutlier handling applied to columns: {', '.join(outlier_columns)}")


# --- 5. Save the Cleaned Dataset ---
print("\n--- Saving Cleaned Data ---")
try:
    # index=False leaves the DataFrame index out of the written file.
    df_cleaned.to_csv(path_or_buf=output_csv_file, index=False)
    print(f"Cleaned data saved successfully to '{output_csv_file}'.")
    print(f"Final shape of the cleaned dataset: {df_cleaned.shape}")
except Exception as save_error:
    # A failed write is reported but does not stop the script; the closing
    # banner below is printed either way.
    print(f"An error occurred while saving the cleaned CSV: {save_error}")

print("\n--- Data Cleaning Process Complete ---")


In [None]:
# code for statistical analysis
import pandas as pd
import numpy as np

# --- Configuration ---
# Define the input CSV file name (this should be the output from the cleaning script)
input_cleaned_csv_file = 'Global_Superstore_Cleaned.csv'

# --- 1. Load the Cleaned Dataset ---
# Only the read itself is guarded, so a problem in the reporting code below is
# not misreported as a loading failure.
print(f"Loading cleaned data from '{input_cleaned_csv_file}' for statistical analysis...")
try:
    df_analyzed = pd.read_csv(input_cleaned_csv_file)
except FileNotFoundError:
    print(f"Error: The cleaned file '{input_cleaned_csv_file}' was not found. Please run the data cleaning script first.")
    # Stop with a non-zero status. `exit()` is an interactive helper injected
    # by the `site` module and signals success (status 0) when called bare.
    raise SystemExit(1)
except Exception as e:
    # Top-level boundary: report any other read/parse problem and stop.
    print(f"An error occurred while loading the cleaned CSV: {e}")
    raise SystemExit(1)
else:
    print("Cleaned data loaded successfully. Shape:", df_analyzed.shape)
    print("\nFirst 5 rows of the cleaned dataset:")
    print(df_analyzed.head())
    print("\nColumn information and data types:")
    df_analyzed.info()

# Select the numeric columns; all statistics below are computed on these only.
numerical_cols_analysis = df_analyzed.select_dtypes(include=np.number).columns

# --- 2. Statistical Analysis ---
print("\n--- Statistical Analysis ---")

if numerical_cols_analysis.empty:
    print("No numerical columns found for statistical analysis in the cleaned dataset.")
else:
    # Numeric-only view of the data, shared by both computations.
    numeric_frame = df_analyzed[numerical_cols_analysis]

    print("\nDescriptive Statistics for Numerical Columns:")
    # One row per statistic (mean, median, standard deviation, variance),
    # one column per numeric variable.
    descriptive_stats = numeric_frame.agg(['mean', 'median', 'std', 'var'])
    print(descriptive_stats)

    print("\nCorrelation Matrix for Numerical Columns:")
    # Pairwise correlations between the numeric variables.
    correlation_matrix = numeric_frame.corr()
    print(correlation_matrix)

print("\n--- Statistical Analysis Process Complete ---")


In [None]:
#Data Visualization:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration ---
# Define the input CSV file name (this should be the output from the cleaning script)
input_cleaned_csv_file = 'Global_Superstore_Cleaned.csv'

# --- 1. Load the Cleaned Dataset ---
# Only the read itself is guarded, so a problem in the reporting code below is
# not misreported as a loading failure.
print(f"Loading cleaned data from '{input_cleaned_csv_file}' for data visualization...")
try:
    df_visualized = pd.read_csv(input_cleaned_csv_file)
except FileNotFoundError:
    print(f"Error: The cleaned file '{input_cleaned_csv_file}' was not found. Please run the data cleaning script first.")
    # Stop with a non-zero status. `exit()` is an interactive helper injected
    # by the `site` module and signals success (status 0) when called bare.
    raise SystemExit(1)
except Exception as e:
    # Top-level boundary: report any other read/parse problem and stop.
    print(f"An error occurred while loading the cleaned CSV: {e}")
    raise SystemExit(1)
else:
    print("Cleaned data loaded successfully. Shape:", df_visualized.shape)
    print("\nFirst 5 rows of the cleaned dataset:")
    print(df_visualized.head())
    print("\nColumn information and data types:")
    df_visualized.info()

# Identify numerical columns for visualization
numerical_cols_visualization = df_visualized.select_dtypes(include=np.number).columns

# --- 2. Data Visualization ---
print("\n--- Generating Data Visualizations ---")

if not numerical_cols_visualization.empty:
    # Subplot grid shared by the histogram and boxplot figures: three plots per
    # row. Ceiling division gives exactly as many rows as needed; `n // 3 + 1`
    # would add an empty row whenever the column count is a multiple of 3.
    n_grid_cols = 3
    n_grid_rows = (len(numerical_cols_visualization) + n_grid_cols - 1) // n_grid_cols

    # --- 2.1. Histograms for Numerical Data Distribution ---
    print("\nGenerating Histograms for Numerical Data Distribution...")
    plt.figure(figsize=(15, 10))
    # Subplot positions are 1-based, hence start=1.
    for position, col in enumerate(numerical_cols_visualization, start=1):
        plt.subplot(n_grid_rows, n_grid_cols, position)
        sns.histplot(df_visualized[col], kde=True)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
    print("Histograms displayed.")

    # --- 2.2. Boxplots to Identify Outliers in Continuous Variables ---
    print("\nGenerating Boxplots to Identify Outliers...")
    plt.figure(figsize=(15, 10))
    for position, col in enumerate(numerical_cols_visualization, start=1):
        plt.subplot(n_grid_rows, n_grid_cols, position)
        sns.boxplot(y=df_visualized[col])
        plt.title(f'Boxplot of {col}')
        plt.ylabel(col)
    plt.tight_layout()
    plt.show()
    print("Boxplots displayed.")

    # --- 2.3. Heatmap to Visualize Correlations ---
    print("\nGenerating Heatmap for Correlations...")
    correlation_matrix = df_visualized[numerical_cols_visualization].corr()
    plt.figure(figsize=(10, 8))
    # annot/fmt print each coefficient in its cell with two decimals.
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
    plt.title('Correlation Matrix Heatmap')
    plt.show()
    print("Correlation Heatmap displayed.")

else:
    print("No numerical columns found for data visualization.")

print("\n--- Data Visualization Process Complete ---")
