# In [1]:

import pandas as pd
import numpy as np

def perform_data_quality_assessment(
    df: pd.DataFrame, outlier_column: str = "amount"
) -> None:
    """Print a data-quality and distribution report for ``df`` to stdout.

    The report covers, in order: missing values per column, the number of
    duplicated rows, column dtypes, IQR-based outliers for one numeric
    column, ``describe(include='all')`` statistics, a 10-bin histogram for
    every numeric column, and value counts for every object-dtype column.

    Args:
        df: The DataFrame to inspect. It is only read, never modified.
        outlier_column: Name of the column checked for outliers with the
            1.5 * IQR rule. If the column is absent or not numeric, the
            outlier step is skipped with a message instead of raising.

    Returns:
        None. All results are printed.
    """
    print("\n--- Data Quality Assessment ---")

    # Check for missing values
    print("\nMissing values per column:")
    print(df.isnull().sum())

    # Check for duplicate rows
    print("\nNumber of duplicate rows:", df.duplicated().sum())

    # Check data types
    print("\nData types:")
    print(df.dtypes)

    # Identify outliers with the IQR rule: anything further than 1.5 * IQR
    # below Q1 or above Q3 is reported.
    if outlier_column not in df.columns:
        print(f"\nColumn '{outlier_column}' not found; skipping IQR outlier check.")
    elif not pd.api.types.is_numeric_dtype(df[outlier_column]):
        print(f"\nColumn '{outlier_column}' is not numeric; skipping IQR outlier check.")
    else:
        values = df[outlier_column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        upper_fence = q3 + 1.5 * iqr
        lower_fence = q1 - 1.5 * iqr
        outliers = df[(values < lower_fence) | (values > upper_fence)]
        print(
            f"\nNumber of outliers in '{outlier_column}' column (using IQR): "
            f"{len(outliers)}"
        )
        print("Outlier examples (first 5 rows):\n", outliers.head())

    # Calculate descriptive statistics
    print("\n--- Descriptive Statistics ---")
    print(df.describe(include='all'))

    # Analyze data distributions (for numerical columns)
    print("\n--- Data Distributions (Numerical Columns) ---")
    for col in df.select_dtypes(include=np.number).columns:
        print(f"\nDistribution for {col}:")
        # sort=False keeps the bins in ascending interval order rather than
        # ordering them by frequency.
        print(df[col].value_counts(bins=10, sort=False))

    # Analyze data distributions (for categorical columns)
    print("\n--- Data Distributions (Categorical Columns) ---")
    for col in df.select_dtypes(include='object').columns:
        print(f"\nDistribution for {col}:")
        print(df[col].value_counts())


# Driver code lives outside the function: when it was nested inside, every
# call re-read the CSV, discarded the caller's DataFrame and recursed forever.
if __name__ == "__main__":
    file_path = "C:\\Users\\ajroy\\OneDrive\\Desktop\\credit\\creditcard_2023.csv"
    df = pd.read_csv(file_path)
    perform_data_quality_assessment(df)


