### EDA Automation Notebook
This Jupyter notebook is a template for all my EDA projects. It contains various checks that I commonly use in my projects. Since it is an automation template, it does not contain any test data. It covers the following steps:
- Step 1: Loading the data
- Step 2: Checking the basic data information
- Step 3: Data quality assessment
- Step 4: Summary statistics
- Step 5: Correlations and outliers
- Step 6: Creation of report output

A separate Python file will print out the PDF report instead of this notebook, since this notebook is used for data exploration only and not for answering questions. I used ChatGPT to help document the code here and to identify better, more efficient implementations.

TODO: Update the code so that all items are written to the txt/pdf file. For now, some outputs are shown in the notebook.

In [9]:
#Import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import os
from IPython.display import display
import platform
import subprocess
import datetime
import textdistance
from PyPDF2 import PdfMerger
from fpdf import FPDF
import pandas as pd

##### Step 1: Loading the data

In [None]:
def load_data(file_path, sheet_name=None, encoding="utf-8"):
    """
    Load a CSV or Excel file into a pandas DataFrame.

    Args:
        file_path (str): Path to a .csv, .xls or .xlsx file.
        sheet_name (str | int | None): Excel sheet to read. Any falsy value
            (None, "", 0) selects the first sheet. Ignored for CSV files.
        encoding (str): Encoding tried first for CSV files. If decoding
            fails, the file is read again as ISO-8859-1.

    Returns:
        pandas.DataFrame: The loaded data.

    Raises:
        FileNotFoundError: If file_path is blank/None or does not exist.
        RuntimeError: If the file type is unsupported or reading fails. The
            underlying exception is attached as ``__cause__``.
    """
    # Truthiness check also covers None, which len() would reject with TypeError.
    if not file_path:
        raise FileNotFoundError("File path cannot be blank.")
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"File not found: {file_path}")

    ext_type = os.path.splitext(file_path)[-1].lower()

    try:
        if ext_type == ".csv":
            try:
                df = pd.read_csv(file_path, encoding=encoding)
            except UnicodeDecodeError:
                # Name the encoding that actually failed rather than assuming UTF-8.
                print(f"{(encoding or 'utf-8').upper()} failed, trying ISO-8859-1...")
                df = pd.read_csv(file_path, encoding="ISO-8859-1")
        elif ext_type in (".xls", ".xlsx"):
            df = pd.read_excel(file_path, sheet_name=sheet_name if sheet_name else 0)
        else:
            raise ValueError("Unsupported file type. Only CSV and Excel files are supported.")
    except Exception as exc:
        # Keep the RuntimeError contract but chain the cause so the original
        # traceback and exception type stay available to callers.
        raise RuntimeError(f"Error reading file: {exc}") from exc

    print(f"Data loaded successfully from {file_path}")
    return df

In [6]:
# Interactive data loading: prompts for a file path (any location) and an
# optional Excel sheet name, then loads the file into `df`.
if __name__ == "__main__":
    file_path = input("Enter file path (e.g., data.csv or C:/folder/data.csv): ").strip()
    sheet_name_input = input("Optional: Enter Excel sheet name (press ENTER if not applicable): ").strip()
    # An empty answer means "no sheet given"; load_data then reads the first sheet.
    sheet_name = sheet_name_input if sheet_name_input else None

    try:
        df = load_data(file_path, sheet_name)
    except (FileNotFoundError, RuntimeError) as e:
        # load_data signals failures with these two exception types; include
        # the reason so the user can tell a bad path from an unreadable file.
        print(f"Error loading data from {file_path}: {e}")

UTF-8 failed, trying ISO-8859-1...
✅ Data loaded successfully from data.csv


##### Step 2: Checking the basic data information

In [13]:
def _collect_basic_info_lines(df, preview_rows, file_path):
    """Build the text lines of the basic data summary for `df`."""
    numeric_cols = df.select_dtypes(include=['number']).shape[1]
    non_numeric_cols = df.select_dtypes(exclude=['number']).shape[1]

    lines = []

    def out(line):
        lines.append(str(line))

    def out_df(sub_df):
        # to_string() renders the full frame as text, without truncation.
        lines.append(sub_df.to_string())

    out("=" * 110)
    out("Basic Data Information:")
    out(f"There are {df.shape[0]:,} rows and {df.shape[1]:,} columns in the data.")
    out("Here are the data types per column:")
    out_df(df.dtypes.to_frame('Column data types'))

    out(f"\nThere are {numeric_cols} numeric columns and {non_numeric_cols} non-numeric columns.")

    out("\n" + "-" * 110 + "\nMissing Values:")
    mv = df.isnull().sum()
    mv = mv[mv > 0].sort_values(ascending=False)
    if not mv.empty:
        out_df(mv.to_frame('count_of_missing_values'))
    else:
        out("No missing values in the dataset.")

    out("\n" + "-" * 110 + "\nDescriptive Statistics:")
    out_df(df.describe(include='all').T)

    out("\n" + "-" * 110 + "\nUnique items per column:")
    c_uniques = df.nunique().sort_values(ascending=False)
    out_df(c_uniques.to_frame('unique_values'))

    if preview_rows:
        out(f"\nData preview: first {preview_rows} rows")
        out_df(df.head(preview_rows))

    out("\n" + "=" * 110)
    out(f"End of basic data summary for {file_path}")
    out("=" * 110)
    return lines


def basic_data_information(df, preview_rows=5, file_path=""):
    """
    Generates a PDF summary of basic data information, including:
    - Shape and data types
    - Missing values
    - Descriptive statistics
    - Unique values per column
    - Data preview

    Args:
        df (pandas.DataFrame): Data to summarise.
        preview_rows (int): Number of leading rows to include in the preview.
            A falsy value (0/None) skips the preview section.
        file_path (str): Label of the data source. It is printed in the
            summary footer and used as the prefix of the output file name.

    Saves the output as '<file_path>_eda_automation_step2.pdf', or as
    'eda_automation_step2.pdf' when file_path is empty.
    """
    # NOTE: these are global pandas display options and stay in effect after
    # this call returns.
    pd.set_option('display.max_columns', 100)
    pd.set_option('display.max_rows', 100)
    pd.set_option('display.max_colwidth', None)

    file_lines = _collect_basic_info_lines(df, preview_rows, file_path)

    pdf = FPDF("L", "mm", "A4")
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.add_page()
    pdf.set_font("Courier", size=8)

    for line in file_lines:
        for subline in line.split("\n"):
            # The built-in Courier font only covers Latin-1; replace anything
            # else with '?' so non-Latin text in the data cannot abort the report.
            printable = subline.encode("latin-1", errors="replace").decode("latin-1")
            # Start every line at the left margin. Some FPDF releases (e.g.
            # recent fpdf2) leave the cursor at the right edge after
            # multi_cell, which leaves no room for the next full-width cell.
            pdf.set_x(pdf.l_margin)
            pdf.multi_cell(0, 5, printable)

    # Avoid a stray leading underscore when no file_path label is given.
    if file_path:
        pdf_path = f"{file_path}_eda_automation_step2.pdf"
    else:
        pdf_path = "eda_automation_step2.pdf"
    pdf.output(pdf_path)
    print(f"✅ PDF saved as '{pdf_path}'")


In [14]:
# Run Step 2 on the loaded DataFrame; `file_path` is reused as the prefix of the output PDF name.
basic_data_information(df, preview_rows=5, file_path=file_path)

✅ PDF saved as 'data.csv_eda_automation_step2.pdf'


##### Step 3: Data quality assessment

Data quality assessment offers a deep dive into the quality of the data. While some methods are similar to those in Step 2, Step 3 performs a deeper quality check to ensure that the data is 'ready' for further analysis.

This step also adds functionality that writes the output to a PDF instead of the console, making it easier to review the results for large datasets.

In [None]:

# def data_quality_deep_dive(df, preview_rows=5, size_threshold=300, force_mode="container", file_prefix="quality_deep_dive_summary", file_path="data"):
#     mode = force_mode
#     output_lines = []

#     def add_output(line):
#         if mode == "print":
#             print(line)
#         else:
#             output_lines.append(str(line))
    
#     def add_df_output(sub_df, title):
#         if mode == "print":
#             print("\n" + "=" * 100 + f"\n{title}: \n")
#             print(sub_df)
#         else:
#             output_lines.append(f"\n\n{title}: \n")
#             output_lines.append(sub_df.to_string())

#     add_output("=" * 110)
#     add_output(f"DATA QUALITY ASSESSMENT FOR {file_path} with {df.shape[0]:,} rows x {df.shape[1]:,} columns")
#     add_output("=" * 110)

#     title_items = ['Duplicate Rows', 
#                 'Mixed Data Types', 
#                 'Capitalization Inconsistencies',
#                 'Unexpected/Unknown Values',
#                 'Numeric Outliers or Skew Checks',
#                 'Data Preview'
#                 ]

#     for index, title in enumerate(title_items, 1):
#         add_output("\n" + "-" * 110 + "\n" + f"[{index}] {title}:" + "\n" + "-" * 110 + "\n")
        
#         if index == 1:  # Duplicate Rows
#             duplicate_rows = df[df.duplicated()]
#             if not duplicate_rows.empty:
#                 add_output(f"There are {len(duplicate_rows):,} duplicate rows in the dataset.\n")
#                 duplicate_indices = duplicate_rows.index.tolist()
#                 add_output("Duplicate row indices (showing up to 100):")
#                 add_output(str(duplicate_indices[:100]))
#                 if len(duplicate_rows) <= size_threshold:
#                     add_output(duplicate_rows.to_string(index=False))
#                 else:
#                     add_output("Too many duplicates to display; showing top 5:")
#                     add_output(duplicate_rows.head(5).to_string(index=False))
#             else:
#                 add_output("No exact duplicate rows found.")

#         elif index == 2: #Mixed data types
#             mixed_data_types_issues = {}

#             for col in df.columns:
#                 mixed_types = df[col].dropna().map(type).value_counts()
#                 if len(mixed_types) > 1:
#                     mixed_data_types_issues[col] = mixed_types

#             if mixed_data_types_issues:
#                 add_output(f"Found {len(mixed_data_types_issues)} columns with mixed data types:\n")
#                 for col, types in mixed_data_types_issues.items():
#                     add_output(f"- Column '{col}' has multiple data types:")
#                     for t, count in types.items():
#                         add_output(f"    • {t.__name__}: {count:,} values")
#             else:
#                 add_output("No mixed data types found in all columns.")
                    
#         elif index == 3:  # Capitalization Inconsistencies
#             obj_cols = df.select_dtypes(include='object').columns
#             cap_issues_found = False
#             for col in obj_cols:
#                 vals = df[col].dropna().unique()
#                 groups = {}
#                 for v in vals:
#                     key = v.lower()
#                     groups.setdefault(key, set()).add(v)
#                 for key, variations in groups.items():
#                     if len(variations) > 1:
#                         cap_issues_found = True
#                         add_output(f"Column '{col}', has variations for '{key}': {variations}")
#             if not cap_issues_found:
#                 add_output("No capitalization inconsistencies found.")

#         elif index == 4:  # Unexpected/Unknown Values
#             known_placeholders = {'n/a', 'na', 'none', 'null', 'unknown', '-', '', 'not applicable'}
#             threshold = 0.01
            
#             rare_or_unknown_found = False
            
#             for col in df.select_dtypes(include='object'):
#                 val_counts = df[col].dropna().value_counts(normalize=True)
#                 to_flag = val_counts[val_counts < threshold].index.tolist()
                
#                 placeholder_hits = [v for v in df[col].dropna().unique()
#                                     if str(v).strip().lower() in known_placeholders]
                
#                 if to_flag or placeholder_hits:
#                     rare_or_unknown_found = True
#                     add_output(f"\nColumn: '{col}'")
#                     if placeholder_hits:
#                         add_output(f" - Found placeholder-like values: {placeholder_hits}")
#                     if to_flag:
#                         add_output(f" - Rare values (<1% occurrence): {to_flag[:5]}{'...' if len(to_flag) > 5 else ''}")
            
#             if not rare_or_unknown_found:
#                 add_output("No values flagged.")

#         elif index == 5:  # Numeric Outliers or Skew Checks
#             numeric_cols = df.select_dtypes(include='number').columns
#             if len(numeric_cols) == 0:
#                 add_output("No numeric columns found.")
#             else:
#                 for col in numeric_cols:
#                     series = df[col].dropna()
#                     if len(series) == 0:
#                         continue

#                     q1 = series.quantile(0.25)
#                     q3 = series.quantile(0.75)
#                     iqr = q3 - q1
#                     lower = q1 - 1.5 * iqr
#                     upper = q3 + 1.5 * iqr
#                     outliers = series[(series < lower) | (series > upper)]

#                     add_output(f"\nColumn: {col}")
#                     add_output(f" - Min: {series.min():,.2f}, Max: {series.max():,.2f}, Mean: {series.mean():,.2f}, Std: {series.std():,.2f}")
#                     add_output(f" - Outliers (IQR rule): {len(outliers):,}")

#                     if len(outliers) > 0 and len(outliers) <= size_threshold:
#                         add_output(f"Sample outlier values: {outliers.unique()[:5].tolist()}")
#                     elif len(outliers) > size_threshold:
#                         add_output("Too many outliers to display.")

#         elif index == 6:  # Data Preview
#             add_output(f"Previewing the first {preview_rows} rows of the dataset:\n")
#             if preview_rows > 0:
#                 preview = df.head(preview_rows).to_string(index=False)
#                 add_output(preview)
#             else:
#                 add_output("Preview  skipped because preview_rows is set to 0.")

#     return output_lines


In [None]:
# output_lines_cdd = data_quality_deep_dive(df, preview_rows=5, size_threshold=300, force_mode="file", file_path="my_data")

##### Step 4: Summary statistics

##### Step 5: Correlations and outliers

##### Step 6: Creation of report output

In [None]:
def merge_and_delete_eda_pdfs(output_filename="eda_report.pdf", file_path=""):
    """
    Merges the step PDFs ('eda_automation_step1.pdf' through
    'eda_automation_step4.pdf') into a single PDF, and then deletes
    the source PDF files that were merged.

    Missing step files are skipped with a warning. Errors raised while
    reading, merging or writing are reported on stdout and no source file
    is deleted in that case.

    Args:
        output_filename (str): The desired name for the merged PDF file.
                               Defaults to "eda_report.pdf".
        file_path (str): Optional prefix of the step files. When given, the
                         sources are looked up as
                         '<file_path>_eda_automation_stepN.pdf', which is the
                         naming used by basic_data_information. Defaults to ""
                         (un-prefixed names).
    """
    step_names = [f"eda_automation_step{step}.pdf" for step in range(1, 5)]
    if file_path:
        pdf_files = [f"{file_path}_{name}" for name in step_names]
    else:
        pdf_files = step_names

    merger = PdfMerger()
    # Files that were actually appended (only those are deleted afterwards).
    appended_files = []
    merged_ok = False

    try:
        print("Attempting to merge the following PDF files:")
        for pdf_file in pdf_files:
            if os.path.exists(pdf_file):
                print(f"- {pdf_file}")
                # Appending parses the PDF and can fail on a corrupt file, so
                # it runs inside the try to share the error handling below.
                merger.append(pdf_file)
                appended_files.append(pdf_file)
            else:
                print(f"Warning: '{pdf_file}' not found. Skipping.")

        if appended_files:  # Only write if there's content to merge
            with open(output_filename, "wb") as output_file:
                merger.write(output_file)
            print(f"\nSuccessfully merged PDFs into '{output_filename}'")
            merged_ok = True
        else:
            print("No PDF files were found to merge. No output file created.")
    except Exception as e:
        print(f"\nAn error occurred during the merging process: {e}")
    finally:
        merger.close()

    if merged_ok:
        # Delete only after the merger is closed: some PyPDF2 versions keep
        # the source files open until close(), which blocks removal on Windows.
        print("Attempting to delete source PDF files:")
        for pdf_file in appended_files:
            try:
                os.remove(pdf_file)
                print(f"Successfully deleted '{pdf_file}'")
            except OSError as e:
                print(f"Error deleting '{pdf_file}': {e}")

# When executed as a script, merge the step PDFs using the default output name.
if __name__ == "__main__":
    merge_and_delete_eda_pdfs()