In [None]:
# Filling in the demo_notebook.ipynb with sample data and basic usage of DataProfiler

from nbformat import v4 as nbf

# Running list of cell objects; every later cell of this script appends to it
# in the order the cells should appear in the generated notebook.
cells = list()

# Empty v4 notebook object.
# NOTE(review): `notebook` is not referenced again in the visible cells - the
# write step builds its own notebook object - confirm whether it is still needed.
notebook = nbf.new_notebook()

In [None]:
# 1. Introduction
# Markdown cell holding the demo's title and a one-sentence description.
cells.append(
    nbf.new_markdown_cell(
        source="# DataQualityChecker Demo\nThis notebook demonstrates how to use the DataQualityChecker library."
    )
)

In [None]:
# 2. Install necessary packages
# The generated cell uses the `%pip` magic instead of `!pip`: `!pip` runs
# whichever pip executable is first on the shell PATH, which may belong to a
# different environment than the running kernel, whereas `%pip` installs into
# the kernel's own environment.
cells.append(nbf.new_code_cell("%pip install pandas seaborn scikit-learn"))

In [None]:
# 3. Import libraries
# Code cell importing pandas, seaborn and the three DataQualityChecker entry
# points (DataProfiler, detect_outliers_iqr, check_data_leakage).
# The closing triple quote sits on its own line so the cell source keeps its
# trailing newline.
cells.append(
    nbf.new_code_cell(
        source="""import pandas as pd
import seaborn as sns
from dataqualitychecker.profiler import DataProfiler
from dataqualitychecker.outliers import detect_outliers_iqr
from dataqualitychecker.leakage import check_data_leakage
"""
    )
)

In [None]:
# 4. Load dataset
# Code cell that loads seaborn's 'titanic' sample into `df` and ends with
# `df.head()` so the first rows are displayed.
cells.append(
    nbf.new_code_cell(
        source="""# Load sample Titanic dataset
df = sns.load_dataset('titanic')
df.head()"""
    )
)

In [None]:
# 5. DataProfiler usage
# Code cell that wraps `df` in a DataProfiler and prints the results of
# check_missing(), check_duplicates() and check_types().
# The doubled backslashes (\\n) keep a literal "\n" escape inside the generated
# cell's print() calls instead of inserting a real line break into its source.
cells.append(
    nbf.new_code_cell(
        source="""# Initialize the profiler
profiler = DataProfiler(df)

# Check missing values
print("Missing Values:")
print(profiler.check_missing())

# Check duplicate rows
print("\\nDuplicate Rows:")
print(profiler.check_duplicates())

# Check data types
print("\\nData Types:")
print(profiler.check_types())"""
    )
)


In [None]:
# 6. Check variance and skewness
# Code cell that prints the results of profiler.check_variance() and
# profiler.check_skewness(); it reuses the `profiler` object created by the
# previously generated cell.
cells.append(
    nbf.new_code_cell(
        source="""# Variance and skewness (only for numeric columns)
print("\\nLow Variance Columns:")
print(profiler.check_variance())

print("\\nHighly Skewed Columns:")
print(profiler.check_skewness())"""
    )
)

In [None]:
# 7. Outlier detection example
# Code cell that drops rows with a missing 'age', passes the rest to
# detect_outliers_iqr for the 'age' column, and ends by selecting the 'age'
# column of the result for display.
cells.append(
    nbf.new_code_cell(
        source="""# Outlier detection using IQR method on 'age' column
outliers = detect_outliers_iqr(df.dropna(subset=['age']), 'age')
print("\\nDetected Outliers in Age Column:")
outliers[['age']]"""
    )
)

In [None]:
# 8. Correlation leakage detection
# Code cell that keeps only numeric columns, drops rows containing missing
# values, and - when a 'survived' column remains - prints the result of
# check_data_leakage with 'survived' as the target; otherwise it prints a
# fallback message.
cells.append(
    nbf.new_code_cell(
        source="""# Correlation-based leakage detection (only numerical & clean data)
leakage_df = df.select_dtypes(include='number').dropna()
if 'survived' in leakage_df.columns:
    print("\\nPossible Leakage Columns:")
    print(check_data_leakage(leakage_df, target='survived'))
else:
    print("Target column 'survived' not found or not numeric.")"""
    )
)

In [None]:
import nbformat

# Destination of the generated notebook.
# `demo_path` is not assigned anywhere in the visible cells, so on a fresh
# kernel the write below failed with NameError. Keep a value that already
# exists in the session; otherwise default to "demo_notebook.ipynb" in the
# current working directory.
demo_path = globals().get("demo_path", "demo_notebook.ipynb")

# Create a new notebook using v4 format and attach the cells collected above
nb = nbformat.v4.new_notebook()
nb['cells'] = cells

# Write the notebook to the file system.
# The encoding is set explicitly so the file is written as UTF-8 regardless of
# the platform's default text encoding.
with open(demo_path, "w", encoding="utf-8") as f:
    nbformat.write(nb, f)

# Last expression of the cell: display the path that was written
demo_path