In [50]:

import pandas as pd


In [52]:
import os
# Show the kernel's working directory: the relative "Downloads/1-project/*.csv"
# glob pattern used by the loading cells is resolved against it.
os.getcwd()


'C:\\Users\\cex'

In [64]:
import pandas as pd
import glob

# Collect every CSV in the project folder (the pattern is relative to the
# current working directory).
csv_files = glob.glob("Downloads/1-project/*.csv")

# Report the match count, then list the matched paths one per line.
print("Number of CSV files found:", len(csv_files))
if csv_files:
    print("\n".join(csv_files))


Number of CSV files found: 6
Downloads/1-project\Brasilia_Air_Quality.csv
Downloads/1-project\Cairo_Air_Quality.csv
Downloads/1-project\Dubai_Air_Quality.csv
Downloads/1-project\London_Air_Quality.csv
Downloads/1-project\New_York_Air_Quality.csv
Downloads/1-project\Sydney_Air_Quality.csv


In [66]:
# Load every per-city CSV and stack them into a single baseline frame.
# sorted() makes the row order reproducible: glob returns matches in
# arbitrary, platform-dependent order.
csv_files = sorted(glob.glob("Downloads/1-project/*.csv"))

# Fail early with an actionable message; pd.concat on an empty list would
# otherwise raise an opaque "No objects to concatenate" error.
if not csv_files:
    raise FileNotFoundError(
        f"No CSV files matched 'Downloads/1-project/*.csv' (cwd: {os.getcwd()})"
    )

dfs = []

for file in csv_files:
    df = pd.read_csv(file)

    # Extract city name from filename, e.g. "Cairo_Air_Quality.csv" -> "Cairo"
    # NOTE(review): the displayed head suggests the files already carry a
    # "City" column, so this "city" column may be redundant -- confirm.
    city_name = os.path.basename(file).replace("_Air_Quality.csv", "")
    df["city"] = city_name

    dfs.append(df)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
baseline_df = pd.concat(dfs, ignore_index=True)

baseline_df.head()


Unnamed: 0.1,Unnamed: 0,Date,City,CO,NO2,SO2,O3,PM2.5,PM10,AQI,city
0,0,2025-01-01 00:00:00+00:00,Brasilia,325.0,21.1,2.5,35.0,15.4,15.6,20.483337,Brasilia
1,1,2025-01-01 01:00:00+00:00,Brasilia,369.0,20.8,2.7,35.0,15.1,15.3,20.425,Brasilia
2,2,2025-01-01 02:00:00+00:00,Brasilia,419.0,20.4,3.0,34.0,15.6,15.8,20.333332,Brasilia
3,3,2025-01-01 03:00:00+00:00,Brasilia,451.0,20.5,3.1,33.0,16.4,16.6,20.258335,Brasilia
4,4,2025-01-01 04:00:00+00:00,Brasilia,458.0,22.1,3.0,29.0,17.7,17.8,20.316668,Brasilia


In [68]:
# Inspect the column dtypes of the combined frame. In the recorded output the
# Date column shows as object (string), i.e. it was not parsed as datetime.
baseline_df.dtypes


Unnamed: 0      int64
Date           object
City           object
CO            float64
NO2           float64
SO2           float64
O3            float64
PM2.5         float64
PM10          float64
AQI           float64
city           object
dtype: object

In [70]:
# Build the baseline profile in one pass: schema, missing-value rates, and
# per-column statistics for numeric and categorical columns.

# Numeric statistics cover int64/float64 columns only.
numeric_cols = baseline_df.select_dtypes(include=["int64", "float64"]).columns

# Categorical statistics cover object-dtype columns.
categorical_cols = baseline_df.select_dtypes(include=["object"]).columns

baseline_profile = {
    # 1. Schema profile: column names plus dtypes rendered as strings
    "schema": {
        "columns": list(baseline_df.columns),
        "dtypes": baseline_df.dtypes.astype(str).to_dict(),
    },
    # 2. Missing value profile: percentage of null entries per column
    "missing_values": (baseline_df.isnull().mean() * 100).to_dict(),
    # 3. Numeric statistics: range, mean and spread per numeric column
    "numeric_stats": {
        name: {
            "min": baseline_df[name].min(),
            "max": baseline_df[name].max(),
            "mean": baseline_df[name].mean(),
            "std": baseline_df[name].std(),
        }
        for name in numeric_cols
    },
    # 4. Categorical statistics: cardinality and the five most frequent values
    "categorical_stats": {
        name: {
            "unique_values": baseline_df[name].nunique(),
            "top_values": baseline_df[name].value_counts().head(5).to_dict(),
        }
        for name in categorical_cols
    },
}

baseline_profile


{'schema': {'columns': ['Unnamed: 0',
   'Date',
   'City',
   'CO',
   'NO2',
   'SO2',
   'O3',
   'PM2.5',
   'PM10',
   'AQI',
   'city'],
  'dtypes': {'Unnamed: 0': 'int64',
   'Date': 'object',
   'City': 'object',
   'CO': 'float64',
   'NO2': 'float64',
   'SO2': 'float64',
   'O3': 'float64',
   'PM2.5': 'float64',
   'PM10': 'float64',
   'AQI': 'float64',
   'city': 'object'}},
 'missing_values': {'Unnamed: 0': 0.0,
  'Date': 0.0,
  'City': 0.0,
  'CO': 0.0,
  'NO2': 0.0,
  'SO2': 0.0,
  'O3': 0.0,
  'PM2.5': 0.0,
  'PM10': 0.0,
  'AQI': 0.0,
  'city': 0.0},
 'numeric_stats': {'Unnamed: 0': {'min': 0,
   'max': 8759,
   'mean': 4379.5,
   'std': 2528.818219178003},
  'CO': {'min': 52.0,
   'max': 1621.0,
   'mean': 245.2697108066971,
   'std': 146.9265879965551},
  'NO2': {'min': 0.0,
   'max': 127.1,
   'mean': 24.12986301369863,
   'std': 19.32385324882059},
  'SO2': {'min': 0.3,
   'max': 478.0,
   'mean': 15.485135083713852,
   'std': 24.794610814311742},
  'O3': {'min':

In [72]:
# Imported in this cell because it runs before the notebook's later
# `import json` cell; without it a fresh-kernel "Run All" fails with NameError.
import json


def _to_builtin(value):
    """Convert numpy scalars to plain Python values for JSON serialization.

    json.dump calls this only for objects it cannot serialize on its own,
    e.g. the numpy integer returned by .min()/.max() on an int64 column.
    Raises TypeError for anything that has no ``.item()`` conversion.
    """
    if hasattr(value, "item"):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Persist the baseline so later runs can compare new data against it.
with open("baseline_profile.json", "w") as f:
    json.dump(baseline_profile, f, indent=4, default=_to_builtin)

print("Baseline profile saved successfully.")


Baseline profile saved successfully.


In [74]:
# Simulate new incoming data
new_df = baseline_df.copy()

# Introduce silent failures intentionally
new_df.loc[new_df.sample(frac=0.1).index, "PM2.5"] *= 2  # statistical drift
new_df.loc[new_df.sample(frac=0.15).index, "PM10"] = None  # missing spike

new_df.head()


Unnamed: 0.1,Unnamed: 0,Date,City,CO,NO2,SO2,O3,PM2.5,PM10,AQI,city
0,0,2025-01-01 00:00:00+00:00,Brasilia,325.0,21.1,2.5,35.0,15.4,15.6,20.483337,Brasilia
1,1,2025-01-01 01:00:00+00:00,Brasilia,369.0,20.8,2.7,35.0,15.1,15.3,20.425,Brasilia
2,2,2025-01-01 02:00:00+00:00,Brasilia,419.0,20.4,3.0,34.0,15.6,15.8,20.333332,Brasilia
3,3,2025-01-01 03:00:00+00:00,Brasilia,451.0,20.5,3.1,33.0,16.4,16.6,20.258335,Brasilia
4,4,2025-01-01 04:00:00+00:00,Brasilia,458.0,22.1,3.0,29.0,17.7,,20.316668,Brasilia


In [76]:
def profile_data(df):
    """Summarise a DataFrame's schema, missing-value rates and numeric statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        The frame to profile.

    Returns
    -------
    dict
        A dict with three keys, in this order:
        - "schema": the column names and their dtypes rendered as strings
        - "missing_values": percentage of null entries per column
        - "numeric_stats": mean and std for each int64/float64 column
    """
    # Only int64/float64 columns receive mean/std statistics.
    numeric_frame = df.select_dtypes(include=["int64", "float64"])

    return {
        "schema": {
            "columns": list(df.columns),
            "dtypes": df.dtypes.astype(str).to_dict(),
        },
        "missing_values": (df.isnull().mean() * 100).to_dict(),
        "numeric_stats": {
            name: {"mean": df[name].mean(), "std": df[name].std()}
            for name in numeric_frame.columns
        },
    }


# Profile the simulated incoming data; the drift checks compare this profile
# against the baseline profile.
new_profile = profile_data(new_df)


In [78]:
import json

# Reload the persisted baseline so the drift checks run against what was
# actually written to disk (plain JSON types) rather than the in-memory dict.
with open("baseline_profile.json", "r") as profile_file:
    baseline_profile = json.load(profile_file)


In [80]:
# Thresholds for the drift checks below.
MISSING_PCT_TOLERANCE = 10    # allowed increase in missing values, in percentage points
MEAN_SHIFT_TOLERANCE = 0.3    # allowed relative change of a column mean (30%)

drift_report = {
    "schema_drift": [],
    "missing_value_drift": [],
    "statistical_drift": []
}

# 1. Schema drift: columns that disappeared from, or were added to, the new data
baseline_cols = set(baseline_profile["schema"]["columns"])
new_cols = set(new_profile["schema"]["columns"])

missing_cols = baseline_cols - new_cols
extra_cols = new_cols - baseline_cols

if missing_cols:
    drift_report["schema_drift"].append(f"Missing columns: {missing_cols}")
if extra_cols:
    drift_report["schema_drift"].append(f"New columns detected: {extra_cols}")

# 2. Missing value drift
for col, baseline_missing in baseline_profile["missing_values"].items():
    # A column absent from the new data defaults to 0% here; its absence is
    # already reported by the schema check above.
    new_missing = new_profile["missing_values"].get(col, 0)

    if new_missing > baseline_missing + MISSING_PCT_TOLERANCE:
        drift_report["missing_value_drift"].append(
            f"{col}: missing increased from {baseline_missing:.1f}% to {new_missing:.1f}%"
        )

# 3. Statistical drift (mean shift)
for col, stats in baseline_profile["numeric_stats"].items():
    baseline_mean = stats["mean"]
    new_mean = new_profile["numeric_stats"].get(col, {}).get("mean")

    # Compare against None explicitly: a truthiness test would also skip a
    # column whose new mean is exactly 0.0, hiding a collapse to zero.
    if new_mean is None:
        continue

    if abs(new_mean - baseline_mean) > MEAN_SHIFT_TOLERANCE * abs(baseline_mean):
        drift_report["statistical_drift"].append(
            f"{col}: mean shifted from {baseline_mean:.2f} to {new_mean:.2f}"
        )

drift_report


{'schema_drift': [],
 'missing_value_drift': ['PM10: missing increased from 0.0% to 15.0%'],
 'statistical_drift': []}

In [82]:
def classify_severity(drift_report):
    """Map a drift report to a severity label.

    Parameters
    ----------
    drift_report : dict
        Must contain the keys "schema_drift", "missing_value_drift" and
        "statistical_drift", each holding a collection of findings.

    Returns
    -------
    str
        "CRITICAL" when any schema drift is present; otherwise "WARNING" when
        any missing-value or statistical drift is present; otherwise "INFO".
    """
    # Schema drift outranks every other finding.
    if drift_report["schema_drift"]:
        return "CRITICAL"

    if drift_report["missing_value_drift"] or drift_report["statistical_drift"]:
        return "WARNING"

    return "INFO"


severity_level = classify_severity(drift_report)

# Emit the banner, the raw report and the severity line, each on its own line;
# the leading "\n" in the last item leaves a blank line before the severity.
print(
    "=== DRIFT REPORT ===",
    drift_report,
    f"\nSEVERITY LEVEL: {severity_level}",
    sep="\n",
)


=== DRIFT REPORT ===
{'schema_drift': [], 'missing_value_drift': ['PM10: missing increased from 0.0% to 15.0%'], 'statistical_drift': []}

