## Section 1 - Import Libraries

In [None]:
# Section 1: Import Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from google.colab import files
import io

## Section 2 - Import Data Set

In [None]:
# Section 2: Import Data Set

# Data loading function
def load_data_from_upload(uploaded=None):
  """Load an uploaded .csv or .json annotation file into a DataFrame.

  Only the first entry of the upload mapping is processed: every branch
  below either returns a DataFrame or raises.

  Args:
    uploaded: Optional mapping of file name -> raw file bytes. When omitted,
      the mapping is obtained by calling `files.upload()`.

  Returns:
    pandas.DataFrame. For a CSV file, whatever `pd.read_csv` parses. For a
    JSON file, one row per object with the columns frame_id, object_name,
    x_min, y_min, x_max, y_max, confidence and object_category.

  Raises:
    ValueError: If nothing was uploaded, or the file is neither .csv nor .json.
  """
  if uploaded is None:
    uploaded = files.upload()

  for file_name, content in uploaded.items():
    print(f"File uploaded: {file_name}")
    # Compare the extension case-insensitively so "DATA.CSV" is accepted too.
    lowered_name = file_name.lower()

    if lowered_name.endswith(".csv"):
      df = pd.read_csv(io.BytesIO(content))
      print(f"Loaded CSV file with shape {df.shape}")
      return df

    if lowered_name.endswith(".json"):
      raw_data = json.load(io.BytesIO(content))
      # Flatten the frame -> objects nesting into one record per object.
      records = [
        {
          "frame_id": frame["frame_id"],
          "object_name": obj["object_name"],
          "x_min": obj["bounding_box"][0],
          "y_min": obj["bounding_box"][1],
          "x_max": obj["bounding_box"][2],
          "y_max": obj["bounding_box"][3],
          # confidence and category are optional in the JSON input.
          "confidence": obj.get("confidence", None),
          "object_category": obj.get("category", None),
        }
        for frame in raw_data
        for obj in frame["objects"]
      ]
      df = pd.DataFrame(records)
      print(f"Loaded JSON file with shape {df.shape}")
      return df

    raise ValueError("Unsupported file format. Please upload a .csv or .json file.")

  # Reached only when the upload mapping is empty (e.g. the dialog was cancelled).
  raise ValueError("No file was uploaded.")

In [None]:
# Run the loader (it calls files.upload() to obtain the file) and keep the
# resulting DataFrame as `df` for the cells below.
df = load_data_from_upload()

Saving object_annotations_expanded.csv to object_annotations_expanded.csv
📁 File uploaded: object_annotations_expanded.csv
✅ Loaded CSV file with shape (154, 10)


## Section 3 - Schema Validation

In [None]:
# Section 3: Schema Validation

def validate_schema(df, expected_columns):
  """Print a schema report for `df` and return its findings.

  The DataFrame's column labels are lower-cased before being compared with
  `expected_columns`; the expected names are used exactly as given.

  Args:
    df: DataFrame to inspect.
    expected_columns: Iterable of column names the DataFrame should have.

  Returns:
    dict with:
      "missing_columns": sorted list of expected names not found in `df`.
      "unexpected_columns": sorted list of (lower-cased) names in `df` that
        were not expected.
      "missing_summary": Series of null counts per column (`df.isnull().sum()`).
      "dtype_summary": Series of column dtypes (`df.dtypes`).
  """
  # str() keeps this working for non-string labels, on which the Index `.str`
  # accessor would raise.
  actual_columns = {str(col).lower() for col in df.columns}
  expected_set = set(expected_columns)

  # Set iteration order is arbitrary; sort so the report and the returned
  # lists are the same on every run.
  missing_columns = sorted(expected_set - actual_columns, key=str)
  unexpected_columns = sorted(actual_columns - expected_set, key=str)

  missing_summary = df.isnull().sum()
  dtype_summary = df.dtypes

  print("Schema Validation Report")
  print("--------------------------------------------------")
  print(f"Total Columns Present: {len(df.columns)}")
  print(f"Total Rows: {df.shape[0]}\n")

  if missing_columns:
    print(f"Missing Columns: {missing_columns}")
  else:
    print("All expected columns are present.")

  if unexpected_columns:
    print(f"Unexpected Columns: {unexpected_columns}")
  else:
    print("No unexpected columns.")

  print("\nMissing Values:")
  if missing_summary.sum() > 0:
    # Show only the columns that actually contain nulls.
    print(missing_summary[missing_summary > 0])
  else:
    print("✅ No missing values.")

  print("\nData Types:")
  print(dtype_summary)

  return {
    "missing_columns": missing_columns,
    "unexpected_columns": unexpected_columns,
    "missing_summary": missing_summary,
    "dtype_summary": dtype_summary
  }

In [None]:
# Column names the annotation table is expected to have. validate_schema
# lower-cases the DataFrame's own labels before comparing, so these are
# written in lowercase.
expected_columns = [
    "frame_id",
    "object_name",
    "object_category",
    "x_min", "y_min", "x_max", "y_max",
    "confidence",
    "frame_type",
    "interaction_score",
]

# Keep the returned report so later cells can inspect it.
schema_report = validate_schema(df, expected_columns)

📋 Schema Validation Report
--------------------------------------------------
🧾 Total Columns Present: 10
📊 Total Rows: 154

All expected columns are present.
No unexpected columns.

Missing Values:
confidence    12
dtype: int64

Data Types:
frame_id              object
object_name           object
x_min                  int64
y_min                  int64
x_max                  int64
y_max                  int64
confidence           float64
object_category       object
frame_type            object
interaction_score    float64
dtype: object


## Section 4 - Data Cleanup

In [None]:
# Section 4: Data Cleanup

# Lookup from a normalized object name to its canonical label.
# Every label listed here currently maps to itself.
label_reference = {
    canonical: canonical
    for canonical in (
        "person",
        "laptop",
        "whiteboard",
        "smartphone",
        "coffee_mug",
        "tablet",
        "desk",
        "projector_screen",
    )
}

def normalize_object_name(name):
    """Return the canonical form of a single object name.

    Missing values (per `pd.isna`) come back as `np.nan`. Otherwise the name
    is stripped, lower-cased and has its spaces turned into underscores; if
    that key exists in `label_reference` the mapped label is returned, else
    the cleaned name itself.
    """
    if pd.isna(name):
        return np.nan

    trimmed = name.strip()
    key = trimmed.lower().replace(" ", "_")

    if key in label_reference:
        return label_reference[key]
    return key

def normalize_columns(df):
  """Normalize column labels and clean the label/confidence columns.

  The DataFrame is modified in place and the same object is returned.

  Steps:
    1. Column labels are stripped, lower-cased, and spaces become underscores.
    2. "object_name" values (if the column exists) go through
       `normalize_object_name`.
    3. "object_category" values (if the column exists) are stripped,
       lower-cased, and spaces become underscores.
    4. Missing "confidence" values (if the column exists) are filled with the
       column median, and a message reports how many were filled.

  Args:
    df: DataFrame to clean.

  Returns:
    The same DataFrame, after cleaning.
  """
  df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

  if "object_name" in df.columns:
    df["object_name"] = df["object_name"].apply(normalize_object_name)

  if "object_category" in df.columns:
    # `.str.replace` substitutes inside each string. Plain `Series.replace`
    # would only touch cells whose whole value is " ", leaving
    # "office supplies" unchanged.
    df["object_category"] = (
      df["object_category"]
      .str.strip()
      .str.lower()
      .str.replace(" ", "_", regex=False)
    )

  # Impute missing confidence scores with the column median.
  if "confidence" in df.columns:
    missing_count = df["confidence"].isnull().sum()
    if missing_count > 0:
      median_conf = df["confidence"].median()
      print(f"Filling {missing_count} missing confidence values with median: {median_conf:.2f}")
      df["confidence"] = df["confidence"].fillna(median_conf)

  return df