# Data Quality Checks: Completeness, Uniqueness, Validity

**Note:** The column `Index (Row number)` is used only as a reference and is excluded from all checks.

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame. The path is relative, so it is resolved
# against the current working directory of the notebook kernel.
file_path = 'Planview Data.csv'
df = pd.read_csv(file_path)
# Trailing expression: shows the first rows as a quick sanity check of the load.
df.head()

## Task 1: Completeness Check (Excludes 'Index (Row number)')

In [None]:
# Every column except the row-reference column takes part in the quality checks.
columns_to_check = df.columns.drop("Index (Row number)")

# Reshape to long format: one output row per (source row, checked column) pair.
# melt() is called on df directly; an extra reset_index() would only inject an
# unused "index" column and fails outright if the file already has a column
# with that name.
df_long = df.melt(
    id_vars=["Index (Row number)"],
    value_vars=list(columns_to_check),
    var_name="Column Name",
    value_name="Value",
)

# A cell is complete (1) when it is neither missing nor whitespace-only; else 0.
is_present = df_long["Value"].notna()
is_non_blank = df_long["Value"].astype(str).str.strip() != ""
df_long["Complete"] = (is_present & is_non_blank).astype(int)

# Base output structure. "Unique" and "Validity" start as None and are filled in
# only for the columns the later checks apply to.
df_output = df_long[["Column Name", "Index (Row number)", "Complete"]].copy()
df_output["Unique"] = None
df_output["Validity"] = None

## Task 2: Uniqueness Check

In [None]:
# Columns whose values are checked for repeats across rows.
uniqueness_columns = ["Work_ID", "JA_Code_Task_ID", "semblance_ID", "CRI_ID", "CRI_ID.1", "CRI_ID.2"]
non_unique_entries = []

for col in uniqueness_columns:
    # Columns absent from this file are skipped; their "Unique" cells stay None.
    if col not in df.columns:
        continue

    col_values = df[col]
    # keep=False flags every member of a repeated group, not just the later
    # occurrences. Missing values compare equal to each other here, so two or
    # more missing cells are reported as non-unique.
    is_repeated = col_values.duplicated(keep=False)

    # df_output was built by melting df, so the rows for a given column appear
    # in the same order as in df and the flags can be assigned positionally.
    mask = df_output["Column Name"] == col
    df_output.loc[mask, "Unique"] = (~is_repeated).astype(int).values

    # Collect the repeated entries for the separate non-unique report.
    non_unique_df = df.loc[is_repeated, ["Index (Row number)", col]].copy()
    non_unique_df["Column Name"] = col
    non_unique_df.rename(columns={col: "Value"}, inplace=True)
    non_unique_entries.append(non_unique_df)

# pd.concat raises ValueError on an empty list, which happens when none of the
# columns above exist in the file; fall back to an empty frame with the same
# column layout so later cells still work.
if non_unique_entries:
    non_unique_results = pd.concat(non_unique_entries, ignore_index=True)
else:
    non_unique_results = pd.DataFrame(columns=["Index (Row number)", "Value", "Column Name"])

## Task 3: Validity Check

In [None]:
# Allowed values for each kind of validated column.
valid_work_status = {"Open/Active", "Completed", "Cancelled", "Denied", "New/Requested"}
valid_complexity = {"Low", "Medium", "High"}
valid_colors = {"Amber", "Blue", "Red", "Green"}

# Columns at zero-based positions 35 to 40, selected by position rather than by
# name. These correspond to spreadsheet columns AJ to AO only if the file's
# first column is column A -- TODO confirm against the source export.
aj_to_ao_cols = df.columns[35:41]

# (column, allowed values) pairs, applied in order: the two named columns
# first, then the positional block, so a later rule overwrites an earlier one
# if a column happens to appear twice.
validity_rules = [
    ("Work_Status.1", valid_work_status),
    ("Complexity", valid_complexity),
]
validity_rules.extend((name, valid_colors) for name in aj_to_ao_cols)

for col, allowed_values in validity_rules:
    # Named columns may be missing from the file; skip them silently.
    if col not in df.columns:
        continue
    # 1 when the cell holds one of the allowed values, 0 otherwise (missing
    # cells therefore score 0).
    valid_mask = df[col].isin(allowed_values)
    df_output.loc[df_output["Column Name"] == col, "Validity"] = valid_mask.astype(int).values

# Normalise any missing marker in the unchecked rows to None.
validity = df_output["Validity"]
df_output["Validity"] = validity.where(validity.notna(), None)

## Split Output into 4 Parts

In [None]:
# Write df_output as four consecutive CSV parts. The first three parts hold
# total_rows // 4 rows each; the last part also takes the remainder.
total_rows = len(df_output)
split_size = total_rows // 4

# Slice boundaries: 0, s, 2s, 3s, total_rows.
boundaries = [k * split_size for k in range(4)]
boundaries.append(total_rows)

for i, (start_idx, end_idx) in enumerate(zip(boundaries, boundaries[1:])):
    part_df = df_output.iloc[start_idx:end_idx]
    part_df.to_csv(f"final_output_part_{i + 1}.csv", index=False)

## Save Non-Unique Entries (Optional)

In [None]:
# Write the collected non-unique entries to a CSV in the working directory,
# omitting the DataFrame's positional index from the file.
non_unique_results.to_csv("non_unique_entries.csv", index=False)