In [None]:
import numpy as np

# Given Dataset: one (roll number, Math, Science, English) tuple per record.
# None marks a missing score, and some roll numbers appear more than once.
data = [
    (101, 45, 78, None),
    (102, 65, 56, 77),
    (103, 95, 85, 92),
    (102, 65, 56, 77),
    (104, 45, None, 88),
    (101, 45, 78, None)
]

# Convert to NumPy Array (dtype=object to allow None)
arr = np.array(data, dtype=object)

# -------------------------------
# 1. Data Inspection
# -------------------------------

# Boolean mask of missing cells, built with an identity test so only real
# None entries count. It is computed on the raw data, so a missing value in a
# repeated record is counted once per occurrence.
missing_mask = np.array([[cell is None for cell in row] for row in arr], dtype=bool)

# Count missing values per subject (Math, Science, English)
math_missing, science_missing, english_missing = missing_mask[:, 1:].sum(axis=0)

# Detect duplicate roll numbers
roll_numbers = arr[:, 0]
unique_rolls, counts = np.unique(roll_numbers, return_counts=True)
duplicates = unique_rolls[counts > 1]

# -------------------------------
# 2. Data Cleaning
# -------------------------------

clean_arr = arr.copy()

# Observed (non-missing) scores per subject.
# NOTE(review): these are taken before duplicates are dropped, so repeated
# records are weighted twice in the medians below. Confirm this order is the
# intended one before reusing the imputed values.
math_values = np.array(clean_arr[~missing_mask[:, 1], 1].tolist())
science_values = np.array(clean_arr[~missing_mask[:, 2], 2].tolist())
english_values = np.array(clean_arr[~missing_mask[:, 3], 3].tolist())

math_median = np.median(math_values)
science_median = np.median(science_values)
english_median = np.median(english_values)

# Replace missing values using subject medians. Only the cells flagged in the
# mask are visited; `col` is relative to the score columns, hence the +1.
subject_medians = (math_median, science_median, english_median)
for row, col in np.argwhere(missing_mask[:, 1:]):
    clean_arr[row, col + 1] = subject_medians[col]

# Remove duplicate roll numbers, keep first occurrence. np.unique returns the
# first index of each value; sorting those indices restores the input order.
_, unique_indices = np.unique(clean_arr[:, 0], return_index=True)
clean_arr = clean_arr[np.sort(unique_indices)]

# -------------------------------
# 3. Data Normalization (Min-Max)
# -------------------------------

scores = clean_arr[:, 1:].astype(float)  # Remove roll numbers

min_vals = np.min(scores, axis=0)
max_vals = np.max(scores, axis=0)

# A subject where every student has the same score has zero range; dividing
# by it would give 0/0 = NaN and poison the statistics below. Dividing by 1
# instead maps such a column to all zeros. Non-constant columns are unaffected.
score_range = max_vals - min_vals
safe_range = np.where(score_range == 0, 1.0, score_range)

normalized_scores = (scores - min_vals) / safe_range

# -------------------------------
# 4. Statistical Analysis
# -------------------------------

mean_scores = np.mean(normalized_scores, axis=0)
median_scores = np.median(normalized_scores, axis=0)
std_scores = np.std(normalized_scores, axis=0)  # population std (NumPy default ddof=0)

subjects = ["Mathematics", "Science", "English"]
# argmax returns the first maximum, so ties go to the earlier subject.
highest_variability_subject = subjects[np.argmax(std_scores)]

# -------------------------------
# Print Results
# -------------------------------

print("Missing Values:")
print("Math:", math_missing, "Science:", science_missing, "English:", english_missing)

print("\nDuplicate Roll Numbers:", duplicates)

print("\nCleaned Data (after removing duplicates & filling missing values):")
print(clean_arr)

print("\nNormalized Scores:")
print(normalized_scores)

print("\nMean Scores:", mean_scores)
print("Median Scores:", median_scores)
print("Standard Deviations:", std_scores)

print("\nSubject with Highest Variability:", highest_variability_subject)


Missing Values:
Math: 0 Science: 1 English: 2

Duplicate Roll Numbers: [101 102]

Cleaned Data (after removing duplicates & filling missing values):
[[101 45 78 np.float64(82.5)]
 [102 65 56 77]
 [103 95 85 92]
 [104 45 np.float64(78.0) 88]]

Normalized Scores:
[[0.         0.75862069 0.36666667]
 [0.4        0.         0.        ]
 [1.         1.         1.        ]
 [0.         0.75862069 0.73333333]]

Mean Scores: [0.35       0.62931034 0.525     ]
Median Scores: [0.2        0.75862069 0.55      ]
Standard Deviations: [0.40926764 0.37645872 0.37739973]

Subject with Highest Variability: Mathematics


In [None]:
import numpy as np

# Dataset: one (patient id, waiting time, treatment duration, satisfaction
# score) tuple per record. None marks a missing value, and some patient ids
# appear more than once.
patient_data = [
    (201, 30, 15, 8),
    (202, 45, 25, None),
    (203, None, 30, 7),
    (204, 20, None, 9),
    (202, 45, 25, None),
    (205, 50, 40, 6),
    (203, None, 30, 7)
]

# Convert to numpy array (dtype=object so None can sit next to the numbers)
arr = np.array(patient_data, dtype=object)

# ---------------------------------------------
# 1. Missing Values & Duplicate Detection
# ---------------------------------------------
# One boolean mask marks every None cell. It is built from the raw records,
# so a gap in a repeated record is counted once per occurrence.
is_missing = np.array([[cell is None for cell in record] for record in arr], dtype=bool)
missing_wait, missing_treat, missing_sat = is_missing[:, 1:].sum(axis=0)

# Duplicate patient records: ids that occur more than once
patient_ids = arr[:, 0]
unique_ids, counts = np.unique(patient_ids, return_counts=True)
duplicates = unique_ids[counts > 1]

# ---------------------------------------------
# 2. Data Cleaning (Replace missing with median)
# ---------------------------------------------
clean_arr = arr.copy()

# Observed (non-missing) values per metric.
# NOTE(review): medians are taken before duplicates are removed, so repeated
# records are weighted twice. Confirm this order is the intended one.
wait_vals = np.array(clean_arr[~is_missing[:, 1], 1].tolist())
treat_vals = np.array(clean_arr[~is_missing[:, 2], 2].tolist())
sat_vals = np.array(clean_arr[~is_missing[:, 3], 3].tolist())

wait_med = np.median(wait_vals)
treat_med = np.median(treat_vals)
sat_med = np.median(sat_vals)

# Fill only the flagged cells, writing through direct 2-D indexing.
# `metric` is relative to the metric columns, hence the +1 offset.
metric_medians = (wait_med, treat_med, sat_med)
for record, metric in np.argwhere(is_missing[:, 1:]):
    clean_arr[record, metric + 1] = metric_medians[metric]

# Remove duplicates, keeping the first occurrence of each patient id.
# Sorting the first-occurrence indices preserves the original record order.
_, idx = np.unique(clean_arr[:, 0], return_index=True)
clean_arr = clean_arr[np.sort(idx)]

# ---------------------------------------------
# 3. Normalization (Min-Max)
# ---------------------------------------------
scores = clean_arr[:, 1:].astype(float)  # drop the id column

min_vals = np.min(scores, axis=0)
max_vals = np.max(scores, axis=0)

# Guard against a constant metric: its range is zero and 0/0 would yield NaN,
# which would then corrupt the statistics below. Such a column is mapped to
# zeros instead; every other column is scaled exactly as before.
value_range = max_vals - min_vals
safe_range = np.where(value_range == 0, 1.0, value_range)

normalized = (scores - min_vals) / safe_range

# ---------------------------------------------
# 4. Statistical Analysis
# ---------------------------------------------
mean_vals = np.mean(normalized, axis=0)
median_vals = np.median(normalized, axis=0)
std_vals = np.std(normalized, axis=0)  # population std (NumPy default ddof=0)

metrics = ["Waiting Time", "Treatment Duration", "Satisfaction Score"]
# argmax picks the first maximum, so ties resolve to the earlier metric.
highest_var_metric = metrics[np.argmax(std_vals)]

# ---------------------------------------------
# PRINT RESULTS
# ---------------------------------------------

print("Missing Values:")
print("Waiting Time:", missing_wait)
print("Treatment Duration:", missing_treat)
print("Satisfaction Score:", missing_sat)

print("\nDuplicate Patient IDs:", duplicates)

print("\nCleaned Data:")
print(clean_arr)

print("\nNormalized Data:")
print(normalized)

print("\nMean:", mean_vals)
print("Median:", median_vals)
print("Standard Deviation:", std_vals)

print("\nMetric with Highest Variability:", highest_var_metric)


Missing Values:
Waiting Time: 2
Treatment Duration: 1
Satisfaction Score: 2

Duplicate Patient IDs: [202 203]

Cleaned Data:
[[201 30 15 8]
 [202 45 25 np.float64(7.0)]
 [203 np.float64(45.0) 30 7]
 [204 20 np.float64(27.5) 9]
 [205 50 40 6]]

Normalized Data:
[[0.33333333 0.         0.66666667]
 [0.83333333 0.4        0.33333333]
 [0.83333333 0.6        0.33333333]
 [0.         0.5        1.        ]
 [1.         1.         0.        ]]

Mean: [0.6        0.5        0.46666667]
Median: [0.83333333 0.5        0.33333333]
Standard Deviation: [0.37416574 0.32249031 0.33993463]

Metric with Highest Variability: Waiting Time
