# Data Mining MP1 VIRAY — Weather Data Analysis
Covers: Distance Metrics, Mahalanobis Distance, Similarity, Entropy, Mutual Information

In [15]:
import csv
import math
import os

## Data Loading

In [16]:
def load_decoded_data(filepath):
    """Load decoded.csv and return a list of dicts with integer values.

    Args:
        filepath: Path to a CSV file whose header row contains the columns
            'Data Entry 2', 'outlook', 'temp', 'humidity', 'windy' and 'play'.

    Returns:
        A list with one dict per CSV row, keyed by 'no', 'outlook', 'temp',
        'humidity', 'windy' and 'play' (in that order). Each value is the
        corresponding cell stripped of surrounding whitespace and converted
        with int().

    Raises:
        KeyError: If an expected column is missing from the header.
        ValueError: If a cell cannot be parsed as an integer.
    """
    # Output key -> CSV header name. The row number is stored under the
    # 'Data Entry 2' header; every other key matches its header.
    columns = {
        'no': 'Data Entry 2',
        'outlook': 'outlook',
        'temp': 'temp',
        'humidity': 'humidity',
        'windy': 'windy',
        'play': 'play',
    }

    data = []
    # The csv module expects files opened with newline='' so that it can do
    # its own newline handling (see the csv module documentation).
    with open(filepath, 'r', newline='') as f:
        for row in csv.DictReader(f):
            data.append({key: int(row[header].strip()) for key, header in columns.items()})
    return data


def get_feature_vector(row):
    """Build the 5-dimensional feature vector of one data point.

    Args:
        row: Mapping with 'outlook', 'temp', 'humidity', 'windy' and 'play'
            entries. Any other keys (such as 'no') are ignored.

    Returns:
        A new list holding the five values in the order
        outlook, temp, humidity, windy, play.
    """
    feature_order = ('outlook', 'temp', 'humidity', 'windy', 'play')
    return [row[name] for name in feature_order]


# The dataset is resolved relative to the current working directory:
# <cwd>/data/decoded.csv. Start the kernel from the project root, or the
# open() inside load_decoded_data will fail.
data_path = os.path.join(os.getcwd(), 'data', 'decoded.csv')
print(f"Loading data from: {data_path}")
# `data` is a module-level name: the solve_problem* calls further down all
# receive this list.
data = load_decoded_data(data_path)
print(f"Loaded {len(data)} data points.")
# Quick sanity check: show the feature vectors of the first and last rows.
print("First 3 rows:", [get_feature_vector(d) for d in data[:3]])
print("Last 3 rows:",  [get_feature_vector(d) for d in data[-3:]])

Loading data from: /Users/kenjo/Projects/datamining/mp1/data/decoded.csv
Loaded 70 data points.
First 3 rows: [[1, 1, 1, 2, 2], [1, 1, 1, 1, 2], [2, 1, 1, 2, 1]]
Last 3 rows: [[3, 2, 1, 1, 2], [1, 2, 2, 1, 1], [2, 2, 1, 1, 1]]


## Problem 1: Euclidean, Manhattan, and Minkowski Distance

In [17]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between vectors ``a`` and ``b``.

    Components are paired with zip(), so if the inputs differ in length the
    trailing components of the longer one are ignored.
    """
    squared_gaps = [(x - y) ** 2 for x, y in zip(a, b)]
    return math.sqrt(sum(squared_gaps))


def manhattan_distance(a, b):
    """Return the Manhattan (L1) distance between vectors ``a`` and ``b``.

    Components are paired with zip(), so if the inputs differ in length the
    trailing components of the longer one are ignored.
    """
    absolute_gaps = [abs(x - y) for x, y in zip(a, b)]
    return sum(absolute_gaps)


def minkowski_distance(a, b, r):
    """Return the Minkowski distance of order ``r`` between ``a`` and ``b``.

    Args:
        a: First vector (sequence of numbers).
        b: Second vector. Components are paired with zip(), so trailing
            components of the longer input are ignored.
        r: Order of the distance. Must be positive. ``float('inf')`` selects
            the Chebyshev distance (largest absolute component difference).

    Returns:
        ``(sum(|a_i - b_i| ** r)) ** (1 / r)`` for finite ``r``, or
        ``max(|a_i - b_i|)`` when ``r`` is infinite.

    Raises:
        ValueError: If ``r`` is zero or negative, or if ``r`` is infinite
            and the vectors are empty (max() of an empty sequence).
    """
    # r == 0 would divide by zero in the 1/r exponent, and a negative r
    # yields a number that is not a distance; reject both explicitly.
    if r <= 0:
        raise ValueError(f"Minkowski order r must be positive, got {r!r}")

    gaps = [abs(ai - bi) for ai, bi in zip(a, b)]
    if r == float('inf'):
        return max(gaps)
    return sum(gap ** r for gap in gaps) ** (1.0 / r)


def solve_problem1(data):
    """Print distances between selected data points and the dataset mean.

    Builds the per-feature mean over every row of ``data``. Then, for the rows
    stored at 1-based positions 7 and 70, prints the Euclidean, Manhattan,
    Minkowski (r=5) and Minkowski (r=infinity) distances to that mean,
    together with the per-feature terms behind each result.

    Args:
        data: Sequence of row dicts with 'outlook', 'temp', 'humidity',
            'windy' and 'play' entries. Positions 7 and 70 are indexed
            directly, so at least 70 rows are required.

    Returns:
        None. Everything is written to stdout.
    """
    banner = "=" * 70
    print(banner)
    print("PROBLEM 1: Distance Metrics")
    print(banner)

    features = ['outlook', 'temp', 'humidity', 'windy', 'play']
    n = len(data)
    mean_vec = [sum(row[feat] for row in data) / n for feat in features]

    print(f"\nDataset size: {n}")
    print(f"Mean vector: {mean_vec}")
    print(f"  outlook={mean_vec[0]:.4f}, temp={mean_vec[1]:.4f}, humidity={mean_vec[2]:.4f}, "
          f"windy={mean_vec[3]:.4f}, play={mean_vec[4]:.4f}")

    def six_places(values):
        """Render each value with six decimals for the term listings."""
        return [f'{d:.6f}' for d in values]

    for dp_no in (7, 70):
        # Data point numbers are 1-based; list positions are 0-based.
        dp_vec = get_feature_vector(data[dp_no - 1])
        print(f"\n--- Data Point {dp_no}: {dp_vec} ---")

        # Per-feature absolute gaps, shared by the Manhattan and Chebyshev listings.
        abs_diffs = [abs(m - x) for m, x in zip(mean_vec, dp_vec)]

        # (a) Euclidean
        ed = euclidean_distance(mean_vec, dp_vec)
        sq_diffs = [(m - x) ** 2 for m, x in zip(mean_vec, dp_vec)]
        print(f"  (a) Euclidean Distance: {ed:.6f}")
        print(f"      Squared diffs: {six_places(sq_diffs)}")
        print(f"      Sum = {sum(sq_diffs):.6f}, sqrt = {ed:.6f}")

        # (b) Manhattan
        md = manhattan_distance(mean_vec, dp_vec)
        print(f"  (b) Manhattan Distance: {md:.6f}")
        print(f"      Abs diffs: {six_places(abs_diffs)}")
        print(f"      Sum = {md:.6f}")

        # (c) Minkowski r=5
        mk5 = minkowski_distance(mean_vec, dp_vec, 5)
        pow5_diffs = [abs(m - x) ** 5 for m, x in zip(mean_vec, dp_vec)]
        print(f"  (c) Minkowski (r=5): {mk5:.6f}")
        print(f"      |diff|^5: {six_places(pow5_diffs)}")
        print(f"      Sum = {sum(pow5_diffs):.6f}, 5th root = {mk5:.6f}")

        # (c) Minkowski r=infinity (Chebyshev)
        mk_inf = minkowski_distance(mean_vec, dp_vec, float('inf'))
        print(f"  (c) Minkowski (r=\u221e / Chebyshev): {mk_inf:.6f}")
        print(f"      Max of abs diffs: {six_places(abs_diffs)} \u2192 max = {mk_inf:.6f}")


# Run the Problem 1 report on the dataset loaded above.
solve_problem1(data)

PROBLEM 1: Distance Metrics

Dataset size: 70
Mean vector: [2.0428571428571427, 1.9714285714285715, 1.5, 1.5142857142857142, 1.3285714285714285]
  outlook=2.0429, temp=1.9714, humidity=1.5000, windy=1.5143, play=1.3286

--- Data Point 7: [2, 3, 2, 1, 1] ---
  (a) Euclidean Distance: 1.297014
      Squared diffs: ['0.001837', '1.057959', '0.250000', '0.264490', '0.107959']
      Sum = 1.682245, sqrt = 1.297014
  (b) Manhattan Distance: 2.414286
      Abs diffs: ['0.042857', '1.028571', '0.500000', '0.514286', '0.328571']
      Sum = 2.414286
  (c) Minkowski (r=5): 1.040966
      |diff|^5: ['0.000000', '1.151257', '0.031250', '0.035977', '0.003830']
      Sum = 1.222313, 5th root = 1.040966
  (c) Minkowski (r=∞ / Chebyshev): 1.028571
      Max of abs diffs: ['0.042857', '1.028571', '0.500000', '0.514286', '0.328571'] → max = 1.028571

--- Data Point 70: [2, 2, 1, 1, 1] ---
  (a) Euclidean Distance: 0.790634
      Squared diffs: ['0.001837', '0.000816', '0.250000', '0.264490', '0.107959']

## Problem 2: Mahalanobis Distance

In [18]:
def matrix_transpose(m):
    """Return the transpose of a matrix given as a list of row lists.

    The number of columns is taken from the first row, so ``m`` must contain
    at least one row.
    """
    n_rows = len(m)
    n_cols = len(m[0])
    transposed = []
    for col in range(n_cols):
        transposed.append([m[row][col] for row in range(n_rows)])
    return transposed


def matrix_multiply(a, b):
    """Return the matrix product ``a x b`` for matrices stored as row lists.

    The column count of ``a`` must equal the row count of ``b``; this is
    checked with an assert. Entries of the result are accumulated from 0.0,
    so they are floats even for integer inputs.
    """
    rows_a, cols_a = len(a), len(a[0])
    rows_b, cols_b = len(b), len(b[0])
    assert cols_a == rows_b

    product = []
    for i in range(rows_a):
        out_row = []
        for j in range(cols_b):
            # Dot product of row i of `a` with column j of `b`.
            acc = 0.0
            for k in range(cols_a):
                acc += a[i][k] * b[k][j]
            out_row.append(acc)
        product.append(out_row)
    return product


def matrix_inverse_3x3(m):
    """Compute the inverse of a 3x3 matrix using the cofactor method.

    The inverse is adj(m) / det(m), where the adjugate adj(m) is the
    TRANSPOSE of the cofactor matrix.

    Args:
        m: 3x3 matrix given as a list of three rows of three numbers.

    Returns:
        The inverse as a new list of three row lists of floats.

    Raises:
        ValueError: If the absolute determinant is below 1e-12, in which case
            the matrix is treated as singular.
    """
    a, b, c = m[0][0], m[0][1], m[0][2]
    d, e, f = m[1][0], m[1][1], m[1][2]
    g, h, i = m[2][0], m[2][1], m[2][2]

    # Laplace expansion along the first row.
    det = a * (e * i - f * h) - b * (d * i - f * g) + c * (d * h - e * g)
    if abs(det) < 1e-12:
        raise ValueError("Matrix is singular, cannot invert")

    inv_det = 1.0 / det

    # cofactors[r][k] is the signed minor obtained by deleting row r, column k.
    cofactors = [
        [(e * i - f * h), -(d * i - f * g), (d * h - e * g)],
        [-(b * i - c * h), (a * i - c * g), -(a * h - b * g)],
        [(b * f - c * e), -(a * f - c * d), (a * e - b * d)],
    ]

    # Entry (row, col) of the inverse is cofactor (col, row) / det. Indexing
    # the cofactors without this transposition is only correct for symmetric
    # matrices, whose cofactor matrix is itself symmetric.
    inverse = [[cofactors[col][row] * inv_det for col in range(3)] for row in range(3)]
    return inverse


def solve_problem2(data):
    """Print a step-by-step Mahalanobis distance computation.

    Uses the first eight rows of ``data`` restricted to the 'outlook', 'temp'
    and 'humidity' columns. Prints the mean vector, the centered data, the
    sample covariance matrix (divisor n - 1), its inverse, a Cov x Cov^-1
    check, and finally the Mahalanobis distance of the fixed test point
    [2, 1, 2] from the mean.

    Args:
        data: Sequence of row dicts containing at least 'outlook', 'temp' and
            'humidity' entries.

    Returns:
        None. Everything is written to stdout.

    Raises:
        ValueError: Propagated from matrix_inverse_3x3 when the covariance
            matrix is singular.
    """
    print("\n" + "=" * 70)
    print("PROBLEM 2: Mahalanobis Distance")
    print("=" * 70)

    # Only the first eight rows and three of the features take part.
    subset = data[:8]
    features = ['outlook', 'temp', 'humidity']

    # X is the n x p data matrix (one row per data point).
    X = [[row[f] for f in features] for row in subset]
    n = len(X)
    p = len(features)

    print(f"\nData (rows 1-8, columns: {features}):")
    for i, row in enumerate(X):
        print(f"  Row {i+1}: {row}")

    # Step 1: column-wise mean.
    mean = [sum(X[i][j] for i in range(n)) / n for j in range(p)]
    print(f"\nStep 1: Mean vector = {[f'{m:.4f}' for m in mean]}")

    # Step 2: subtract the mean from every row.
    X_centered = [[X[i][j] - mean[j] for j in range(p)] for i in range(n)]
    print(f"\nStep 2: Centered data (X - mean):")
    for i, row in enumerate(X_centered):
        print(f"  Row {i+1}: [{', '.join(f'{v:.4f}' for v in row)}]")

    # Step 3: sample covariance, cov[i][j] = sum_k xc[k][i] * xc[k][j] / (n - 1).
    cov = [[0.0] * p for _ in range(p)]
    for i in range(p):
        for j in range(p):
            cov[i][j] = sum(X_centered[k][i] * X_centered[k][j] for k in range(n)) / (n - 1)

    print(f"\nStep 3: Covariance matrix (using n-1 = {n-1}):")
    for row in cov:
        print(f"  [{', '.join(f'{v:.6f}' for v in row)}]")

    # Step 4: invert the 3x3 covariance matrix.
    cov_inv = matrix_inverse_3x3(cov)
    print(f"\nStep 4: Inverse covariance matrix:")
    for row in cov_inv:
        print(f"  [{', '.join(f'{v:.6f}' for v in row)}]")

    # Sanity check: the product should print as the identity matrix.
    identity_check = matrix_multiply(cov, cov_inv)
    print(f"\n  Verification (Cov \u00d7 Cov\u207b\u00b9 \u2248 I):")
    for row in identity_check:
        print(f"    [{', '.join(f'{v:.4f}' for v in row)}]")

    # Step 5: fixed query point in the same (outlook, temp, humidity) encoding.
    test_point = [2, 1, 2]
    print(f"\nStep 5: Test point (overcast, hot, normal) = {test_point}")

    diff = [test_point[j] - mean[j] for j in range(p)]
    print(f"  Difference (x - mean) = [{', '.join(f'{v:.4f}' for v in diff)}]")

    # Reshape diff into a p x 1 column so matrix_multiply can be reused.
    diff_col = [[d] for d in diff]
    sinv_diff = matrix_multiply(cov_inv, diff_col)
    print(f"  S\u207b\u00b9 \u00d7 (x - mean) = [{', '.join(f'{v[0]:.6f}' for v in sinv_diff)}]")

    # 1 x p row times the p x 1 column gives a 1 x 1 matrix holding the
    # squared Mahalanobis distance.
    diff_row = [diff]
    result = matrix_multiply(diff_row, sinv_diff)
    mahal_sq = result[0][0]
    mahal = math.sqrt(mahal_sq)

    print(f"\n  (x - mean)\u1d40 \u00d7 S\u207b\u00b9 \u00d7 (x - mean) = {mahal_sq:.6f}")
    print(f"  Mahalanobis Distance = \u221a{mahal_sq:.6f} = {mahal:.6f}")


# Run the Problem 2 report on the dataset loaded above.
solve_problem2(data)


PROBLEM 2: Mahalanobis Distance

Data (rows 1-8, columns: ['outlook', 'temp', 'humidity']):
  Row 1: [1, 1, 1]
  Row 2: [1, 1, 1]
  Row 3: [2, 1, 1]
  Row 4: [3, 2, 1]
  Row 5: [3, 3, 2]
  Row 6: [3, 3, 2]
  Row 7: [2, 3, 2]
  Row 8: [1, 2, 1]

Step 1: Mean vector = ['2.0000', '2.0000', '1.3750']

Step 2: Centered data (X - mean):
  Row 1: [-1.0000, -1.0000, -0.3750]
  Row 2: [-1.0000, -1.0000, -0.3750]
  Row 3: [0.0000, -1.0000, -0.3750]
  Row 4: [1.0000, 0.0000, -0.3750]
  Row 5: [1.0000, 1.0000, 0.6250]
  Row 6: [1.0000, 1.0000, 0.6250]
  Row 7: [0.0000, 1.0000, 0.6250]
  Row 8: [-1.0000, 0.0000, -0.3750]

Step 3: Covariance matrix (using n-1 = 7):
  [0.857143, 0.571429, 0.285714]
  [0.571429, 0.857143, 0.428571]
  [0.285714, 0.428571, 0.267857]

Step 4: Inverse covariance matrix:
  [2.100000, -1.400000, 0.000000]
  [-1.400000, 6.766667, -9.333333]
  [0.000000, -9.333333, 18.666667]

  Verification (Cov × Cov⁻¹ ≈ I):
    [1.0000, 0.0000, 0.0000]
    [0.0000, 1.0000, 0.0000]
    [0.

## Problem 3: Cosine and Extended Jaccard Similarity

In [19]:
def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Computed as dot(a, b) / (||a|| * ||b||). A zero vector makes the
    denominator zero and raises ZeroDivisionError.
    """
    numerator = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x ** 2 for x in a))
    norm_b = math.sqrt(sum(y ** 2 for y in b))
    denominator = norm_a * norm_b
    return numerator / denominator


def extended_jaccard_similarity(a, b):
    """Return the extended Jaccard (Tanimoto) similarity of ``a`` and ``b``.

    Computed as dot(a, b) / (||a||^2 + ||b||^2 - dot(a, b)). Two zero vectors
    make the denominator zero and raise ZeroDivisionError.
    """
    overlap = sum(x * y for x, y in zip(a, b))
    norm_a_sq = sum(x ** 2 for x in a)
    norm_b_sq = sum(y ** 2 for y in b)
    denominator = norm_a_sq + norm_b_sq - overlap
    return overlap / denominator


def solve_problem3(data):
    """Print cosine and extended Jaccard similarity of data points 20 and 27.

    Takes the feature vectors of the rows at 1-based positions 20 and 27,
    prints their dot product and norms, then prints both similarity values
    together with the fractions they come from.

    Args:
        data: Sequence of row dicts accepted by get_feature_vector. Positions
            20 and 27 are indexed directly, so at least 27 rows are required.

    Returns:
        None. Everything is written to stdout.
    """
    print("\n" + "=" * 70)
    print("PROBLEM 3: Cosine and Extended Jaccard Similarity")
    print("=" * 70)

    # 1-based data points 20 and 27 live at list positions 19 and 26.
    dp20, dp27 = (get_feature_vector(data[pos]) for pos in (19, 26))

    print(f"\nData Point 20: {dp20}")
    print(f"Data Point 27: {dp27}")

    # Shared building blocks of both similarity measures.
    dot = sum(x * y for x, y in zip(dp20, dp27))
    mag20_sq, mag27_sq = (sum(v ** 2 for v in vec) for vec in (dp20, dp27))
    mag20, mag27 = math.sqrt(mag20_sq), math.sqrt(mag27_sq)

    print(f"\n  Dot product (x\u00b7y) = {dot}")
    print(f"  ||x||\u00b2 = {mag20_sq}, ||y||\u00b2 = {mag27_sq}")
    print(f"  ||x|| = {mag20:.6f}, ||y|| = {mag27:.6f}")

    # (a) Cosine similarity
    cos_sim = cosine_similarity(dp20, dp27)
    print(f"\n  (a) Cosine Similarity = {dot} / ({mag20:.6f} \u00d7 {mag27:.6f})")
    print(f"      = {dot} / {mag20 * mag27:.6f}")
    print(f"      = {cos_sim:.6f}")

    # (b) Extended Jaccard similarity
    denom = mag20_sq + mag27_sq - dot
    ej_sim = extended_jaccard_similarity(dp20, dp27)
    print(f"\n  (b) Extended Jaccard Similarity = {dot} / ({mag20_sq} + {mag27_sq} - {dot})")
    print(f"      = {dot} / {denom}")
    print(f"      = {ej_sim:.6f}")


# Run the Problem 3 report on the dataset loaded above.
solve_problem3(data)


PROBLEM 3: Cosine and Extended Jaccard Similarity

Data Point 20: [3, 2, 1, 1, 2]
Data Point 27: [2, 3, 2, 1, 1]

  Dot product (x·y) = 17
  ||x||² = 19, ||y||² = 19
  ||x|| = 4.358899, ||y|| = 4.358899

  (a) Cosine Similarity = 17 / (4.358899 × 4.358899)
      = 17 / 19.000000
      = 0.894737

  (b) Extended Jaccard Similarity = 17 / (19 + 19 - 17)
      = 17 / 21
      = 0.809524


## Problem 4: Entropy of Temperature and Humidity

In [20]:
def entropy(values):
    """Compute the Shannon entropy (in bits) of a list of values.

    Args:
        values: List of hashable, mutually orderable labels.

    Returns:
        A ``(ent, counts)`` tuple: ``ent`` is the entropy of the empirical
        distribution of ``values`` and ``counts`` maps each distinct label to
        its number of occurrences. An empty list yields ``(0.0, {})``.
    """
    total = len(values)

    counts = {}
    for item in values:
        if item in counts:
            counts[item] += 1
        else:
            counts[item] = 1

    ent = 0.0
    # Each tallied label occurs at least once, so its probability is positive.
    for label in sorted(counts):
        share = counts[label] / total
        ent -= share * math.log2(share)
    return ent, counts


def solve_problem4(data):
    """Print the entropy of the 'temp' and 'humidity' attributes.

    For each attribute the report lists the count and probability of every
    observed value, the individual entropy terms, the total entropy, and the
    maximum entropy log2(k), where k is the number of distinct observed
    values.

    Args:
        data: Sequence of row dicts with 'temp' (coded 1-3) and 'humidity'
            (coded 1-2) entries. An empty sequence raises ValueError from
            math.log2(0).

    Returns:
        None. Everything is written to stdout.
    """
    print("\n" + "=" * 70)
    print("PROBLEM 4: Entropy of Temperature and Humidity")
    print("=" * 70)

    n = len(data)
    temp_labels = {1: 'hot', 2: 'mild', 3: 'cool'}
    humidity_labels = {1: 'high', 2: 'normal'}

    for attr, labels in [('temp', temp_labels), ('humidity', humidity_labels)]:
        ent, counts = entropy([row[attr] for row in data])
        num_classes = len(counts)
        max_ent = math.log2(num_classes)
        ordered_vals = sorted(counts.keys())

        if attr == 'temp':
            heading = f"\n--- (a) Entropy of '{attr}' ---"
        else:
            heading = f"\n--- (b) Entropy of '{attr}' ---"
        print(heading)

        print(f"  Total data points: {n}")
        print(f"  Value counts:")
        for val in ordered_vals:
            print(f"    {labels[val]} ({val}): {counts[val]} \u2192 p = {counts[val]}/{n} = {counts[val]/n:.6f}")

        # One "-p * log2(p)" term per observed value, joined by " + ". The
        # line is left open here and closed by the leading newline of the
        # following print.
        terms = []
        for val in ordered_vals:
            p = counts[val] / n
            term_val = -p * math.log2(p)
            terms.append(f"{term_val:.6f}")
        print(f"\n  H({attr}) = ", end="")
        print(" + ".join(terms), end="")

        print(f"\n  H({attr}) = {ent:.6f} bits")
        print(f"\n  Number of classes = {num_classes}")
        print(f"  Maximum entropy = log\u2082({num_classes}) = {max_ent:.6f} bits")


# Run the Problem 4 report on the dataset loaded above.
solve_problem4(data)


PROBLEM 4: Entropy of Temperature and Humidity

--- (a) Entropy of 'temp' ---
  Total data points: 70
  Value counts:
    hot (1): 18 → p = 18/70 = 0.257143
    mild (2): 36 → p = 36/70 = 0.514286
    cool (3): 16 → p = 16/70 = 0.228571

  H(temp) = 0.503835 + 0.493384 + 0.486693
  H(temp) = 1.483912 bits

  Number of classes = 3
  Maximum entropy = log₂(3) = 1.584963 bits

--- (b) Entropy of 'humidity' ---
  Total data points: 70
  Value counts:
    high (1): 35 → p = 35/70 = 0.500000
    normal (2): 35 → p = 35/70 = 0.500000

  H(humidity) = 0.500000 + 0.500000
  H(humidity) = 1.000000 bits

  Number of classes = 2
  Maximum entropy = log₂(2) = 1.000000 bits


## Problem 5: Mutual Information of Outlook and Temperature

In [21]:
def solve_problem5(data):
    """Print the mutual information between 'outlook' and 'temp'.

    The report contains both marginal distributions, the joint count table,
    the entropies H(Outlook), H(Temperature) and H(Outlook, Temperature), the
    mutual information I = H(X) + H(Y) - H(X, Y), and a second computation of
    I from its definition as a cross-check.

    Args:
        data: Sequence of row dicts with 'outlook' and 'temp' entries, each
            coded 1-3 (other codes have no label and raise KeyError).

    Returns:
        None. Everything is written to stdout.
    """
    print("\n" + "=" * 70)
    print("PROBLEM 5: Mutual Information of Outlook and Temperature")
    print("=" * 70)

    n = len(data)
    outlook_labels = {1: 'sunny', 2: 'overcast', 3: 'rainy'}
    temp_labels = {1: 'hot', 2: 'mild', 3: 'cool'}

    # Tally both marginals and the joint distribution in a single pass.
    outlook_counts, temp_counts, joint_counts = {}, {}, {}
    for row in data:
        o, t = row['outlook'], row['temp']
        outlook_counts[o] = outlook_counts.get(o, 0) + 1
        temp_counts[t] = temp_counts.get(t, 0) + 1
        joint_counts[(o, t)] = joint_counts.get((o, t), 0) + 1

    def marginal_entropy(counts, ordered_vals):
        """Shannon entropy (bits) of one marginal, summed in ``ordered_vals`` order."""
        h = 0.0
        # Every tallied value has a count of at least one, hence p > 0.
        for val in ordered_vals:
            p = counts[val] / n
            h -= p * math.log2(p)
        return h

    print(f"\nTotal data points: {n}")

    print(f"\nMarginal Distribution - Outlook:")
    outlook_vals = sorted(outlook_counts)
    for val in outlook_vals:
        p = outlook_counts[val] / n
        print(f"  P({outlook_labels[val]}) = {outlook_counts[val]}/{n} = {p:.6f}")

    print(f"\nMarginal Distribution - Temperature:")
    temp_vals = sorted(temp_counts)
    for val in temp_vals:
        p = temp_counts[val] / n
        print(f"  P({temp_labels[val]}) = {temp_counts[val]}/{n} = {p:.6f}")

    # Contingency table: one row per outlook, one column per temperature.
    print(f"\nJoint Distribution P(Outlook, Temperature):")
    header = "  {:>12s}".format("")
    header += "".join(f"  {temp_labels[t_val]:>10s}" for t_val in temp_vals)
    header += f"  {'Total':>10s}"
    print(header)

    for o_val in outlook_vals:
        cells = [joint_counts.get((o_val, t_val), 0) for t_val in temp_vals]
        row_str = f"  {outlook_labels[o_val]:>12s}"
        row_str += "".join(f"  {count:>10d}" for count in cells)
        row_str += f"  {outlook_counts[o_val]:>10d}"
        print(row_str)

    total_row = f"  {'Total':>12s}"
    total_row += "".join(f"  {temp_counts[t_val]:>10d}" for t_val in temp_vals)
    total_row += f"  {n:>10d}"
    print(total_row)

    h_outlook = marginal_entropy(outlook_counts, outlook_vals)
    print(f"\nH(Outlook) = {h_outlook:.6f} bits")

    h_temp = marginal_entropy(temp_counts, temp_vals)
    print(f"H(Temperature) = {h_temp:.6f} bits")

    # Observed (outlook, temp) combinations in table order. Unobserved cells
    # contribute nothing to either sum below and are skipped.
    populated = [
        (o_val, t_val, joint_counts[(o_val, t_val)])
        for o_val in outlook_vals
        for t_val in temp_vals
        if (o_val, t_val) in joint_counts
    ]

    h_joint = 0.0
    print(f"\nJoint Entropy H(Outlook, Temperature) calculation:")
    for o_val, t_val, count in populated:
        p_joint = count / n
        term = -p_joint * math.log2(p_joint)
        h_joint += term
        print(f"  P({outlook_labels[o_val]},{temp_labels[t_val]}) = {count}/{n} = {p_joint:.6f}, "
              f"-p\u00b7log\u2082(p) = {term:.6f}")
    print(f"\nH(Outlook, Temperature) = {h_joint:.6f} bits")

    mi = h_outlook + h_temp - h_joint
    print(f"\nMutual Information I(Outlook; Temperature):")
    print(f"  I = H(Outlook) + H(Temperature) - H(Outlook, Temperature)")
    print(f"  I = {h_outlook:.6f} + {h_temp:.6f} - {h_joint:.6f}")
    print(f"  I = {mi:.6f} bits")

    # Cross-check with the definition of mutual information.
    print(f"\nVerification using definition: I = \u03a3 p(x,y) \u00b7 log\u2082(p(x,y) / (p(x)\u00b7p(y)))")
    mi_verify = 0.0
    for o_val, t_val, count in populated:
        p_joint = count / n
        p_o = outlook_counts[o_val] / n
        p_t = temp_counts[t_val] / n
        term = p_joint * math.log2(p_joint / (p_o * p_t))
        mi_verify += term
        print(f"  P({outlook_labels[o_val]},{temp_labels[t_val]}) = {p_joint:.6f}, "
              f"P({outlook_labels[o_val]})\u00b7P({temp_labels[t_val]}) = {p_o:.6f}\u00d7{p_t:.6f} = {p_o*p_t:.6f}, "
              f"term = {term:.6f}")
    print(f"\n  I(Outlook; Temperature) = {mi_verify:.6f} bits")


# Run the Problem 5 report on the dataset loaded above.
solve_problem5(data)


PROBLEM 5: Mutual Information of Outlook and Temperature

Total data points: 70

Marginal Distribution - Outlook:
  P(sunny) = 23/70 = 0.328571
  P(overcast) = 21/70 = 0.300000
  P(rainy) = 26/70 = 0.371429

Marginal Distribution - Temperature:
  P(hot) = 18/70 = 0.257143
  P(mild) = 36/70 = 0.514286
  P(cool) = 16/70 = 0.228571

Joint Distribution P(Outlook, Temperature):
                       hot        mild        cool       Total
         sunny           8          11           4          23
      overcast          10           7           4          21
         rainy           0          18           8          26
         Total          18          36          16          70

H(Outlook) = 1.579397 bits
H(Temperature) = 1.483912 bits

Joint Entropy H(Outlook, Temperature) calculation:
  P(sunny,hot) = 8/70 = 0.114286, -p·log₂(p) = 0.357632
  P(sunny,mild) = 11/70 = 0.157143, -p·log₂(p) = 0.419548
  P(sunny,cool) = 4/70 = 0.057143, -p·log₂(p) = 0.235959
  P(overcast,hot) = 10/70 