# **Preprocessing Steps**

Binning Algorithms


In [13]:
# Sample data for the binning demo: twelve integers listed in ascending order
data = [
    4, 8, 9, 15, 21, 21,
    24, 25, 26, 28, 29, 34,
]

def equal_width_binning(data, num_bins):
    """Partition values into ``num_bins`` bins of equal width.

    The interval ``[min(data), max(data)]`` is divided into ``num_bins``
    sub-intervals of identical width, and each value is appended to the bin
    whose sub-interval contains it. Input order is preserved inside a bin.

    Args:
        data: Sequence of numbers to bin.
        num_bins: Number of bins to create.

    Returns:
        A list of ``num_bins`` lists. Empty ``data`` yields all-empty bins;
        if every value is identical, all values land in the first bin.
    """
    bins = [[] for _ in range(num_bins)]

    # Nothing to place: avoid calling min()/max() on an empty sequence
    if len(data) == 0:
        return bins

    min_val = min(data)
    max_val = max(data)
    bin_width = (max_val - min_val) / num_bins

    # All values identical: there is no range to divide, and dividing by a
    # zero width below would raise ZeroDivisionError
    if bin_width == 0:
        bins[0].extend(data)
        return bins

    for value in data:
        bin_index = int((value - min_val) / bin_width)
        # The maximum value computes to index == num_bins; fold it into the last bin
        bin_index = min(bin_index, num_bins - 1)
        bins[bin_index].append(value)

    return bins

# Example usage: split the sample values into three equal-width bins
binned_data = equal_width_binning(data, num_bins=3)
print(f"Equal-Width Bins: {binned_data}")

Equal-Width Bins: [[4, 8, 9], [15, 21, 21], [24, 25, 26, 28, 29, 34]]


Min-Max Normalization


In [14]:
# Sample data for the min-max normalization demo
data = [
    200,
    300,
    400,
    600,
    1000,
]

def min_max_normalize(data):
    """Rescale values linearly so the minimum maps to 0 and the maximum to 1.

    Args:
        data: Sequence of numbers.

    Returns:
        A new list with ``(value - min) / (max - min)`` for each value.
        An empty input returns an empty list. If all values are equal
        (zero range), every value maps to 0.5.
    """
    # min()/max() raise on an empty sequence, so short-circuit here
    if len(data) == 0:
        return []

    min_val = min(data)
    data_range = max(data) - min_val

    # Zero range: the formula would divide by zero, so use the midpoint
    if data_range == 0:
        return [0.5 for _ in data]

    return [(value - min_val) / data_range for value in data]

# Example usage: rescale the sample values into the [0, 1] interval
normalized = min_max_normalize(data=data)
print(f"Min-Max Normalized Data: {normalized}")

Min-Max Normalized Data: [0.0, 0.125, 0.25, 0.5, 1.0]


Hypothesis Testing (Student's t-test)


In [15]:
# Sample data: two groups of five measurements compared by the t-test below
sample1 = [
    21.5, 24.5, 23.6, 28.9, 25.1,
]
sample2 = [
    20.1, 22.5, 21.7, 25.4, 23.8,
]

def mean(data):
    """Return the arithmetic mean of ``data`` (sum divided by element count)."""
    total = sum(data)
    count = len(data)
    return total / count

def std_dev(data):
    """Return the sample standard deviation of ``data`` (n - 1 denominator).

    Returns 0 when ``data`` has fewer than two elements, since the sample
    variance is undefined there.
    """
    count = len(data)
    if count <= 1:
        return 0

    center = mean(data)
    squared_deviations = [(value - center) ** 2 for value in data]
    sample_variance = sum(squared_deviations) / (count - 1)
    return sample_variance ** 0.5

def independent_t_test(sample1, sample2):
    """Compute the pooled-variance two-sample t-statistic.

    Args:
        sample1: First sequence of numeric observations.
        sample2: Second sequence of numeric observations.

    Returns:
        The t-statistic ``(mean1 - mean2) / (sp * sqrt(1/n1 + 1/n2))`` where
        ``sp`` is the pooled standard deviation of the two samples.
    """
    n1 = len(sample1)
    n2 = len(sample2)
    mean1 = mean(sample1)
    mean2 = mean(sample2)
    std1 = std_dev(sample1)
    std2 = std_dev(sample2)

    # Pooled standard deviation: variances weighted by their degrees of freedom
    weighted_squares = (n1 - 1) * std1**2 + (n2 - 1) * std2**2
    sp = (weighted_squares / (n1 + n2 - 2)) ** 0.5

    # Standard error of the difference between the two means
    standard_error = sp * ((1/n1) + (1/n2))**0.5
    return (mean1 - mean2) / standard_error

# Example usage: compare the two sample groups
t_statistic = independent_t_test(sample1=sample1, sample2=sample2)
print(f"T-statistic: {t_statistic}")

T-statistic: 1.337953738763802


Chi-Square Test (for Independence)


In [16]:
# Sample data: observed frequencies in a 2x2 contingency table
# Layout: [[row1_col1, row1_col2], [row2_col1, row2_col2]]
# Example: Smoker vs. Non-smoker and Lung Disease vs. No Disease
contingency_table = [
    [50, 20],
    [30, 40],
]

def chi_square_test(table):
    """Compute the chi-square statistic for a contingency table.

    Args:
        table: List of rows of observed frequencies; the column count is
            taken from the first row.

    Returns:
        The sum over all cells of ``(observed - expected)**2 / expected``,
        where ``expected = row_total * column_total / grand_total``.
    """
    num_cols = len(table[0])

    # Marginal totals
    row_totals = [sum(row) for row in table]
    col_totals = [sum(row[c] for row in table) for c in range(num_cols)]
    grand_total = sum(row_totals)

    chi_square_stat = 0
    for r, row in enumerate(table):
        for c in range(num_cols):
            # Frequency expected in this cell if rows and columns were independent
            expected = (row_totals[r] * col_totals[c]) / grand_total
            deviation = row[c] - expected
            chi_square_stat += (deviation ** 2) / expected

    return chi_square_stat

# Example usage: test the sample contingency table
chi_square_value = chi_square_test(table=contingency_table)
print(f"Chi-Square Statistic: {chi_square_value}")

Chi-Square Statistic: 11.666666666666668


Confusion Matrix


In [17]:
# Sample data: ground-truth binary labels and the matching predictions
y_true = [
    1, 0, 1, 1, 0,
    1, 0, 0, 1, 0,
]
y_pred = [
    1, 1, 1, 0, 0,
    1, 0, 1, 1, 0,
]

def confusion_matrix_metrics(y_true, y_pred):
    """Print a binary confusion matrix and the metrics derived from it.

    Labels are compared pairwise; 1 is the positive class and 0 the negative
    class. Pairs containing any other label are not counted in the matrix.

    Args:
        y_true: Sequence of actual labels (0 or 1).
        y_pred: Sequence of predicted labels (0 or 1), same length as ``y_true``.

    Returns:
        None. The matrix, accuracy, precision, recall and F1 score are printed.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # zip() would silently drop the unmatched tail while accuracy still
    # divided by len(y_true), producing a wrong figure; fail loudly instead
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(y_true)} and {len(y_pred)})"
        )

    tp, tn, fp, fn = 0, 0, 0, 0

    for actual, predicted in zip(y_true, y_pred):
        if actual == 1 and predicted == 1:
            tp += 1
        elif actual == 0 and predicted == 0:
            tn += 1
        elif actual == 0 and predicted == 1:
            fp += 1
        elif actual == 1 and predicted == 0:
            fn += 1

    # Metrics; each falls back to 0 when its denominator is empty
    accuracy = (tp + tn) / len(y_true) if len(y_true) > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    print("Confusion Matrix:")
    print("        Pred_Pos  Pred_Neg")
    print(f"Act_Pos [ {tp:^6} ] [ {fn:^6} ]")
    print(f"Act_Neg [ {fp:^6} ] [ {tn:^6} ]")
    print("\nMetrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1_score:.4f}")

# Example usage: report the matrix and metrics for the sample labels
confusion_matrix_metrics(y_true=y_true, y_pred=y_pred)

Confusion Matrix:
        Pred_Pos  Pred_Neg
Act_Pos [   4    ] [   1    ]
Act_Neg [   2    ] [   3    ]

Metrics:
Accuracy: 0.7000
Precision: 0.6667
Recall: 0.8000
F1 Score: 0.7273


Implement dimensionality reduction using the Principal Component Analysis (PCA) method on the Iris dataset


In [18]:
import math

# --- 1. The Iris Dataset ---
# Each row is one flower: [sepal_length, sepal_width, petal_length, petal_width]
# Only a nine-sample subset is used, for clarity
X = [
    [5.1, 3.5, 1.4, 0.2],
    [4.9, 3.0, 1.4, 0.2],
    [4.7, 3.2, 1.3, 0.2],
    [7.0, 3.2, 4.7, 1.4],
    [6.4, 3.2, 4.5, 1.5],
    [6.9, 3.1, 4.9, 1.5],
    [6.3, 3.3, 6.0, 2.5],
    [5.8, 2.7, 5.1, 1.9],
    [7.1, 3.0, 5.9, 2.1],
]
# Class label per row, intended for coloring a plot (0=Setosa, 1=Versicolor, 2=Virginica)
y = [
    0, 0, 0,
    1, 1, 1,
    2, 2, 2,
]


# --- 2. Helper Functions (No external libraries for math) ---
def mean(data):
    """Return the arithmetic mean of ``data`` (sum divided by element count)."""
    element_count = len(data)
    return sum(data) / element_count

def std_dev(data):
    """Return the sample standard deviation of ``data`` (n - 1 denominator).

    Returns 0.0 when ``data`` has fewer than two elements. The sample variance
    is undefined there, and dividing by ``len(data) - 1`` would raise
    ZeroDivisionError for a single value.
    """
    n = len(data)
    if n < 2:
        return 0.0
    mu = mean(data)
    return math.sqrt(sum([(x - mu)**2 for x in data]) / (n - 1))

def transpose(matrix):
    """Return the transpose of a list-of-lists matrix.

    The number of columns is taken from the first row.
    """
    column_count = len(matrix[0])
    transposed = []
    for c in range(column_count):
        column = []
        for r in range(len(matrix)):
            column.append(matrix[r][c])
        transposed.append(column)
    return transposed

def dot(v1, v2):
    """Return the dot product of two vectors (pairs are formed with ``zip``)."""
    products = (a * b for a, b in zip(v1, v2))
    return sum(products)

def matrix_multiply(m1, m2):
    """Return the matrix product ``m1 @ m2`` for list-of-lists matrices."""
    # Transposing m2 turns its columns into rows, so each output entry is a
    # plain row-by-row dot product
    columns = transpose(m2)
    product = []
    for row in m1:
        product.append([dot(row, column) for column in columns])
    return product

# --- 3. Core PCA Functions ---
def standardize_data(X):
    """Z-score each feature: subtract the column mean, divide by the column std.

    Args:
        X: List of sample rows, each a list of numeric feature values.

    Returns:
        A new list of rows with standardized values. A feature whose standard
        deviation is zero (a constant column) maps to 0.0 for every sample
        instead of raising ZeroDivisionError.
    """
    X_T = transpose(X)
    means = [mean(col) for col in X_T]
    stds = [std_dev(col) for col in X_T]

    res = []
    for row in X:
        new_row = [
            # Constant feature: it carries no variance, so center it and skip scaling
            (row[i] - means[i]) / stds[i] if stds[i] != 0 else 0.0
            for i in range(len(row))
        ]
        res.append(new_row)
    return res

def get_covariance_matrix(X_std):
    """Return the feature-by-feature covariance matrix of ``X_std``.

    Each entry is the dot product of two feature columns divided by
    ``n_samples - 1``. No centering is done here; the columns are used as given.
    """
    n_samples = len(X_std)
    feature_columns = transpose(X_std)

    covariance = []
    for column_a in feature_columns:
        covariance.append(
            [dot(column_a, column_b) / (n_samples - 1) for column_b in feature_columns]
        )
    return covariance

def power_iteration(cov_matrix, num_iterations=100):
    """Estimate the dominant eigenvector of ``cov_matrix`` by power iteration.

    The vector is repeatedly multiplied by the matrix and rescaled to unit
    length for a fixed number of rounds.

    Args:
        cov_matrix: Square matrix as a list of rows.
        num_iterations: Number of multiply-and-normalize rounds (default 100).

    Returns:
        The final iterate as a list of floats. It has unit length whenever at
        least one round ran on a non-empty matrix.
    """
    num_features = len(cov_matrix)
    # Start from a deterministic all-ones vector (not random), so repeated
    # runs give the same result
    b = [1.0] * num_features

    for _ in range(num_iterations):
        # Matrix-vector multiplication
        b_next = [dot(row, b) for row in cov_matrix]
        norm = math.sqrt(sum(x*x for x in b_next))
        if norm == 0.0:
            # The product vanished (e.g. an all-zero matrix), so dividing by
            # the norm would raise ZeroDivisionError. Stop and return the
            # current vector at unit length.
            scale = math.sqrt(sum(x*x for x in b))
            return [x / scale for x in b] if scale else b
        # Normalize
        b = [x / norm for x in b_next]
    return b

def get_eigenvectors(cov_matrix, n_components):
    """Extract ``n_components`` eigenvectors by power iteration with deflation.

    Args:
        cov_matrix: Square matrix as a list of rows; it is copied, not modified.
        n_components: How many eigenvectors to extract.

    Returns:
        A list of eigenvectors in the order they were found.
    """
    # Work on a row-wise copy so the caller's matrix stays untouched
    remaining = [list(row) for row in cov_matrix]
    found = []

    for _ in range(n_components):
        vec = power_iteration(remaining)
        found.append(vec)

        # Eigenvalue estimate: vec . (remaining @ vec)
        projected = [dot(row, vec) for row in remaining]
        eigenvalue = dot(projected, vec)

        # Deflation: subtract eigenvalue * (vec outer vec) so the next round
        # finds a different component
        size = len(vec)
        remaining = [
            [remaining[r][c] - eigenvalue * (vec[r] * vec[c]) for c in range(size)]
            for r in range(size)
        ]

    return found

def project_data(X_std, components):
    """Project the rows of ``X_std`` onto ``components``.

    ``components`` holds one vector per row; it is transposed so each vector
    becomes a column of the projection matrix.
    """
    return matrix_multiply(X_std, transpose(components))


# --- 4. Running PCA on the Iris Data ---
# Pipeline: standardize the features, build their covariance matrix, extract
# the top 2 eigenvectors (principal components), then project into 2D.
X_standardized = standardize_data(X)
covariance_matrix = get_covariance_matrix(X_standardized)
top_2_components = get_eigenvectors(covariance_matrix, n_components=2)
X_reduced = project_data(X_standardized, top_2_components)

# Report the feature count before and after, then preview the projection
print("Original number of features:", len(X[0]))
print("Reduced number of features:", len(X_reduced[0]))
print("\nFirst 5 rows of the new 2D dataset:")
for row in X_reduced[:5]:
    print([round(coordinate, 4) for coordinate in row])

Original number of features: 4
Reduced number of features: 2

First 5 rows of the new 2D dataset:
[-2.3455, 1.1317]
[-1.9404, -1.0635]
[-2.2902, -0.2743]
[0.7965, 0.6176]
[0.4633, 0.4336]


Implement and demonstrate the FIND-S algorithm for finding the most specific hypothesis based on a given set of training data samples. Read the training data from a .CSV file.

In [19]:
import csv

# Training data for the "Enjoy Sport" concept.
# The first row is the header; the last column (EnjoySport) is the target label.
training_data = [
    ['Sky', 'AirTemp', 'Humidity', 'Wind', 'Water', 'Forecast', 'EnjoySport'],
    ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'Yes'],
    ['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Yes'],
    ['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', 'No'],
    ['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change', 'Yes'],
]

# Write the table to enjoysport.csv in the working directory ('w' mode overwrites)
with open('enjoysport.csv', 'w', newline='') as f:
    csv.writer(f).writerows(training_data)

print("enjoysport.csv file created successfully.")

enjoysport.csv file created successfully.


In [20]:
import csv

def find_s_algorithm(training_data):
    """Run the FIND-S algorithm and return the most specific hypothesis.

    Starting from the all-null hypothesis, each positive example generalizes
    the hypothesis just enough to cover it; negative examples are ignored.
    Progress is printed for every example.

    Args:
        training_data: List of rows. The first row is the header; in every
            other row the last element is the target ('yes' in any letter
            case, surrounding whitespace ignored, marks a positive example)
            and the preceding elements are attribute values. Empty rows,
            such as those ``csv.reader`` yields for blank lines, are skipped.

    Returns:
        The final hypothesis as a list with one entry per attribute: a
        concrete value, '?' (any value) or '0' (no value seen). Returns an
        empty list when ``training_data`` is empty.
    """
    # Without a header row there are no attributes to learn
    if not training_data:
        return []

    # Read the header to find the number of attributes
    header = training_data[0]
    num_attributes = len(header) - 1

    # 1. Initialize hypothesis 'h' to the most specific possible hypothesis
    # We use '0' to represent the null or empty value ∅
    h = ['0'] * num_attributes
    print(f"Initial Hypothesis: {h}\n")

    # Isolate the training examples (without the header); drop empty rows,
    # which would otherwise fail on row[-1]
    examples = [row for row in training_data[1:] if row]

    # 2. Iterate through each training example
    for i, row in enumerate(examples):
        # Separate attributes from the target concept
        attributes = row[:-1]
        target = row[-1]

        print(f"--- Processing Example {i+1} ---")
        print(f"Data: {attributes}, Target: {target}")

        # 3. If the example is a positive instance
        if target.strip().lower() == 'yes':
            for j in range(num_attributes):
                # 4. Generalize the hypothesis
                if h[j] == '0':
                    # Nothing recorded for this attribute yet: adopt the value
                    h[j] = attributes[j]
                elif h[j] != attributes[j]:
                    # The attribute value is different, so generalize with '?'
                    h[j] = '?'
            print(f"Hypothesis updated: {h}\n")
        else:
            # 5. If the example is negative, ignore it
            print("Negative example. Hypothesis remains unchanged.\n")

    return h

# --- Main execution ---
# Read the training data from the CSV file.
# newline='' leaves newline handling to the csv module, as its documentation
# requires for files passed to csv.reader.
with open('enjoysport.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

# Run the FIND-S algorithm
final_hypothesis = find_s_algorithm(data)

# Print the final result
print("-----------------------------------------")
print(f"The final, most specific hypothesis is: {final_hypothesis}")
print("-----------------------------------------")

Initial Hypothesis: ['0', '0', '0', '0', '0', '0']

--- Processing Example 1 ---
Data: ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same'], Target: Yes
Hypothesis updated: ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same']

--- Processing Example 2 ---
Data: ['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same'], Target: Yes
Hypothesis updated: ['Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same']

--- Processing Example 3 ---
Data: ['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change'], Target: No
Negative example. Hypothesis remains unchanged.

--- Processing Example 4 ---
Data: ['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change'], Target: Yes
Hypothesis updated: ['Sunny', 'Warm', '?', 'Strong', '?', '?']

-----------------------------------------
The final, most specific hypothesis is: ['Sunny', 'Warm', '?', 'Strong', '?', '?']
-----------------------------------------


For a given set of training data examples stored in a .CSV file, implement and demonstrate the Candidate-Elimination algorithm to output a description of the set of all hypotheses consistent with the training examples.


In [21]:
import csv

# Training data for the "Enjoy Sport" concept.
# The first row is the header; the last column (EnjoySport) is the target label.
training_data = [
    ['Sky', 'AirTemp', 'Humidity', 'Wind', 'Water', 'Forecast', 'EnjoySport'],
    ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'Yes'],
    ['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Yes'],
    ['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', 'No'],
    ['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change', 'Yes'],
]

# Write the table to enjoysport.csv in the working directory ('w' mode overwrites)
with open('enjoysport.csv', 'w', newline='') as f:
    csv.writer(f).writerows(training_data)

print("enjoysport.csv file created successfully.")

enjoysport.csv file created successfully.


In [22]:
import csv

def get_domains(examples):
    """Collect the distinct values observed for each attribute column.

    Args:
        examples: List of rows whose last element is the target concept.

    Returns:
        One list per attribute column (the target column is dropped) holding
        that column's unique values in order of first appearance. The order
        is deterministic, unlike iterating a ``set`` of strings, which can
        change between interpreter runs because of hash randomization.
    """
    # zip(*examples) transposes rows into columns; dict.fromkeys removes
    # duplicates while keeping first-seen order
    d = [list(dict.fromkeys(col)) for col in zip(*examples)]
    # We only care about the attributes, not the target concept
    return d[:-1]

def is_consistent(h, example):
    """Check whether hypothesis ``h`` agrees with one labelled example.

    The last element of ``example`` is the target concept. A target of 'no'
    (any letter case) marks a negative example, which ``h`` must NOT cover;
    any other target is treated as positive, which ``h`` MUST cover.
    """
    is_negative = example[-1].lower() == 'no'
    matched = covers(h, example)
    return (not matched) if is_negative else matched

def covers(h, example):
    """Return True when hypothesis ``h`` matches ``example``.

    Each position of ``h`` must be the wildcard '?' or equal the example's
    value at the same position.
    """
    return all(
        h[i] == '?' or h[i] == example[i]
        for i in range(len(h))
    )

def generalize_S(h, example):
    """Minimally generalize hypothesis ``h`` so it covers a positive example.

    Per position: a null '0' adopts the example's value, a value that
    disagrees with the example becomes the wildcard '?', and anything else
    is kept. Returns the new hypothesis as a tuple.
    """
    def widen(current, observed):
        if current == '0':
            return observed
        return current if current == observed else '?'

    return tuple(widen(h[i], example[i]) for i in range(len(h)))

def specialize_G(h, domains, example):
    """List the minimal specializations of ``h`` that exclude a negative example.

    For each wildcard position of ``h``, one new hypothesis is produced per
    domain value that differs from the example's value at that position.
    Returns a list of tuples, ordered by position and then by domain order.
    """
    return [
        tuple(h[:pos]) + (candidate,) + tuple(h[pos + 1:])
        for pos in range(len(h))
        if h[pos] == '?'
        for candidate in domains[pos]
        if candidate != example[pos]
    ]

def candidate_elimination(examples):
    """Run the Candidate-Elimination algorithm over labelled examples.

    Maintains the specific boundary S and the general boundary G of the
    version space, updating both after every example and printing them.

    Args:
        examples: List of rows (no header). The last element of each row is
            the target; 'yes' (any letter case) marks a positive example and
            anything else a negative one.

    Returns:
        A ``(S, G)`` pair of sets of hypothesis tuples. Each position holds a
        concrete value, '?' (any value) or '0' (null, matches nothing).
    """
    def is_more_general_or_equal(general, specific):
        # A null '0' position matches nothing, so any constraint on the
        # general side is at least as general as it. Without this case a
        # negative example seen while S is still all-null would discard
        # every specialization and leave G empty.
        return all(
            g_val == '?' or g_val == s_val or s_val == '0'
            for g_val, s_val in zip(general, specific)
        )

    domains = get_domains(examples)
    num_attributes = len(domains)

    # Initialize S (most specific) and G (most general)
    S = {tuple(['0'] * num_attributes)}
    G = {tuple(['?'] * num_attributes)}

    print("Initial S:", S)
    print("Initial G:", G)
    print("-" * 30)

    for i, example in enumerate(examples):
        print(f"Processing Example {i+1}: {example}")
        attributes = example[:-1]
        target = example[-1]

        if target.lower() == 'yes': # Positive Example
            # 1. Remove hypotheses from G that fail to cover the example
            G = {g for g in G if covers(g, attributes)}

            # 2. Generalize the members of S that fail to cover it
            S_new = set()
            for s in S:
                if covers(s, attributes):
                    S_new.add(s)
                    continue
                s_generalized = generalize_S(s, attributes)
                # Keep it only if some member of G is at least as general
                if any(is_more_general_or_equal(g, s_generalized) for g in G):
                    S_new.add(s_generalized)
            S = S_new

        else: # Negative Example
            # 1. Remove hypotheses from S that wrongly cover the example
            S = {s for s in S if not covers(s, attributes)}

            # 2. Specialize the members of G that wrongly cover it
            G_new = set()
            for g in G:
                if not covers(g, attributes):
                    G_new.add(g)
                    continue
                for h in specialize_G(g, domains, attributes):
                    # Keep it only if it is at least as general as some member of S
                    if any(is_more_general_or_equal(h, s) for s in S):
                        G_new.add(h)
            G = G_new

        # Prune: drop from S anything more general than another member of S,
        # and from G anything more specific than another member of G
        S = {s for s in S if not any(s != s2 and is_more_general_or_equal(s, s2) for s2 in S)}
        G = {g for g in G if not any(g != g2 and is_more_general_or_equal(g2, g) for g2 in G)}

        print(f"S{i+1}: {S}")
        print(f"G{i+1}: {G}")
        print("-" * 30)

    return S, G

# --- Main execution ---
# newline='' leaves newline handling to the csv module, as its documentation
# requires for files passed to csv.reader.
with open('enjoysport.csv', 'r', newline='') as f:
    reader = csv.reader(f)
    data = list(reader)

# Exclude the header row for processing
training_examples = data[1:]

final_S, final_G = candidate_elimination(training_examples)

print("-----------------------------------------")
print(f"Final Specific Boundary (S): {final_S}")
print(f"Final General Boundary (G): {final_G}")
print("-----------------------------------------")

Initial S: {('0', '0', '0', '0', '0', '0')}
Initial G: {('?', '?', '?', '?', '?', '?')}
------------------------------
Processing Example 1: ['Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same', 'Yes']
S1: {('Sunny', 'Warm', 'Normal', 'Strong', 'Warm', 'Same')}
G1: {('?', '?', '?', '?', '?', '?')}
------------------------------
Processing Example 2: ['Sunny', 'Warm', 'High', 'Strong', 'Warm', 'Same', 'Yes']
S2: {('Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same')}
G2: {('?', '?', '?', '?', '?', '?')}
------------------------------
Processing Example 3: ['Rainy', 'Cold', 'High', 'Strong', 'Warm', 'Change', 'No']
S3: {('Sunny', 'Warm', '?', 'Strong', 'Warm', 'Same')}
G3: {('?', 'Warm', '?', '?', '?', '?'), ('Sunny', '?', '?', '?', '?', '?'), ('?', '?', '?', '?', '?', 'Same')}
------------------------------
Processing Example 4: ['Sunny', 'Warm', 'High', 'Strong', 'Cool', 'Change', 'Yes']
S4: {('Sunny', 'Warm', '?', 'Strong', '?', '?')}
G4: {('?', 'Warm', '?', '?', '?', '?'), ('Sunny', '?', 