Exp - 3: Decision tree classifier using the ID3 algorithm


In [1]:
import math
import csv

def load_csv(filename):
    """Read a CSV file and split it into data rows and a header row.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A ``(dataset, headers)`` tuple: ``headers`` is the first non-blank
        row and ``dataset`` is the list of remaining rows, each a list of
        strings.

    Raises:
        IndexError: If the file contains no rows at all.
    """
    # The context manager closes the handle even if parsing fails;
    # newline="" is what the csv module expects for files it reads.
    with open(filename, "r", newline="") as handle:
        # Blank lines come back from csv.reader as empty lists, which would
        # break later row[col] indexing, so drop them here.
        dataset = [row for row in csv.reader(handle) if row]
    headers = dataset.pop(0)
    return dataset, headers

class Node:
    """One node of the decision tree.

    A node carries the attribute name it was created with, a list of
    children (filled by the tree builder with ``(value, Node)`` pairs) and
    an ``answer`` string that stays empty unless the node is a leaf.
    """

    def __init__(self, attribute):
        # Every instance gets its own fresh child list and an empty answer.
        self.children, self.answer = [], ""
        self.attribute = attribute

def subtables(data, col, delete):
    """Partition ``data`` by the value each row holds in column ``col``.

    Args:
        data: List of rows (sequences) to partition.
        col: Index of the column whose values define the partitions.
        delete: When true, the rows placed in the partitions are copies
            with column ``col`` removed; the rows in ``data`` are left
            untouched. When false, the partitions hold the original row
            objects.

    Returns:
        A ``(attr, dic)`` tuple: ``attr`` lists the distinct values of the
        column in order of first appearance, and ``dic`` maps each value to
        the list of rows carrying it.
    """
    dic = {}
    # Single pass over the data; dict insertion order gives a stable,
    # first-seen ordering of the distinct values.
    for row in data:
        if delete:
            # Work on a copy so the caller's rows keep all their columns.
            kept = list(row)
            del kept[col]
        else:
            kept = row
        dic.setdefault(row[col], []).append(kept)
    attr = list(dic)
    return attr, dic

def entropy(S):
    """Return the base-2 entropy of the values in list ``S``.

    Yields 0 when ``S`` holds a single distinct value (or is empty);
    otherwise sums ``-p * log2(p)`` over the relative frequency ``p`` of
    each distinct value.
    """
    distinct = list(set(S))
    if len(distinct) == 1:
        return 0
    size = len(S)
    total = 0
    for value in distinct:
        share = S.count(value) / size
        total += -share * math.log(share, 2)
    return total

def compute_gain(data, col):
    """Return the information gain obtained by splitting ``data`` on ``col``.

    The gain is the entropy of the last column of ``data`` minus the
    size-weighted entropy of the last column within each partition that
    ``subtables`` produces for ``col``.
    """
    values, partitions = subtables(data, col, delete=False)
    row_count = len(data)
    base_entropy = entropy([row[-1] for row in data])
    weighted_entropy = 0
    for value in values:
        subset = partitions[value]
        subset_labels = [row[-1] for row in subset]
        weighted_entropy += (len(subset) / row_count) * entropy(subset_labels)
    return base_entropy - weighted_entropy

def build_tree(data, features):
    """Recursively build an ID3 decision tree.

    Args:
        data: List of rows; the last element of every row is the class
            label and the preceding elements are attribute values.
        features: Column names aligned with the row positions. It may or
            may not include a trailing name for the label column; only the
            first ``len(row) - 1`` names are used as split candidates.

    Returns:
        The root ``Node``. Leaves carry the class label in ``answer``;
        internal nodes carry the split attribute name and a list of
        ``(value, child)`` pairs in ``children``. For empty ``data`` a bare
        ``Node("")`` with no answer and no children is returned.
    """
    if not data:
        return Node("")
    lastcol = [row[-1] for row in data]
    if len(set(lastcol)) == 1:
        node = Node("")
        node.answer = lastcol[0]
        return node
    # Only the columns before the label are candidates. Iterating over
    # len(features) would also score the label column itself, whose gain is
    # always maximal, so it could be picked as the split.
    n_attributes = len(data[0]) - 1
    if n_attributes <= 0:
        # No attribute left but the labels still disagree: fall back to the
        # most frequent label (first one wins on ties).
        node = Node("")
        node.answer = max(lastcol, key=lastcol.count)
        return node
    gains = [compute_gain(data, col) for col in range(n_attributes)]
    split = gains.index(max(gains))
    node = Node(features[split])
    # Drop the used attribute name so names stay aligned with the child
    # rows, which have that column removed by subtables(..., delete=True).
    features = features[:split] + features[split + 1:]
    attr, dic = subtables(data, split, delete=True)
    for x in attr:
        node.children.append((x, build_tree(dic[x], features)))
    return node

def print_tree(node, level=0):
    """Print the tree rooted at ``node``, indenting by depth.

    A node with a non-empty ``answer`` prints that answer. Any other node
    prints its attribute, then each branch value one level deeper followed
    by the corresponding subtree two levels deeper.
    """
    indent = " " * level
    if not node.answer:
        print(indent, node.attribute)
        for branch_value, subtree in node.children:
            print(" " * (level + 1), branch_value)
            print_tree(subtree, level + 2)
        return
    print(indent, node.answer)

def classify(node, x_test, features):
    """Walk the tree for one instance, print its label and return it.

    Args:
        node: Root of the (sub)tree to walk.
        x_test: Sequence of attribute values for the instance.
        features: Column names for ``x_test``; the position of a node's
            attribute in this list selects the value to test.

    Returns:
        The label stored in the leaf that was reached, or ``None`` when the
        instance holds a value for which the current node has no branch.

    Raises:
        ValueError: If a node's attribute is not present in ``features``.
    """
    if node.answer:
        print(node.answer)
        return node.answer
    pos = features.index(node.attribute)
    for value, child in node.children:
        if x_test[pos] == value:
            # Stop at the first matching branch and hand its label back.
            return classify(child, x_test, features)
    # No branch for this value: say so explicitly instead of printing
    # nothing, which left the caller's output line unterminated.
    print(f"Unknown (no branch for {node.attribute}={x_test[pos]!r})")
    return None

# Train: read the labelled examples and build the ID3 tree from them.
dataset, features = load_csv("id3.csv")
# build_tree may delete columns from the rows it is handed (through
# subtables(..., delete=True)), so pass copies to keep `dataset` intact.
node1 = build_tree([row[:] for row in dataset], features)
print("The decision tree for the dataset using ID3 algorithm is")
print_tree(node1)

# Classify: the test file has its own header row. Keep it under a separate
# name so `features` keeps describing the training data.
testdata, test_features = load_csv("id3_test_1.csv")
for xtest in testdata:
    print("The test instance:", xtest)
    print("The label for test instance:", end=" ")
    classify(node1, xtest, test_features)


FileNotFoundError: [Errno 2] No such file or directory: 'id3.csv'

Exp - 5: Gaussian Naive Bayes classifier

In [2]:
import csv
import random
import math

def load_csv(filename):
    """Read a header-less, all-numeric CSV file into a list of float rows.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of rows, each a list of floats. Blank lines are skipped.

    Raises:
        ValueError: If any field cannot be converted to ``float``.
    """
    # The context manager closes the handle even if conversion fails;
    # newline="" is what the csv module expects for files it reads.
    with open(filename, "r", newline="") as handle:
        # csv.reader yields [] for blank lines; an empty row would later
        # break the vector[-1] class lookup, so drop it here.
        return [[float(x) for x in row] for row in csv.reader(handle) if row]

def split_dataset(dataset, split_ratio):
    """Randomly split ``dataset`` into a training part and a remainder.

    ``int(len(dataset) * split_ratio)`` rows are drawn one at a time with
    ``random.randrange`` from a shallow copy of ``dataset``; the input list
    itself is not modified.

    Returns:
        A two-element list ``[train_rows, remaining_rows]``.
    """
    wanted = int(len(dataset) * split_ratio)
    remaining = list(dataset)
    drawn = []
    for _ in range(wanted):
        pick = random.randrange(len(remaining))
        drawn.append(remaining.pop(pick))
    return [drawn, remaining]

def separate_by_class(dataset):
    """Group the rows of ``dataset`` by their last element.

    Returns:
        A dict mapping each distinct last-element value to the list of
        rows that end with it, in their original order.
    """
    grouped = {}
    for vector in dataset:
        grouped.setdefault(vector[-1], []).append(vector)
    return grouped

def mean(numbers):
    """Return the arithmetic mean of ``numbers`` as a float."""
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stdev(numbers):
    """Return the sample standard deviation (n - 1 denominator).

    Args:
        numbers: Sized sequence of numeric values.

    Returns:
        The sample standard deviation as a float. A single-element input
        has no spread to estimate and yields 0.0 instead of dividing by
        zero.

    Raises:
        ZeroDivisionError: If ``numbers`` is empty.
    """
    count = len(numbers)
    # Same arithmetic as the mean() helper, computed inline; an empty
    # input raises ZeroDivisionError here.
    avg = sum(numbers) / float(count)
    if count < 2:
        # count - 1 would be zero below.
        return 0.0
    variance = sum([pow(x - avg, 2) for x in numbers]) / float(count - 1)
    return math.sqrt(variance)

def summarize(dataset):
    """Return ``(mean, stdev)`` for every column of ``dataset`` but the last.

    Statistics are computed for all columns, then the entry for the final
    column is discarded.
    """
    summaries = []
    for column in zip(*dataset):
        summaries.append((mean(column), stdev(column)))
    summaries.pop()
    return summaries

def summarize_by_class(dataset):
    """Return per-class column summaries.

    Rows are grouped with ``separate_by_class`` and each group is passed to
    ``summarize``; the result maps class value to that group's summaries.
    """
    return {
        class_value: summarize(instances)
        for class_value, instances in separate_by_class(dataset).items()
    }

def calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density at ``x``.

    Args:
        x: Value at which to evaluate the density.
        mean: Mean of the distribution.
        stdev: Standard deviation of the distribution.

    Returns:
        The density value. When ``stdev`` is zero the Gaussian formula
        would divide by zero, so the distribution is treated as a point
        mass: 1.0 if ``x`` equals ``mean``, otherwise 0.0.
    """
    if stdev == 0:
        # Zero spread: fall back to the point-mass value rather than
        # raising ZeroDivisionError.
        return 1.0 if x == mean else 0.0
    exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(stdev, 2))))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent

def calculate_class_probabilities(summaries, input_vector):
    """Return, per class, the product of per-attribute densities.

    For each class in ``summaries`` the score starts at 1 and is multiplied
    by ``calculate_probability`` of ``input_vector[i]`` under the i-th
    ``(mean, stdev)`` pair stored for that class.
    """
    probabilities = {}
    for label, stats in summaries.items():
        score = 1
        for position, (mu, sigma) in enumerate(stats):
            score *= calculate_probability(input_vector[position], mu, sigma)
        probabilities[label] = score
    return probabilities

def predict(summaries, input_vector):
    """Return the class with the highest score for ``input_vector``.

    Scores come from ``calculate_class_probabilities``. The first class
    seen is always taken; a later class replaces it only when its score is
    strictly greater. Returns ``None`` when there are no classes.
    """
    scores = calculate_class_probabilities(summaries, input_vector)
    winner = None
    top_score = -1
    for label in scores:
        candidate = scores[label]
        if winner is not None and not candidate > top_score:
            continue
        winner, top_score = label, candidate
    return winner

def get_predictions(summaries, test_set):
    """Return the predicted class for each row of ``test_set``, in order."""
    return [predict(summaries, row) for row in test_set]

def get_accuracy(test_set, predictions):
    """Return the percentage of rows whose last element matches its prediction.

    Args:
        test_set: List of rows; the last element of each row is the true
            class.
        predictions: Predicted classes, aligned by position with
            ``test_set``.

    Returns:
        Accuracy as a float between 0.0 and 100.0. An empty ``test_set``
        yields 0.0 instead of dividing by zero.

    Raises:
        IndexError: If ``predictions`` is shorter than ``test_set``.
    """
    total = len(test_set)
    if total == 0:
        # Happens when the split leaves nothing for testing.
        return 0.0
    correct = sum(
        1 for i, row in enumerate(test_set) if row[-1] == predictions[i]
    )
    return (correct / float(total)) * 100.0

def main(filename='naivedata.csv', split_ratio=0.67, seed=None):
    """Train and evaluate the naive Bayes classifier on one CSV file.

    Args:
        filename: CSV file passed to ``load_csv``.
        split_ratio: Fraction of rows used for training.
        seed: Optional seed for the ``random`` module. When given, the
            train/test split (and therefore the reported accuracy) is
            reproducible; ``None`` leaves the generator state untouched.

    Returns:
        None. The split sizes and the accuracy are printed.
    """
    if seed is not None:
        random.seed(seed)
    dataset = load_csv(filename)
    training_set, test_set = split_dataset(dataset, split_ratio)
    print(f'Split {len(dataset)} rows into train={len(training_set)} and test={len(test_set)} rows')

    summaries = summarize_by_class(training_set)
    predictions = get_predictions(summaries, test_set)
    accuracy = get_accuracy(test_set, predictions)
    print(f'Accuracy of the classifier is: {accuracy}%')

main()


FileNotFoundError: [Errno 2] No such file or directory: 'naivedata.csv'

Exp - 1: Find-S algorithm (maximally specific hypothesis)

In [3]:
import pandas as pd

# Load the training examples. The code below reads an 'enjoysport' column
# (rows with value 'yes' are the positive examples) and treats every column
# except the last as an attribute.
df = pd.read_csv('enjoysport.csv')

def total_training_instances(df):
    """Return the number of rows in ``df``."""
    row_count = len(df)
    return row_count

# Report how many rows (training examples) the loaded frame contains.
total_instances = total_training_instances(df)
print("The total number of training instances are:", total_instances)

def initial_hypothesis(df):
    """Return the starting hypothesis: one ``'0'`` per column except the last."""
    attribute_count = len(df.columns) - 1
    return ['0' for _ in range(attribute_count)]

# Show the starting hypothesis: one '0' placeholder per column except the last.
init_hypothesis = initial_hypothesis(df)
print("The initial hypothesis is:", init_hypothesis)

def hypothesis_for_instances(df):
    """Trace Find-S, recording the hypothesis after each positive example.

    Args:
        df: DataFrame whose ``'enjoysport'`` column marks positive examples
            with ``'yes'``; every column except the last is treated as an
            attribute.

    Returns:
        A list with one hypothesis snapshot (a list of attribute values,
        ``'?'`` or ``'0'``) per positive row, in row order.
    """
    attribute_count = len(df.columns) - 1
    # Start from the most specific hypothesis: '0' means "nothing seen yet".
    specific_hypothesis = ['0'] * attribute_count
    hypotheses = []
    for _, row in df.iterrows():
        if row['enjoysport'] != 'yes':
            continue
        for i in range(attribute_count):
            # .iloc makes the positional lookup explicit; plain row[i] on a
            # label-indexed Series is deprecated in pandas.
            value = row.iloc[i]
            if specific_hypothesis[i] == '0':
                specific_hypothesis[i] = value
            elif specific_hypothesis[i] != value:
                specific_hypothesis[i] = '?'
        # Store a copy so later updates do not rewrite earlier snapshots.
        hypotheses.append(specific_hypothesis.copy())
    return hypotheses

# Print the hypothesis snapshot recorded after each 'yes' row. idx numbers the
# snapshots from 1, so it counts positive examples rather than rows of df.
hypotheses = hypothesis_for_instances(df)
for idx, h in enumerate(hypotheses, start=1):
    print(f"The hypothesis for the training instance {idx} is: {h}")

def find_s_algorithm(df):
    """Run Find-S and return the maximally specific hypothesis.

    Args:
        df: DataFrame whose ``'enjoysport'`` column marks positive examples
            with ``'yes'``; every column except the last is treated as an
            attribute.

    Returns:
        A list with one entry per attribute: the value shared by all
        positive rows, ``'?'`` where positive rows disagree, or ``'0'`` if
        there was no positive row.
    """
    attribute_count = len(df.columns) - 1
    # Start from the most specific hypothesis: '0' means "nothing seen yet".
    specific_hypothesis = ['0'] * attribute_count
    for _, row in df.iterrows():
        if row['enjoysport'] != 'yes':
            continue
        for i in range(attribute_count):
            # .iloc makes the positional lookup explicit; plain row[i] on a
            # label-indexed Series is deprecated in pandas.
            value = row.iloc[i]
            if specific_hypothesis[i] == '0':
                specific_hypothesis[i] = value
            elif specific_hypothesis[i] != value:
                specific_hypothesis[i] = '?'
    return specific_hypothesis

# Run Find-S over the whole frame and print the final hypothesis it returns.
max_specific_hypothesis = find_s_algorithm(df)
print("The Maximally specific hypothesis for the training instances is:", max_specific_hypothesis)


FileNotFoundError: [Errno 2] No such file or directory: 'enjoysport.csv'