<a href="https://colab.research.google.com/github/zikili/Python-AI-Algorithms/blob/main/Matala2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import collections

class NaiveBayes:
    """Categorical Naive Bayes classifier with additive (Laplace) smoothing.

    Every training instance is a list of string values whose LAST element is
    the class label.  After ``train`` the model holds, for each class ``c``,
    column ``a`` and value ``v`` seen in training::

        probabilities[c][a][v] = (count(a=v, class=c) + alpha)
                                 / (count(class=c) + alpha * |values of a|)

    i.e. a smoothed estimate of P(a=v | c).  Classification scores a class by
    its prior P(c) times the product of these likelihoods.
    """

    def __init__(self):
        self.classes = []  # class labels, in order of first appearance
        self.attribute_values = collections.defaultdict(set)  # column name -> values seen
        self.training_data = []  # every instance passed to train() so far
        self.atts = None  # column names used by the last train() call
        self.probabilities = collections.defaultdict(
            lambda: collections.defaultdict(lambda: collections.defaultdict(float))
        )
        self.smoothing_factor = 1.0  # additive smoothing constant (alpha)
        self.class_counts = collections.Counter()  # class label -> number of training rows

    def parse_examples(self, examples):
        """Split whitespace-separated text into a header and data rows.

        Args:
            examples: Text whose first line holds the column names and whose
                remaining lines hold one instance each.

        Returns:
            A dict with ``"attributes"`` (tokens of the first line) and
            ``"parsed_examples"`` (one token list per following line).  A
            blank line, including the one produced by a trailing newline,
            yields an empty list; ``train`` ignores such rows.
        """
        lines = examples.split("\n")
        return {
            "parsed_examples": [line.split() for line in lines[1:]],
            "attributes": lines[0].split(),
        }

    def train(self, input_data, attribute_names):
        """Add instances to the training set and re-estimate all probabilities.

        Args:
            input_data: Iterable of instances (lists of values, class label
                last).  Empty rows are skipped.
            attribute_names: Column names aligned with the instance values,
                including the name of the class column.

        Raises:
            ValueError: If an instance has more values than there are names.
        """
        rows = [instance for instance in input_data if instance]
        for instance in rows:
            if len(instance) > len(attribute_names):
                raise ValueError(
                    f"instance {instance!r} has more values than the "
                    f"{len(attribute_names)} attribute names given"
                )

        for instance in rows:
            classification = instance[-1]
            if classification not in self.classes:
                self.classes.append(classification)
            for attribute_name, attribute_value in zip(attribute_names, instance):
                self.attribute_values[attribute_name].add(attribute_value)
            self.training_data.append(instance)

        self.atts = attribute_names

        # One pass over the data gathers every count the estimates need.
        self.class_counts = collections.Counter(row[-1] for row in self.training_data)
        joint_counts = collections.Counter()
        for row in self.training_data:
            for attribute_name, attribute_value in zip(self.atts, row):
                joint_counts[(row[-1], attribute_name, attribute_value)] += 1

        # Smoothed likelihoods P(value | class).  The denominator is the
        # number of rows of the class (not of the value), so the estimates
        # for one column sum to 1 within each class.
        alpha = self.smoothing_factor
        for classification in self.classes:
            class_total = self.class_counts[classification]
            for attribute_name, values in self.attribute_values.items():
                denominator = class_total + alpha * len(values)
                for attribute_value in values:
                    count = joint_counts[(classification, attribute_name, attribute_value)]
                    self.probabilities[classification][attribute_name][attribute_value] = (
                        (count + alpha) / denominator
                    )

    def predict(self, example):
        """Return the class with the highest score for ``example``.

        Ties go to the class seen first in training.  Returns ``""`` when the
        model has not been trained.
        """
        return max(
            self.classes,
            key=lambda classification: self.get_class_probability(example, classification),
            default="",
        )

    def get_class_probability(self, example, classification):
        """Return the unnormalised score P(c) * prod_i P(example[i] | c).

        Args:
            example: Attribute values in training column order, WITHOUT the
                class label.
            classification: The class to score.

        Returns:
            The score as a float; ``0.0`` for a class never seen in training.
        """
        class_total = self.class_counts.get(classification, 0)
        if class_total == 0 or not self.training_data:
            return 0.0

        probability = class_total / len(self.training_data)  # class prior
        # .get() keeps lookups from inserting keys into the nested defaultdicts.
        per_attribute = self.probabilities.get(classification, {})
        for attribute_name, attribute_value in zip(self.atts, example):
            likelihood = per_attribute.get(attribute_name, {}).get(attribute_value)
            if likelihood is None:
                # A value never observed in training carries no evidence for
                # any class; skip it instead of zeroing every class score.
                continue
            probability *= likelihood
        return probability

    def count_occurrences(self, data, attribute_name, attribute_value, classification):
        """Count instances in ``data`` with the given attribute value and class."""
        index = self.get_index(attribute_name)
        return sum(
            1
            for instance in data
            if instance[-1] == classification and instance[index] == attribute_value
        )

    def count_total_occurrences(self, data, attribute_name, attribute_value):
        """Count instances in ``data`` with the given attribute value."""
        index = self.get_index(attribute_name)
        return sum(1 for instance in data if instance[index] == attribute_value)

    def get_index(self, attribute):
        """Return the column position of ``attribute`` in ``self.atts``."""
        return self.atts.index(attribute)



In [None]:
import math
import re


class ID3:
    """ID3 decision-tree learner for whitespace-separated categorical data.

    The class label is the last column of every example and is treated as
    binary: the label "yes" is the positive class and every other label is
    counted as negative (and reported as "no").

    Before building or querying a tree the caller must set ``atts`` (the
    ordered attribute names, without the class column) and fill ``map``
    (attribute name -> list of values to branch on), for example with
    ``getAttributeValues``.
    """

    def __init__(self):
        self.atts = None  # ordered attribute names (class column excluded)
        self.finalAtt = None  # name of the class column
        self.map = {}  # attribute name -> list of values to branch on

    def parseExamples(self, examples):
        """Parse a header line plus one example per following line.

        Sets ``self.finalAtt`` to the last header token.  Blank lines
        (including the one caused by a trailing newline) are ignored, so the
        last example is kept whether or not the text ends with a newline.

        Returns:
            A dict with ``"attributes"`` (header tokens without the class
            column) and ``"parsedExamples"`` (one token list per example).
        """
        lines = examples.split("\n")
        # str.split() ignores leading/trailing whitespace (e.g. a stray "\r"),
        # which would otherwise show up as an empty last token.
        parts = lines[0].split()
        self.finalAtt = parts[-1] if parts else ""
        attributeNames = parts[:-1]
        parsedExamples = [line.split() for line in lines[1:] if line.strip()]

        return {
            "parsedExamples": parsedExamples,
            "attributes": attributeNames,
        }

    class Node:
        """Tree node: a leaf when ``classification`` is set, else a split."""

        def __init__(self, attribute):
            self.attribute = attribute  # attribute tested here (class column name for leaves)
            self.children = {}  # attribute value -> child Node
            self.classification = None  # predicted label; None for internal nodes
            self.default = None  # majority label of the examples that reached this node

    def buildDecisionTree(self, examples, attributes, defValue):
        """Recursively build the tree.

        Args:
            examples: Token lists with the class label last.
            attributes: Attribute names still available for splitting.
            defValue: Label used for a branch that no example reaches.

        Returns:
            The root ``Node`` of the (sub)tree.
        """
        if len(examples) == 0:  # no example reaches this branch
            leaf = self.Node(self.finalAtt)
            leaf.classification = defValue
            return leaf
        majorityClass = self.findMajorityClass(examples)
        pure = self.allExamplesHaveSameClassification(examples, majorityClass)
        if pure or len(attributes) == 0:
            leaf = self.Node(self.finalAtt)
            leaf.classification = majorityClass
            return leaf
        bestAttribute = self.findBestAttribute(examples, attributes)
        root = self.Node(bestAttribute)
        # Remembered so classify() has an answer for values without a branch.
        root.default = majorityClass
        remaining = self.removeAttribute(attributes, bestAttribute)
        for attributeValue in self.map[bestAttribute]:
            subset = self.getSubset(examples, bestAttribute, attributeValue)
            root.children[attributeValue] = self.buildDecisionTree(subset, remaining, majorityClass)
        return root

    def findMajorityClass(self, examples):
        """Return "yes" if at least half of the labels are "yes", else "no"."""
        positiveCount = sum(1 for example in examples if example[-1] == "yes")
        negativeCount = len(examples) - positiveCount
        return "yes" if positiveCount >= negativeCount else "no"

    def allExamplesHaveSameClassification(self, examples, classification):
        """Return True when every example carries the label ``classification``."""
        return all(example[-1] == classification for example in examples)

    def findBestAttribute(self, examples, attributes):
        """Return the attribute with the largest information gain (first wins ties)."""
        maxGain = -1
        bestAttribute = None
        for attribute in attributes:
            gain = self.calculateInformationGain(examples, attribute)
            if gain > maxGain:
                maxGain = gain
                bestAttribute = attribute
        return bestAttribute

    def calculateInformationGain(self, examples, attribute):
        """Return entropy(examples) minus the weighted entropy after splitting."""
        gain = self.calculateEntropy(examples)
        for subset in self.getSubsets(examples, attribute).values():
            proportion = len(subset) / len(examples)
            gain -= proportion * self.calculateEntropy(subset)
        return gain

    def calculateEntropy(self, examples):
        """Return the binary ("yes" vs. rest) entropy of the labels, in nats.

        Every example is counted; an empty or pure set has entropy 0.
        """
        if not examples:
            return 0
        positiveCount = sum(1 for example in examples if example[-1] == "yes")
        positiveProportion = positiveCount / len(examples)
        negativeProportion = 1 - positiveProportion
        if positiveCount == 0 or positiveCount == len(examples):
            return 0
        return (-positiveProportion * math.log(positiveProportion)
                - negativeProportion * math.log(negativeProportion))

    def getSubsets(self, examples, attribute):
        """Group all examples by their value of ``attribute``."""
        index = self.getIndexShort(attribute)
        subsets = {}
        for example in examples:
            subsets.setdefault(example[index], []).append(example)
        return subsets

    def getIndexShort(self, attribute):
        """Return the column position of ``attribute`` in ``self.atts``."""
        return self.getAttributes().index(attribute)

    def getAttributeValues(self, examples, attribute):
        """Return the distinct values of ``attribute`` in first-seen order."""
        attributeValues = []
        index = self.getIndexShort(attribute)
        for example in examples:
            value = example[index]
            if value not in attributeValues:
                attributeValues.append(value)
        return attributeValues

    def getSubset(self, examples, attribute, value):
        """Return copies of all examples whose ``attribute`` equals ``value``."""
        index = self.getIndexShort(attribute)
        # Copies keep later changes to a subset from touching the originals.
        return [example.copy() for example in examples if example[index] == value]

    def removeAttribute(self, attributes, attributeToRemove):
        """Return a copy of ``attributes`` without ``attributeToRemove``."""
        remainingAttributes = attributes.copy()
        remainingAttributes.remove(attributeToRemove)
        return remainingAttributes

    def printTree(self, root, depth, out=None):
        """Walk the tree; write one indented line per branch/leaf to ``out``.

        With the default ``out=None`` nothing is emitted, so existing calls
        stay silent.  Pass a text stream such as ``sys.stdout`` to see the tree.
        """
        if root is None:
            return
        sb = "  " * depth
        if root.classification is not None:
            if out is not None:
                out.write(sb + root.attribute + "=" + root.classification + "\n")
            return
        for attributeValue, child in root.children.items():
            if out is not None:
                out.write(sb + root.attribute + "=" + attributeValue + "\n")
            self.printTree(child, depth + 1, out)

    def getIndex(self, attribute, attributes):
        """Return the position of ``attribute`` in ``attributes``, or -1."""
        for i, candidate in enumerate(attributes):
            if candidate == attribute:
                return i
        return -1  # attribute not found

    def classify(self, root, instance, attributes):
        """Return the label the tree rooted at ``root`` assigns to ``instance``.

        ``instance`` is indexed through ``self.atts``; ``attributes`` is kept
        for interface compatibility and is not used.  When the instance has a
        value with no branch, the node's stored majority label is returned
        (``None`` only for hand-built nodes that never set ``default``).
        """
        node = root
        while node.classification is None:
            attributeValue = instance[self.getIndexShort(node.attribute)]
            child = node.children.get(attributeValue)
            if child is None:
                return node.default
            node = child
        return node.classification

    def testClassification(self, decisionTree, attributes, instance):
        """Classify ``instance``, print it with the result, and return the label."""
        classification = self.classify(decisionTree, instance, attributes)
        print("Instance: " + str(instance))
        # str() keeps the report from crashing if no label could be derived.
        print("Classification: " + str(classification))
        return classification

    def getAttributes(self):
        """Return the attribute names set by the caller (``self.atts``)."""
        return self.atts

    def printTreeToFile(self, root, depth, fileName):
        """Write the tree to ``fileName``, overwriting any existing file."""
        with open(fileName, "w") as writer:
            self.writeTreeToFile(root, depth, "", writer)

    def writeTreeToFile(self, root, depth, prefix, writer):
        """Write the subtree at ``root`` to ``writer``, one line per branch/leaf.

        Lines are indented two spaces per ``depth``; children are written
        with the prefix "|".
        """
        if root is None:
            return
        sb = "  " * depth
        if root.classification is not None:
            writer.write(sb + prefix + root.attribute + "=" + root.classification)
            writer.write("\n")
            return
        for attributeValue, child in root.children.items():
            writer.write(sb + prefix + root.attribute + "=" + attributeValue)
            writer.write("\n")
            self.writeTreeToFile(child, depth + 1, "|", writer)




In [None]:

import os

def NaiveBayesMethod():
    """Train Naive Bayes on ``train.txt`` and evaluate it on ``test.txt``.

    Both files are whitespace-separated with a header line; the last column
    is the class label.

    Returns:
        A list with one predicted label per test instance followed by the
        accuracy (as a string) in the last position.  The accuracy is
        ``"0.0"`` when the test file has no instances.
    """
    naiveBayes = NaiveBayes()
    file_path = "train.txt"
    with open(file_path, "r") as file:
        examples = file.read()

    parsed_data = naiveBayes.parse_examples(examples)
    attribute_names = parsed_data["attributes"]
    # Drop only blank rows (such as the one produced by a trailing newline)
    # rather than blindly discarding the last row, which may be real data.
    input_data = [row for row in parsed_data["parsed_examples"] if row]

    naiveBayes.train(input_data, attribute_names)

    test_path = "test.txt"
    with open(test_path, "r") as test_file:
        tests = test_file.read()

    parsed_test = naiveBayes.parse_examples(tests)
    # A blank row has no label to compare against, so it is not a test case.
    parsed_tests = [row for row in parsed_test["parsed_examples"] if row]
    count_right_class = 0
    res = []
    for s in parsed_tests:
        pred = naiveBayes.predict(s[:-1])  # the last column is the true label
        res.append(pred)
        if pred == s[-1]:
            count_right_class += 1

    accuracy = count_right_class / len(parsed_tests) if parsed_tests else 0.0
    res.append(str(accuracy))
    return res

def id3Method():
    """Build an ID3 tree from ``train.txt`` and evaluate it on ``test.txt``.

    Prints the parsed test set and each classified instance, and writes the
    tree to ``output_tree.txt``.

    Returns:
        A list with one predicted label per test instance followed by the
        accuracy (as a string) in the last position.  The accuracy is
        ``"0.0"`` when the test file has no instances.
    """
    id3 = ID3()
    file_path = "train.txt"
    with open(file_path, "r") as file:
        examples = file.read()

    parsed_data = id3.parseExamples(examples)
    parsed_examples = parsed_data["parsedExamples"]
    attributes = parsed_data["attributes"]
    id3.atts = attributes.copy()
    for attr in attributes:
        id3.map[attr] = id3.getAttributeValues(parsed_examples, attr)

    # The fallback label must be one of the labels actually used in the data
    # (lowercase "yes"/"no"); a capitalised "Yes" could never match a test label.
    default_class = id3.findMajorityClass(parsed_examples)
    decision_tree = id3.buildDecisionTree(parsed_examples, attributes, default_class)

    test_path = "test.txt"
    with open(test_path, "r") as test_file:
        tests = test_file.read()

    parsed_test = id3.parseExamples(tests)
    parsed_tests = parsed_test["parsedExamples"]
    print (parsed_tests)
    res = []
    count = 0
    for s in parsed_tests:
        print(s)
        classification = id3.testClassification(decision_tree, id3.atts, s)
        if classification == s[-1]:
            count += 1
        res.append(classification)
    accuracy = count / len(parsed_tests) if parsed_tests else 0.0
    res.append(str(accuracy))
    id3.printTree(decision_tree, 0)
    id3.printTreeToFile(decision_tree, 0, "output_tree.txt")
    return res


# Run both classifiers (ID3 first, then Naive Bayes) and write a combined
# report.  Each result list holds the predictions followed by the accuracy
# string in its last position.
id3_list = id3Method()
nb_list = NaiveBayesMethod()
file_path = "output.txt"
with open(file_path, "w") as file:
    # Naive Bayes section: one line per prediction, then the accuracy.
    file.write("Naive Bayes\n")
    file.writelines(f"\t|{prediction}\n" for prediction in nb_list[:-1])
    file.write(f"Accuracy: {nb_list[-1]}\n\n")
    # ID3 section, same layout.
    file.write("ID3\n")
    file.writelines(f"\t|{prediction}\n" for prediction in id3_list[:-1])
    file.write(f"Accuracy: {id3_list[-1]}\n")





[['crew', 'adult', 'male', 'no'], ['crew', 'adult', 'male', 'yes'], ['crew', 'adult', 'male', 'no'], ['3rd', 'adult', 'male', 'no'], ['2nd', 'adult', 'male', 'no'], ['crew', 'adult', 'male', 'no'], ['2nd', 'adult', 'male', 'no'], ['crew', 'adult', 'male', 'yes'], ['crew', 'adult', 'male', 'no'], ['crew', 'adult', 'male', 'no'], ['3rd', 'adult', 'male', 'no'], ['3rd', 'adult', 'male', 'no'], ['3rd', 'adult', 'male', 'no'], ['crew', 'adult', 'male', 'no'], ['crew', 'adult', 'male', 'no'], ['3rd', 'adult', 'female', 'yes'], ['crew', 'adult', 'male', 'no'], ['1st', 'adult', 'female', 'yes'], ['3rd', 'adult', 'female', 'no'], ['crew', 'adult', 'male', 'no'], ['3rd', 'adult', 'female', 'no'], ['2nd', 'adult', 'female', 'yes'], ['3rd', 'adult', 'male', 'no']]
['crew', 'adult', 'male', 'no']
Instance: ['crew', 'adult', 'male', 'no']
Classification: no
['crew', 'adult', 'male', 'yes']
Instance: ['crew', 'adult', 'male', 'yes']
Classification: no
['crew', 'adult', 'male', 'no']
Instance: ['crew'