<a href="https://colab.research.google.com/github/vinay2k2/DataScienceToolBox/blob/main/OneHotEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# One Hot Encoding
# Author Vinay Kumar

# Example sentences
sentence1 = "I like apples"
sentence2 = "She likes bananas"
sentence3 = "We eat oranges"

# Tokenize each sentence into words (whitespace-delimited, case-sensitive)
words_sentence1 = sentence1.split()
words_sentence2 = sentence2.split()
words_sentence3 = sentence3.split()

# Collect unique words to form the vocabulary
vocabulary = set(words_sentence1 + words_sentence2 + words_sentence3)

# Build the ordered vocabulary in first-appearance order. Converting the set
# directly with list() yields an arbitrary order that can change between
# interpreter runs (string hashing is randomized), which would shuffle the
# encoding columns from run to run. dict.fromkeys de-duplicates while keeping
# insertion order, so the column layout is reproducible.
vocabulary_list = list(
    dict.fromkeys(words_sentence1 + words_sentence2 + words_sentence3)
)

# One-hot encoding function
def one_hot_encoding(sentence, vocab):
    """
    Encode a sentence as a binary presence vector over a vocabulary.

    The sentence is split on whitespace and each token is matched against the
    vocabulary exactly (case-sensitive; punctuation is not stripped).

    Parameters:
    - sentence (str): The input sentence to be encoded.
    - vocab (list): The vocabulary list; position i of the result refers to vocab[i].

    Returns:
    - list: Vector of len(vocab) integers, 1 where the vocabulary word occurs in
            the sentence and 0 otherwise. If a word appears more than once in
            the vocabulary, only its first position is set.
    """
    # Map each vocabulary word to its first position once, so every token is
    # resolved with a single dict lookup instead of two linear list scans.
    # setdefault keeps the earliest index, matching list.index() semantics.
    positions = {}
    for index, vocab_word in enumerate(vocab):
        positions.setdefault(vocab_word, index)

    # Initialize the encoded vector with zeros
    encoding = [0] * len(vocab)

    # Flag the slot of every token that is known to the vocabulary
    for word in sentence.split():
        index = positions.get(word)
        if index is not None:
            encoding[index] = 1

    return encoding

# Encode every example sentence against the shared vocabulary
encoding1, encoding2, encoding3 = [
    one_hot_encoding(text, vocabulary_list)
    for text in (sentence1, sentence2, sentence3)
]

# Report the vocabulary, followed by one line per encoded sentence
print(f"Vocabulary list {vocabulary_list}")
print(
    "\n".join(
        f"One-hot encoding for '{text}': {vector}"
        for text, vector in zip(
            (sentence1, sentence2, sentence3),
            (encoding1, encoding2, encoding3),
        )
    )
)

import pandas as pd
# Column-oriented table: the vocabulary plus one 0/1 column per sentence
data = dict(
    zip(
        ('Word', sentence1, sentence2, sentence3),
        (vocabulary_list, encoding1, encoding2, encoding3),
    )
)

# Index by word, then flip the table so that each sentence becomes a row
df = pd.DataFrame(data).set_index('Word').T

# Show the resulting sentence-by-word table
print(df)

import unittest


class TestOneHotEncoding(unittest.TestCase):
    """Unit tests for ``one_hot_encoding`` against a fixed nine-word vocabulary."""

    def setUp(self):
        """Create the shared vocabulary afresh before each test."""
        self.vocab = "I like apples She likes bananas We eat oranges".split()

    def test_basic_example(self):
        """A sentence made of the first three vocabulary words sets those slots."""
        encoded = one_hot_encoding("I like apples", self.vocab)
        self.assertEqual(encoded, [1] * 3 + [0] * 6)

    def test_words_not_in_vocab(self):
        """Words absent from the vocabulary leave the vector all zeros."""
        encoded = one_hot_encoding("He enjoys grapes", self.vocab)
        self.assertEqual(encoded, [0] * 9)

    def test_empty_sentence(self):
        """An empty sentence yields an all-zero vector."""
        encoded = one_hot_encoding("", self.vocab)
        self.assertEqual(encoded, [0] * 9)

    # Add more test cases as needed

if __name__ == '__main__':
    # This code runs as a notebook cell, where sys.argv holds the kernel's own
    # launcher arguments. A bare unittest.main() would try to interpret those
    # as test names and would then call sys.exit(), raising SystemExit inside
    # the cell. Pass a placeholder argv (only the program-name slot, which is
    # ignored) and disable the exit so the tests run and report in place.
    unittest.main(argv=['first-arg-is-ignored'], exit=False)