# One-Hot-Encoding

In [20]:
import spacy
from sklearn.feature_extraction.text import CountVectorizer
nlp = spacy.load('en_core_web_sm')

# Sample sentence used for the one-hot example in this cell.
# NOTE(review): "monts" looks like a typo for "months"; it is left unchanged
# because the recorded vocabulary output of this cell contains 'monts'.
text="Jim loves NLP. He will learn NLP in two monts. NLP is future."
def get_spacy_tokens(text):
    """Split ``text`` into tokens using the module-level spaCy pipeline ``nlp``.

    Args:
        text: The string to tokenize.

    Returns:
        A list with the ``.text`` of every token, in document order.
        Punctuation marks are returned as tokens of their own (the recorded
        vocabulary of this cell contains '.').
    """
    return [tok.text for tok in nlp(text)]
# Every token becomes its own one-word "document", so each row of the
# resulting matrix has a single 1 in the column of that token.
text_tokens = get_spacy_tokens(text)

# lowercase=False keeps the original casing ('He', 'NLP'); token_pattern=None
# because tokenization is delegated to the spaCy-based tokenizer instead of
# the built-in regular expression.
vectorizer = CountVectorizer(
    tokenizer=get_spacy_tokens,
    lowercase=False,
    token_pattern=None,
).fit(text_tokens)
print("Vocabulary: ", vectorizer.vocabulary_)

# One row per token, one column per vocabulary entry.
vector = vectorizer.transform(text_tokens)
print("Encoded Document is:")
print(vector.toarray())

Vocabulary:  {'Jim': 2, 'loves': 8, 'NLP': 3, '.': 0, 'He': 1, 'will': 11, 'learn': 7, 'in': 5, 'two': 10, 'monts': 9, 'is': 6, 'future': 4}
Encoded Document is:
[[0 0 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 1 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0]]


## Products

In [21]:
import pandas as pd
# Load the product table, one-hot encode the "kategorie" column as 0/1
# integers, then lower-case every column label (including the generated
# dummy columns).
df = (
    pd.read_csv('products.csv')
    .pipe(pd.get_dummies, columns=["kategorie"], dtype=int)
    .rename(columns=str.lower)
)
df

Unnamed: 0,produkt_id,kategorie_bücher,kategorie_elektronik,kategorie_kleidung
0,1,0,1,0
1,2,0,0,1
2,3,1,0,0
3,4,0,1,0
4,5,0,1,0
5,6,1,0,0
6,7,0,0,1
7,8,1,0,0
8,9,0,1,0
9,10,0,0,1


# Count Vectorizer

In [22]:
# Toy corpus of four short documents; every CountVectorizer example below is
# fitted on it. This rebinds `text`, which held a single string in the
# one-hot example above, to a list of strings.
# NOTE(review): "Tom loves in the United States" looks like a typo for
# "lives in"; it is left unchanged because the recorded outputs below contain
# the token 'loves'.
text = ["Tom's family includes 5 kids 2 dogs and 1 cat",
        "The dogs are friendly. The cat is beautiful",
        "The dog is 11 years old",
        "Tom loves in the United States of America"]

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# Baseline: learn the vocabulary of the corpus with CountVectorizer's
# default settings.
vectorizer = CountVectorizer()
vectorizer.fit(text)

In [25]:
# Show the learned terms. With the default settings the terms are lower-cased
# and one-character tokens ('1', '2', '5', the "s" of "Tom's") do not appear.
print(vectorizer.get_feature_names_out())

['11' 'america' 'and' 'are' 'beautiful' 'cat' 'dog' 'dogs' 'family'
 'friendly' 'in' 'includes' 'is' 'kids' 'loves' 'of' 'old' 'states' 'the'
 'tom' 'united' 'years']


In [26]:
# Same corpus, but with a token pattern that accepts runs of one or more word
# characters, so single-character tokens ('1', '2', '5', 's') are kept too.
vectorizer = CountVectorizer(
    token_pattern=u'(?u)\\b\\w+\\b',
).fit(text)
print(vectorizer.get_feature_names_out())

['1' '11' '2' '5' 'america' 'and' 'are' 'beautiful' 'cat' 'dog' 'dogs'
 'family' 'friendly' 'in' 'includes' 'is' 'kids' 'loves' 'of' 'old' 's'
 'states' 'the' 'tom' 'united' 'years']


In [27]:
# Keep the permissive token pattern and additionally drop a hand-picked list
# of stop words from the vocabulary.
vectorizer = CountVectorizer(
    stop_words=['is','are','and','in','the'],
    token_pattern=u'(?u)\\b\\w+\\b',
).fit(text)
print(vectorizer.get_feature_names_out())

['1' '11' '2' '5' 'america' 'beautiful' 'cat' 'dog' 'dogs' 'family'
 'friendly' 'includes' 'kids' 'loves' 'of' 'old' 's' 'states' 'tom'
 'united' 'years']


In [28]:
# Mapping from each term to its column index in the encoded matrix.
print(vectorizer.vocabulary_)

{'tom': 18, 's': 16, 'family': 9, 'includes': 11, '5': 3, 'kids': 12, '2': 2, 'dogs': 8, '1': 0, 'cat': 6, 'friendly': 10, 'beautiful': 5, 'dog': 7, '11': 1, 'years': 20, 'old': 15, 'loves': 13, 'united': 19, 'states': 17, 'of': 14, 'america': 4}


In [29]:
# Encode the corpus with the most recently fitted vectorizer and show the
# matrix dimensions (the recorded result is 4 documents x 21 terms).
vector = vectorizer.transform(text)
vector.shape

(4, 21)

In [30]:
# Print the encoded corpus as a plain array: one row per document, one column
# per vocabulary term.
print(vector.toarray())

[[1 0 1 1 0 0 1 0 1 1 0 1 1 0 0 0 1 0 1 0 0]
 [0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 0]]


In [31]:
import pandas as pd

In [32]:
# Label each column of the count matrix with its term so the encoding can be
# read as a table.
pd.DataFrame(
    data=vector.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

Unnamed: 0,1,11,2,5,america,beautiful,cat,dog,dogs,family,...,includes,kids,loves,of,old,s,states,tom,united,years
0,1,0,1,1,0,0,1,0,1,1,...,1,1,0,0,0,1,0,1,0,0
1,0,0,0,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0,0,0,0,1,0,0,0,0,0,...,0,0,1,1,0,0,1,1,1,0


In [33]:
# Extend the vocabulary from single words to every n-gram of 1 to 4 words.
# The recorded output contains n-grams such as 'loves united', i.e. the
# n-grams span the positions of removed stop words.
vectorizer = CountVectorizer(
    ngram_range=(1,4),
    stop_words=['is','are','and','in','the'],
    token_pattern=u'(?u)\\b\\w+\\b',
).fit(text)
print(vectorizer.get_feature_names_out())

['1' '1 cat' '11' '11 years' '11 years old' '2' '2 dogs' '2 dogs 1'
 '2 dogs 1 cat' '5' '5 kids' '5 kids 2' '5 kids 2 dogs' 'america'
 'beautiful' 'cat' 'cat beautiful' 'dog' 'dog 11' 'dog 11 years'
 'dog 11 years old' 'dogs' 'dogs 1' 'dogs 1 cat' 'dogs friendly'
 'dogs friendly cat' 'dogs friendly cat beautiful' 'family'
 'family includes' 'family includes 5' 'family includes 5 kids' 'friendly'
 'friendly cat' 'friendly cat beautiful' 'includes' 'includes 5'
 'includes 5 kids' 'includes 5 kids 2' 'kids' 'kids 2' 'kids 2 dogs'
 'kids 2 dogs 1' 'loves' 'loves united' 'loves united states'
 'loves united states of' 'of' 'of america' 'old' 's' 's family'
 's family includes' 's family includes 5' 'states' 'states of'
 'states of america' 'tom' 'tom loves' 'tom loves united'
 'tom loves united states' 'tom s' 'tom s family' 'tom s family includes'
 'united' 'united states' 'united states of' 'united states of america'
 'years' 'years old']


In [34]:
# Same settings as the n-gram example, but with an explicit vocabulary: the
# feature set is then exactly the supplied list (a single 4-gram here)
# instead of everything found in the corpus.
vectorizer = CountVectorizer(
    vocabulary=['united states of america'],
    ngram_range=(1,4),
    stop_words=['is','are','and','in','the'],
    token_pattern=u'(?u)\\b\\w+\\b',
).fit(text)
print(vectorizer.get_feature_names_out())

['united states of america']
