In [20]:
import numpy as np
from scipy.stats import itemfreq
import pandas as pd

In [21]:
def vectorize_terms(terms):
    """Convert each term into a vector of lowercase character code points.

    Parameters
    ----------
    terms : iterable of str
        Words to vectorize. Each one is lowercased before encoding.

    Returns
    -------
    list of numpy.ndarray
        One integer array per term holding ``ord(char)`` for every character
        of the lowercased term, in order. An empty term yields an empty
        integer array.
    """
    # Encode straight from the Python string. Routing the characters through
    # a NumPy string array first is unnecessary and lossy: NumPy strips
    # trailing NULs from fixed-width strings, so a NUL character would come
    # back as an empty string and make ord() raise.
    # dtype=int keeps empty terms as integer arrays instead of float64.
    return [np.array([ord(char) for char in term.lower()], dtype=int)
            for term in terms]

# Sample vocabulary: the reference word followed by the candidate terms.
root, term1, term2, term3 = 'Believe', 'Beleive', 'bargain', 'Elephant'
terms = [root, term1, term2, term3]

# Character vectorization: one row of code points per term. Terms shorter
# than the longest one are padded with NaN when the frame is built.
term_vectors = vectorize_terms(terms)
vec_df = pd.DataFrame(term_vectors, index=terms)

# Split the vocabulary into the reference term and the terms compared to it.
root_term = root
other_terms = [term1, term2, term3]

# Pull each term's row back out of the frame, dropping the NaN padding
# columns so every vector has the term's own length.
root_term_vec = (
    vec_df.loc[vec_df.index == root_term]
    .dropna(axis=1)
    .values[0]
)
other_term_vecs = [
    vec_df.loc[vec_df.index == term].dropna(axis=1).values[0]
    for term in other_terms
]

In [22]:
def boc_term_vectors(word_list):
    """Build bag-of-characters count vectors for a list of words.

    Parameters
    ----------
    word_list : iterable of str
        Words to vectorize. Each one is lowercased before counting.

    Returns
    -------
    tuple (list of str, list of numpy.ndarray)
        The sorted list of distinct characters found across all words, and
        one integer array per word giving how often each of those characters
        occurs in that word (aligned with the character list).
        An empty ``word_list`` yields ``([], [])``.
    """
    words = [word.lower() for word in word_list]

    # Character vocabulary across all words, sorted by code point so the
    # column order is deterministic. A set comprehension copes with an empty
    # word list, where stacking arrays would raise.
    unique_chars = sorted({char for word in words for char in word})

    # Count directly on the string: keeps the counts as integers instead of
    # passing them through a mixed string/integer array and parsing them back.
    boc_vectors = [np.array([word.count(char) for char in unique_chars],
                            dtype=int)
                   for word in words]
    return unique_chars, boc_vectors

In [24]:
# Bag-of-characters vectorization: one row per term, one column per
# distinct character, each cell holding that character's count in the term.
feature_names, feature_vectors = boc_term_vectors(terms)
boc_df = pd.DataFrame(data=feature_vectors,
                      index=terms,
                      columns=feature_names)
print(boc_df)

          a  b  e  g  h  i  l  n  p  r  t  v
Believe   0  1  3  0  0  1  1  0  0  0  0  1
Beleive   0  1  3  0  0  1  1  0  0  0  0  1
bargain   2  1  0  1  0  1  0  1  0  1  0  0
Elephant  1  0  2  0  1  0  1  1  1  0  1  0


In [25]:
# Select each term's count row from boc_df using boc_df's own index.
# Masking with another frame's index only works while both frames happen to
# share the same row labels in the same order.
root_term_boc = boc_df[boc_df.index == root_term].values[0]
other_term_bocs = [boc_df[boc_df.index == term].values[0]
                   for term in other_terms]

In [26]:
def cosine_distance(u, v):
    """Return the cosine distance ``1 - cos(u, v)`` between two 1-D vectors.

    Parameters
    ----------
    u, v : array-like of numbers
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        A value in ``[0, 2]``: 0 for vectors pointing the same way, 1 for
        orthogonal vectors, 2 for opposite vectors. NaN when either vector
        has zero length, because the angle is undefined.
    """
    # Work in float so squaring large integer counts cannot overflow.
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)

    norm_product = np.sqrt(np.dot(u, u)) * np.sqrt(np.dot(v, v))
    if norm_product == 0:
        # Zero-length vector: return NaN explicitly instead of dividing 0/0.
        return np.nan

    # Rounding error can push the ratio just outside [-1, 1], which would
    # yield a tiny negative distance (shown as "-0.0") for identical vectors.
    similarity = np.clip(np.dot(u, v) / norm_product, -1.0, 1.0)
    return 1.0 - similarity

In [27]:
# Report the cosine distance and similarity between the root term and every
# other term, based on their bag-of-characters vectors.
for term, boc_term in zip(other_terms, other_term_bocs):
    header = 'Analyzing similarity between root: {} and term: {}'
    print(header.format(root_term, term))

    # Both figures are rounded to two decimals; similarity is derived from
    # the already-rounded distance.
    distance = round(cosine_distance(root_term_boc, boc_term), 2)
    similarity = round(1 - distance, 2)

    print('\n'.join([
        'Cosine distance  is {}'.format(distance),
        'Cosine similarity  is {}'.format(similarity),
        '-'*40,
    ]))

Analyzing similarity between root: Believe and term: Beleive
Cosine distance  is -0.0
Cosine similarity  is 1.0
----------------------------------------
Analyzing similarity between root: Believe and term: bargain
Cosine distance  is 0.82
Cosine similarity  is 0.18
----------------------------------------
Analyzing similarity between root: Believe and term: Elephant
Cosine distance  is 0.39
Cosine similarity  is 0.61
----------------------------------------
