In [1]:
from chatterbot.logic import LogicAdapter
import numpy as np
import math
import pandas as pd
import string
import json
from gensim import corpora, models, similarities
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.similarities import Similarity
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from gensim.models import LdaModel
from stop_words import get_stop_words
from pandas import DataFrame
from difflib import SequenceMatcher


In [2]:
SOME_FIXED_SEED = 43
np.random.seed(SOME_FIXED_SEED)

In [3]:
def euclidean_distance(x,y):
    return math.sqrt(sum(pow(a-b,2) for a, b in zip(x, y)))
 
def manhattan_distance(x,y):
    return sum(abs(a-b) for a,b in zip(x,y))
 
def jaccard_similarity(x,y):
    intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
    union_cardinality = len(set.union(*[set(x), set(y)]))
    return intersection_cardinality/float(union_cardinality)

def cosine(vector1, vector2):
    tmp = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if tmp == 0:
        return 0
    else:
        return float(np.dot(vector1,vector2) / tmp)
    
def KL(a, b):
    a = np.asarray(a, dtype=np.float)
    b = np.asarray(b, dtype=np.float)
    return np.sum(np.where(a != 0, a * np.log(a / b), 0))

In [4]:
def load_statement():
    statements = pd.read_csv('../data/statements.csv', header=None, encoding = "ISO-8859-1")
#     statements = pd.read_csv('../data/sample.csv', header=None, encoding = "ISO-8859-1")
    return statements[1]

In [5]:
def load_data():
    with open("../data/consumer_credit_data.json",'r') as load_f:
        load_dict = json.load(load_f, encoding='utf-8')
        id_list = []
        data_list = []

        for item in load_dict['questions']:
            id_list.append(item['_id'])
            data_list.append(item['label'] + ' ' +item['externalComment']+ ' ' + item.get('tip', '')+ ' ' + item.get('internalComment', ''))

        for item in load_dict['licences']:
            if not(item.get('externalComment', '').find('This activity is exempted. You do not need to be authorised.')):
                id_list.append(item['_id'])
                data_list.append(item['label'] + ' ' + item.get('externalComment', ''))
                
    return [data_list, id_list]

In [6]:
def splitWordByLibrary(documents):
    texts = []
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()

    for i in documents:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
     
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        
        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        
        # add tokens to list
        texts.append(stemmed_tokens)
    return texts


In [7]:
def splitWord(documents):
    #split the sentence into word and remove the stop word
    texts = []
    stoplist=set('for a of the and to in at after with do i was am an Do its so need on if be were are is who we fca'.split())  
    for document in documents:
        document = document.translate(str.maketrans('','',string.punctuation))
        tmp = []
        for word in document.lower().split():
            if word not in stoplist:
                tmp.append(word)
        texts.append(tmp)
    return texts

In [8]:
def lda_model(statements, data):
#     statements = load_statement()
    lda_result = []
    lda_id = []
    id_list = data[1]
    tag_list= data[0]
    
    texts = splitWordByLibrary(tag_list)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus, num_topics=len(corpus), id2word = dictionary) 
    corpus_len = len(corpus)
    statements = splitWordByLibrary(statements) 
    for statement in statements:
        cos_sim_list = []
#         statement = statement.translate(str.maketrans('','',string.punctuation))
        for i in range(0, corpus_len):
            new_vec = dictionary.doc2bow(statement)
            dict1 = dict(ldamodel[new_vec])
            dict2 = dict(ldamodel[corpus[i]])
            vec2 = np.zeros(corpus_len)
            vec1 = np.zeros(corpus_len)
            for a in dict2:
                vec2[a] = dict2[a]
            for a in dict1:
                vec1[a] = dict1[a]
            cos_sim_list.append(cosine(vec1, vec2))
        largest_index = np.argmax(cos_sim_list)
        lda_result.append(tag_list[largest_index])
        lda_id.append(id_list[largest_index])
    return [lda_result, lda_id]

In [9]:
def difflib_model(statements, data):
    difflib_result = []
    difflib_id = []
    id_list = data[1]
    tag_list= data[0]
    tags = splitWordByLibrary(tag_list)
    statements = splitWordByLibrary(statements)
    for statement in statements:
        largest_similarity = 0
        largest_index = 0
        for i in range(0, len(tags)):
            similarity = SequenceMatcher(
                None,
                "".join(statement),
                "".join(tags[i])
            )
            if similarity.ratio() > largest_similarity:
                largest_similarity = similarity.ratio()
                largest_index = i
                
        difflib_id.append(id_list[largest_index])
        difflib_result.append(tag_list[largest_index])
    return [difflib_result, difflib_id]


In [10]:
def tfidf_model(statements, data):
    #create dummy data 
    
    tfidf_result = []
    tfidf_id = []
    id_list = data[1]
    tag_list= data[0]
    tags = splitWordByLibrary(tag_list)
    dictionary = corpora.Dictionary(tags)
    corpus = [dictionary.doc2bow(tag) for tag in tags]
    corpus_len = len(dictionary)
    tfidf = models.TfidfModel(corpus)
    index = similarities.MatrixSimilarity(tfidf[corpus], num_features=len(dictionary))
    statements = splitWordByLibrary(statements)
    for statement in statements:
        cos_sim_list = []
        euclidean_distance_list = []
        manhattan_distance_list = []
        jaccard_similarity_list = []
        word_count_list = []
        for i in range(0, len(tags)):
            new_vec = dictionary.doc2bow(statement)
            dict1 = dict(tfidf[new_vec])
            dict2 = dict(tfidf[corpus[i]])

            vec2 = np.zeros(corpus_len)
            vec1 = np.zeros(corpus_len)

            for a in dict2:
                vec2[a] = dict2[a]
            for a in dict1:
                vec1[a] = dict1[a]
                
            cos_sim_list.append(cosine(vec1, vec2))
            euclidean_distance_list.append(euclidean_distance(vec1, vec2))
            jaccard_similarity_list.append(jaccard_similarity(vec1, vec2))
            manhattan_distance_list.append(manhattan_distance(vec1, vec2))
            # word_count_list.append(len(dict1)+len(dict2))
        largest_index = np.argmax(cos_sim_list)
        tfidf_result.append(tag_list[largest_index])
        tfidf_id.append(id_list[largest_index])
    return [tfidf_result, tfidf_id]

In [11]:
def lsi_model(statements, data):
#     statements = load_statement()
    lsi_result = []
    lsi_id = []
    id_list = data[1]
    tag_list = data[0]

    tags = splitWordByLibrary(tag_list)
    dictionary = corpora.Dictionary(tags)
    corpus = [dictionary.doc2bow(tag) for tag in tags]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=len(tag_list))
    index = similarities.MatrixSimilarity(lsi[corpus_tfidf]) 
    statements = splitWordByLibrary(statements)

    for statement in statements:
#         statement = statement.translate(str.maketrans('','',string.punctuation))
        test_statement = dictionary.doc2bow(statement)
        vec_lsi = lsi[test_statement]
        sims = index[tfidf[vec_lsi]]
        sims = sorted(enumerate(sims), key=lambda item: -item[1])
        largest_index = sims[0][0]
        lsi_result.append(tag_list[largest_index])
        lsi_id.append(id_list[largest_index])
    return [lsi_result, lsi_id]

In [12]:
def doc_model(statements, data):
    doc_result = []
    doc_id = []
    id_list = data[1]
    tag_list = data[0]
    texts = splitWordByLibrary(tag_list)
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts)]
    model = Doc2Vec(documents)
#     model = Doc2Vec(documents, vector_size=5, window=120, workers=4)
    statements = splitWordByLibrary(statements) 
    for statement in statements:
        infer_vector = model.infer_vector(statement)
        index = model.docvecs.most_similar([infer_vector], topn = 1)[0][0]
        doc_result.append(tag_list[index])
        doc_id.append(id_list[index])
        
    return [doc_result, doc_id]

In [16]:
data = load_data()
statements = load_statement()
tfidf_result, tfidf_id = tfidf_model(statements, data)
doc_result, doc_id = doc_model(statements, data)
lda_result, lda_id = lda_model(statements, data)
lsi_result, lsi_id = lsi_model(statements, data)
difflib_result, difflib_id = difflib_model(statements, data)

In [17]:
d = {'statements': np.array(statements), 
     'tiidf_result': np.array(tfidf_result), 
     'lsi_result': np.array(lsi_result),
     'lda_result': np.array(lda_result), 
#      'doc_result': np.array(doc_result),
     'difflib_resutl': np.array(doc_result), }
df=DataFrame(data = d, columns = ['statements', 'tiidf_result', 'lsi_result', 'lda_result', 'difflib_resutl'])

In [18]:
df.to_csv('../data/result/similarity.csv', index=False)

In [19]:
#create data for testing
create_data = [['Hello','Hi',
                'Greetings!','How is it going?',
                'How are you doing?','Nice to meet you.',
                'How do you do?','Hi, nice to meet you.',
                'It is a pleasure to meet you.','Top of the morning to you!',
                'Top of the morning to you!','what is good to eat?',
                'do you drink','are you experiencing an energy shortage?',
                'why can you not eat?','do you like being a chatterbot',
                'if you could eat food, what would you eat?','do you wish you could eat food?',
                'can a robot get drunk?','i like wine, do you?',
                'what do robots need to survive?','will robots ever be able to eat?',
                'do you want to go to FinTech page', 'do you want to go to RegTech page',
                'What are your interests','What are your favorite subjects',
                'What is your number','What is your favorite number',
                'What is your location', 'Where do you live',
                'Where are you from', 'Where are you',
                'Do you have any brothers','Do you have any brothers',
                'Who is your father','Who is your mother',
                'Who is your boss','What is your age'
               ],
               ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
                '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22',
                '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33',
                '34', '35', '36', '37', '38']
              ]



create_satatements = ['Hello bot','Hi bot',
                      'Greeting','How is it going?',
                      'How are you', 'Nice to meet you!',
                      'How do you do?','Hi, nice to meet you too',
                      'It is a pleasure to meet you','morning!!!',
                      'top to you', 'what to eat',
                      'do you like drinking', 'energy shortage',
                      'why not eat', 'do you like chatterbot',
                      'what do you eat apart from food', 'do you wish to take food',
                      'can you get drunk', 'I like wine',
                      'what do you need to survice', 'do you able eat?',
                      'FinTech', 'RegTech',
                      'what is your interest', 'tell me your favorite subjects',
                      'tell me your number', 'tell me your favorite number',
                      'tell me your location', 'do you live on earth',
                      'where are you from', 'where are you',
                      'do you have brothers', 'do you have sisters or brothers',
                      'who is your dad', 'tell me your mother',
                      'who is your boss', 'how old are you']
               


create_correct_id = create_data[1]
tfidf_result, tfidf_id = tfidf_model(create_satatements, create_data)
lda_result, lda_id = lda_model(create_satatements, create_data)
lsi_result, lsi_id = lsi_model(create_satatements, create_data)
doc_result, doc_id = doc_model(create_satatements, create_data)
difflib_result, difflib_id = difflib_model(create_satatements, create_data)

In [20]:
tfidf_id = np.array(tfidf_id).reshape(-1, 1)
create_correct_id = np.array(create_correct_id).reshape(-1, 1)
lda_id = np.array(lda_id).reshape(-1, 1)
lsi_id = np.array(lsi_id).reshape(-1, 1) 
doc_id = np.array(doc_id).reshape(-1, 1) 
difflib_id = np.array(difflib_id).reshape(-1, 1)  

In [21]:
print(sum(tfidf_id== create_correct_id)/len(create_correct_id))
print(sum(lda_id== create_correct_id)/len(create_correct_id))
print(sum(lsi_id== create_correct_id)/len(create_correct_id))
print(sum(doc_id== create_correct_id)/len(create_correct_id))
print(sum(difflib_id== create_correct_id)/len(create_correct_id))

[0.73684211]
[0.57894737]
[0.71052632]
[0.]
[0.76315789]
