In [1]:
import pandas as pd
from gensim import corpora
from collections import defaultdict
from gensim import corpora, models, similarities
from gensim.models import TfidfModel
from random import shuffle

In [2]:
# Load the chatbot Q/A table; later cells read its 'question' and 'answer' columns.
df = pd.read_csv('chat_bot_data.csv')

In [3]:
# Pull the two text columns out of the frame as plain Python lists.
docs_q, docs_a = (list(df[column]) for column in ('question', 'answer'))

# LSTM approach

In [None]:
from keras.layers import Embedding,LSTM,Bidirectional,Input,Lambda,Dense,Flatten
from keras.preprocessing.text import Tokenizer,one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model,Sequential

In [None]:
# Keras tokenizer created with no arguments; its vocabulary is fitted in a later cell.
tok = Tokenizer()

In [None]:
# Vocabulary corpus: each of the first 75 questions joined to its answer,
# followed by every question on its own.
# NOTE(review): 75 is hard-coded — presumably the number of rows in the CSV; confirm.
docs = [docs_q[row] + ' ' + docs_a[row] for row in range(75)]
docs.extend(docs_q)

# In-place shuffle; no random seed is set in the visible cells.
shuffle(docs)

# Learn the word index from the combined corpus.
tok.fit_on_texts(docs)

In [None]:
# Fixed sequence length: passed as `maxlen` when padding and as `input_length` of the Embedding.
# NOTE(review): 174 is presumably the longest tokenised text — confirm.
LENGTH = 174
# Number of distinct words the fitted tokenizer has counted.
TOTAL_WORD_COUNT = len(tok.word_counts)
# Output dimension passed to the Embedding layer.
EMBEDDING_SIZE = 10
# Shape handed to the model's Input layer: one padded sequence of LENGTH ids.
input_shape = (LENGTH,)

In [None]:
# The questions are both the input (padded id sequences) and the target
# (one row per question from texts_to_matrix).
training = tok.texts_to_sequences(docs_q)
Y = tok.texts_to_matrix(docs_q)
# Keyed by 'X1', the name given to the model's Input layer.
X = {'X1': pad_sequences(training, maxlen=LENGTH)}

In [None]:
def create_encoder():
    """Build the (uncompiled) question encoder.

    Layer stack, in order:
      1. Embedding over TOTAL_WORD_COUNT + 1 ids, EMBEDDING_SIZE dimensions,
         for inputs of length LENGTH.
      2. Bidirectional LSTM(8) that returns only its final output
         (return_sequences=False).
      3. Dense(64) with ReLU activation.

    Returns:
        A keras Sequential model mapping padded id sequences to 64-d vectors.
    """
    encoder_layers = [
        Embedding(TOTAL_WORD_COUNT + 1, EMBEDDING_SIZE, input_length=LENGTH),
        Bidirectional(LSTM(8, return_sequences=False)),
        Dense(64, activation='relu'),
    ]
    return Sequential(encoder_layers)

In [None]:
def create_network():
    """Build and compile the training network around a fresh encoder.

    The network feeds an input named 'X1' through the encoder from
    create_encoder() and then through a sigmoid Dense layer with
    TOTAL_WORD_COUNT + 1 outputs. It is compiled with binary cross-entropy
    loss and the rmsprop optimizer.

    Returns:
        (model, encoder): the compiled end-to-end model and the encoder
        sub-model it contains (they share weights, since the encoder is
        called inside the model).
    """
    question_ids = Input(shape=input_shape, name='X1')

    encoder = create_encoder()
    encoded = encoder(question_ids)

    # One sigmoid unit per vocabulary slot.
    word_scores = Dense(TOTAL_WORD_COUNT + 1, activation='sigmoid')(encoded)

    model = Model(question_ids, word_scores)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy')
    return model, encoder

In [None]:
# `model` is the network that gets trained; `encoder` is its inner sub-model, used later to embed questions.
model,encoder = create_network()

In [None]:
# Print the layer-by-layer structure of the compiled network.
model.summary()

In [None]:
# Fit the network on the padded questions (X) against their texts_to_matrix rows (Y).
# NOTE(review): 10000 epochs with no validation data or early stopping — confirm this is intended.
model.fit(x=X,y=Y,epochs=10000,batch_size=200)

In [None]:
# Encode every stored question once; get_similar_ques compares each new query
# against these vectors (row i corresponds to docs_q[i]).
qs = tok.texts_to_sequences(docs_q)
qs_x = pad_sequences(qs,maxlen=LENGTH)
qs_queries = encoder.predict(qs_x)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from numpy import dot
from numpy.linalg import norm
def cos_sim(a,b):
    """Cosine similarity between two 1-D vectors.

    Args:
        a: array-like of numbers.
        b: array-like of numbers, same length as `a`.

    Returns:
        dot(a, b) / (norm(a) * norm(b)), or 0.0 when either vector has zero
        norm (the plain formula would divide by zero and give nan).
    """
    denominator = norm(a) * norm(b)
    if denominator == 0:
        # A zero vector has no direction; treat it as dissimilar to everything.
        return 0.0
    return dot(a, b) / denominator

In [None]:
def get_similar_ques(query):
    """Return the position of the stored question closest to `query` (LSTM encoder).

    The query is tokenised with `tok`, padded to LENGTH, passed through
    `encoder`, and compared by cosine similarity against the precomputed
    `qs_queries` matrix (one row per entry of `docs_q`).

    NOTE: a later cell defines another function with this same name; once
    that cell has run, it replaces this one.

    Args:
        query: question text (str).

    Returns:
        int row index into `qs_queries` with the highest similarity;
        ties resolve to the lowest index.
    """
    padded_query = pad_sequences(tok.texts_to_sequences([query]), maxlen=LENGTH)
    query_vec = encoder.predict(padded_query)

    # cosine_similarity gives one column per query row; with a single query
    # that is an (n, 1) array. Flatten it so scores are plain scalars instead
    # of sorting on 1-element arrays, and take the maximum directly.
    scores = cosine_similarity(qs_queries, query_vec).ravel()
    return int(scores.argmax())

In [None]:
# Look up the stored question closest to a sample query.
doc = "What is the GMAT requirement to get admission to HBS?"
q = get_similar_ques(doc)
# q is a position in docs_q; .loc treats it as an index label, which lines up
# only while df keeps the default 0..n-1 index produced by read_csv.
print(df.loc[q]['question'])

# LSI approach 2

In [None]:
# Same assignment as in cell 3, repeated at the start of this section.
docs_q = list(df['question'])

In [None]:
file_name = 'wiki.train.tokens'
# Read the whole corpus into memory. The context manager closes the handle,
# which the bare open(...).read() form never did.
# NOTE(review): UTF-8 is assumed for this file — confirm against the data source.
with open(file_name, encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [None]:
# Break the raw file contents into a list with one entry per line.
text = text.split('\n')

In [None]:
# Keep lines longer than 5 characters and cut each of them at every '.'.
text = [piece
        for line in text if len(line) > 5
        for piece in line.split(".")]

# Training documents: wiki pieces at positions 1..9999 (index 0 is skipped by
# the slice) followed by 100 consecutive copies of every question.
# NOTE(review): the 100x repetition presumably up-weights the questions
# against the wiki text — confirm.
all_docs = text[1:10000]
for question in docs_q:
    all_docs.extend([question] * 100)
shuffle(all_docs)

In [None]:
# stoplist = set('for a of the and to in : ,'.split())
# stoplist = ['i','to','the']
def get_token(sentence):
    """Split a sentence into lowercase, whitespace-delimited tokens.

    Lowercasing keeps the training vocabulary aligned with get_similar_ques,
    which looks text up via `.lower().split()`. Without it, any dictionary
    entry containing a capital letter could never match at query time.

    Args:
        sentence: text to tokenise (str).

    Returns:
        list of str tokens; empty list for empty or all-whitespace input.
    """
    return sentence.lower().split()
# stoplist = []
# texts = [[word for word in document.lower().split() if word not in stoplist]
#          for document in all_docs]
# Tokenise once into a list. `map` is a one-shot iterator in Python 3, so the
# previous version had to build it a second time after Dictionary consumed it
# and left `texts` as an exhausted iterator.
texts = [get_token(document) for document in all_docs]

# Token <-> integer id mapping over the whole training corpus.
dictionary = corpora.Dictionary(texts)

# Bag-of-words vector for every training document.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# The LSI model is fitted on raw bag-of-words counts; no TF-IDF step is
# applied in this approach.
# NOTE(review): 75 topics presumably mirrors the 75 Q/A pairs used elsewhere
# in the notebook — confirm.
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=75)

In [None]:
def get_similar_ques(doc):
    """Return the position in `docs_q` of the question most similar to `doc` in LSI space.

    Reads the module-level `dictionary`, `lsi` and `docs_q` at call time, so
    the result depends on whichever cell assigned them most recently.
    Both the query and the stored questions are lowercased and split on
    whitespace before the dictionary lookup.

    Args:
        doc: query text (str).

    Returns:
        int index into `docs_q` of the highest-scoring question;
        ties resolve to the lowest index.
    """
    # Project the query into LSI space.
    query_lsi = lsi[dictionary.doc2bow(doc.lower().split())]

    # Rebuilt on every call, so the index always reflects the current `lsi`
    # and `dictionary` globals.
    question_bows = [dictionary.doc2bow(question.lower().split()) for question in docs_q]
    index = similarities.MatrixSimilarity(lsi[question_bows])

    # One similarity score per stored question; only the best one is needed,
    # so take the maximum directly instead of sorting every score.
    scores = index[query_lsi]
    return int(scores.argmax())

In [None]:
doc = "How can I reach out to HBS alumni for guidance?"
# Put a space before '?' so that whitespace splitting yields it as its own token.
q = get_similar_ques(doc.replace('?',' ?'))
# Display the matched question (q is used as an index label of df).
df.loc[q]['question']

# LSI approach 1 (the submitted one)

In [4]:
# Training corpus: each of the first 75 questions joined to its answer,
# followed by every question on its own.
# NOTE(review): 75 is hard-coded — presumably the number of rows in the CSV; confirm.
docs = [docs_q[row] + ' ' + docs_a[row] for row in range(75)]
docs.extend(docs_q)

# In-place shuffle; no random seed is set in the visible cells.
shuffle(docs)

In [5]:
def _token_frequencies(token_lists):
    """Count how often each token occurs across all token lists."""
    counts = defaultdict(int)
    for tokens in token_lists:
        for token in tokens:
            counts[token] += 1
    return counts


# No stop words are filtered (empty list); add words here to drop them.
stoplist = []
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in docs]

# Remove words that appear only once in the whole corpus. The counting lives
# in a helper so its loop variables stay local: as bare module-level loops
# they overwrote the notebook-wide `text` (the wiki corpus from the earlier
# section) and created a stray `token` global.
frequency = _token_frequencies(texts)
texts = [[word for word in tokens if frequency[word] > 1]
         for tokens in texts]

# Token <-> integer id mapping, then one bag-of-words vector per document.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(tokens) for tokens in texts]

# Re-weight the counts with TF-IDF before fitting LSI.
tfidf = TfidfModel(corpus,id2word=dictionary)
corpus_tfidf = tfidf[corpus]

# onepass=False selects gensim's multi-pass decomposition.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20,onepass=False)

In [8]:
# Refit with 17 topics; this replaces the 20-topic model built in the previous cell.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=17,onepass=False)

doc = "What is the GMAT requirement to get admission to HBS?"
# get_similar_ques is defined in an earlier section's cell and reads the
# `lsi` and `dictionary` globals assigned in this section.
q = get_similar_ques(doc)
# Print the answer stored alongside the matched question.
print(df.loc[q]['answer'])

No. We accept applicants with a wide range of test scores.
