In [None]:
import pandas as pd
# Load only the two columns the rest of the notebook uses.
filename='sample_data_complaint.xlsx'
COMPLAINT_COLUMNS=["Complaint Text","Resolution"]
df=pd.read_excel(filename, usecols=COMPLAINT_COLUMNS)
# `usecols` keeps the spreadsheet's own column order, so relabelling by
# position could silently swap the two headers. Selecting by name instead
# guarantees position 0 is the complaint text and position 1 the resolution,
# which later `iloc` lookups depend on.
df=df[COMPLAINT_COLUMNS]
df


Unnamed: 0,Complaint Text,Resolution
0,A leakage was observed when using Alinity i-se...,A malfunction was identified regarding a leaka...
1,The customer indicated product shipment was re...,A malfunction was identified regarding ARCHITE...
2,Shipment of ARCHITECT Concentrated Wash Buffer...,A malfunction regarding Shipment of ARCHITECT ...
3,Alinity ci series System Software v2.6.2 on S...,A malfunction was identified for the Alinity c...
4,The customer observed bottles which appeared t...,No product deficiency and no malfunction was i...
...,...,...
95,The customer observed falsely elevated results...,A malfunction was identified regarding elevate...
96,The customer questioned higher than expected T...,A malfunction regarding higher Troponin result...
97,The customer reported insufficient fill volume...,A malfunction regarding insufficient fill volu...
98,The complaint text states that when customer r...,A malfunction was identified for the ARCHITECT...


In [None]:
import re
import gensim
from gensim.parsing.preprocessing import remove_stopwords

def clean_sentence(sentence,stopwords=False):
  """Normalise one sentence for matching.

  The sentence is lower-cased and stripped of surrounding whitespace, then
  every character that is not a lowercase letter, a digit or whitespace is
  replaced by a single space (runs of spaces are NOT collapsed).

  Args:
    sentence: raw text to clean.
    stopwords: when True, stopwords are REMOVED with gensim's
      `remove_stopwords`; when False (default) they are kept.

  Returns:
    The cleaned string.
  """
  sentence=sentence.lower().strip()
  # `\s` (whitespace class), not `/s`: the old pattern kept '/' characters
  # and treated whitespace as something to be replaced.
  sentence=re.sub(r'[^a-z0-9\s]',' ',sentence)

  if stopwords:
    sentence=remove_stopwords(sentence)

  return sentence

def get_cleaned_sentences(df,stopwords=False):
  """Clean every entry of the 'Complaint Text' column.

  Args:
    df: DataFrame that has a 'Complaint Text' column.
    stopwords: forwarded to `clean_sentence`; True removes stopwords.

  Returns:
    A list with one cleaned string per row of `df`, in row order, so list
    positions line up with `df.iloc` positions.
  """
  # Empty spreadsheet cells arrive as NaN (a float), which has no `.lower()`.
  # Map them to "" rather than dropping them so the output stays aligned
  # with the DataFrame rows.
  texts=df['Complaint Text'].fillna('').astype(str)
  return [clean_sentence(text,stopwords) for text in texts]
  
# Build both variants of the cleaned complaint text up front:
# one with stopwords removed, one with them kept.
cleaned_sentences=get_cleaned_sentences(df,stopwords=True)
cleaned_sentences_with_stopwords=get_cleaned_sentences(df,stopwords=False)

# Show the stopword-free list, a blank separator, then the full list.
print(cleaned_sentences)
print("\n")
print(cleaned_sentences_with_stopwords)


    

['leakage observed alinity series concentrated wash buffer', 'customer indicated product shipment received warm temperature stressed suitable use', 'shipment architect concentrated wash buffer list number 06c54 88 lot number 08381fn00 damaged transit fedex refused customer', 'alinity ci series software v2 6 2 scm01493 westgard rules aren t triggered versions assay installed instrument', 'customer observed bottles appeared distended architect concentrated wash buffer kit', 'customer observed activated read failures error code 1007 architect reaction vessels rv', 'customer observed microparticles bottle containing mold type deposits architect intact pth reagent', 'customer reported observing falsely elevated ldl hdl results 5 patients direct ldl reagent ln 1e31 20 ultra hdl reagent ln 3k33 21 architect 1601094 sum ldl hdl results patient greater total cholesterol results', 'customer observed multiple levels non abbott controls shifting high architect bnp calibrator', 'assay package inser

BAG OF WORDS MODEL

In [None]:
import numpy as np

# The bag-of-words model works on the sentences that still contain stopwords.
sentences=cleaned_sentences_with_stopwords

# Whitespace-tokenise every cleaned sentence.
sentence_words=[document.split() for document in sentences]

from gensim import corpora
# Assign an integer id to every distinct token and list the mapping.
dictionary=corpora.Dictionary(sentence_words)
for token_id,token in dictionary.items():
  print(token_id," : ",token)

import pprint

# One bag-of-words vector per sentence, printed next to its source text.
bow_corpus=list(map(dictionary.doc2bow,sentence_words))
for sent,embedding in zip(sentences,bow_corpus):
  print(sent)
  print(embedding)

# Encode the query with the same cleaning and the same dictionary.
question_orig="product shipment was received warm and was not suitable for use"
question=clean_sentence(question_orig,stopwords=False)
question_embedding=dictionary.doc2bow(question.split())

print("\n\n",question,"\n",question_embedding)

0  :  a
1  :  alinity
2  :  buffer
3  :  concentrated
4  :  i
5  :  leakage
6  :  observed
7  :  series
8  :  using
9  :  was
10  :  wash
11  :  when
12  :  and
13  :  customer
14  :  for
15  :  indicated
16  :  not
17  :  product
18  :  received
19  :  shipment
20  :  stressed
21  :  suitable
22  :  temperature
23  :  the
24  :  therefore
25  :  use
26  :  warm
27  :  06c54
28  :  08381fn00
29  :  88
30  :  architect
31  :  by
32  :  damaged
33  :  fedex
34  :  in
35  :  list
36  :  lot
37  :  number
38  :  of
39  :  refused
40  :  transit
41  :  2
42  :  6
43  :  are
44  :  aren
45  :  assay
46  :  ci
47  :  installed
48  :  instrument
49  :  on
50  :  rules
51  :  same
52  :  scm01493
53  :  software
54  :  system
55  :  t
56  :  there
57  :  triggered
58  :  two
59  :  v2
60  :  versions
61  :  westgard
62  :  where
63  :  while
64  :  appeared
65  :  be
66  :  bottles
67  :  distended
68  :  kit
69  :  to
70  :  which
71  :  1007
72  :  activated
73  :  code
74  :  error
75  :  fa

COSINE SIMILARITY


In [None]:
import sklearn 
from sklearn.metrics.pairwise import cosine_similarity

def _is_sparse_bow(embedding):
  """Return True if `embedding` looks like gensim `doc2bow` output.

  That format is a plain Python list of 2-tuples; anything else (for
  example a numpy array) is treated as an already-dense embedding.
  """
  return isinstance(embedding,list) and all(
      isinstance(item,tuple) and len(item)==2 for item in embedding)

def _densify_bow(embedding,dim):
  """Expand a sparse list of (token_id, count) pairs into a (1, dim) array."""
  dense=np.zeros((1,dim))
  for token_id,count in embedding:
    dense[0,token_id]=count
  return dense

def retrieveAndPrintFAQAnswer(question_embedding,sentence_embeddings,FAQdf,sentences,question_text=None):
  """Print the stored row whose embedding is closest to the question.

  Every candidate's cosine similarity is printed, followed by the question
  and columns 0 and 1 of the best-matching row of `FAQdf`.

  Args:
    question_embedding: embedding of the query, either a 2-D array with one
      row or a sparse bag-of-words list of (token_id, count) pairs.
    sentence_embeddings: one embedding per candidate, in the same format
      and in the same order as the rows of `FAQdf`.
    FAQdf: DataFrame whose column 0 is printed as "Retrieved" and whose
      column 1 is printed after it.
    sentences: cleaned candidate texts, printed next to each score.
    question_text: text printed as the question. Defaults to the
      notebook-level `question` variable, which is what this function
      always printed before the parameter existed.

  Returns:
    None; all results are printed.
  """
  if question_text is None:
    question_text=question

  sentence_embeddings=list(sentence_embeddings)

  # Sparse bag-of-words vectors have a different length for every sentence.
  # Passed to cosine_similarity as-is, each one is read as an (n_tokens, 2)
  # matrix of (id, count) rows, so the score only compares the first pair
  # of each sentence. Expand them to equal-width count vectors first.
  token_ids=[token_id
             for emb in [question_embedding]+sentence_embeddings
             if _is_sparse_bow(emb)
             for token_id,_ in emb]
  dim=max(token_ids,default=0)+1

  def as_matrix(embedding):
    if _is_sparse_bow(embedding):
      return _densify_bow(embedding,dim)
    return embedding

  question_matrix=as_matrix(question_embedding)

  # Start below any possible cosine value so the first candidate is always
  # accepted; ties keep the earliest candidate (strict comparison).
  max_sim=float("-inf")
  index_sim=-1
  for index,faq_embedding in enumerate(sentence_embeddings):
    sim=cosine_similarity(as_matrix(faq_embedding),question_matrix)[0][0]
    print(index,sim,sentences[index])
    if sim>max_sim:
      max_sim=sim
      index_sim=index

  if index_sim<0:
    # Without this guard `iloc[-1]` would report the LAST row as the match.
    print("No candidate sentences to compare against.")
    return

  print("\n")
  print("question : ",question_text)
  print("\n")
  print("Retrieved : ",FAQdf.iloc[index_sim,0])
  print("\n")
  print(FAQdf.iloc[index_sim,1])

# Rank the complaints against the question using the bag-of-words vectors.
retrieveAndPrintFAQAnswer(
  question_embedding=question_embedding,
  sentence_embeddings=bow_corpus,
  FAQdf=df,
  sentences=sentences,
)

0 0.21693045781865616 a leakage was observed when using alinity i series concentrated wash buffer 
1 0.9941724026247171 the customer indicated product shipment was received warm  temperature stressed  and therefore not suitable for use
2 0.970142500145332 shipment of architect concentrated wash buffer  list number 06c54 88  lot number 08381fn00  was damaged in transit by fedex and refused by the customer
3 0.8436614877321074 alinity ci series system software v2 6 2 on scm01493 where the westgard rules aren t triggered while there are two versions of the same assay installed on the instrument 
4 0.970142500145332 the customer observed bottles which appeared to be distended when using architect concentrated wash buffer kit
5 0.9985681322700889 the customer observed activated read failures  error code 1007   when using architect reaction vessels  rv 
6 0.21693045781865616 the customer observed a microparticles bottle containing   mold type deposits    while using architect intact pth reag

GLOVE EMBEDDING

In [None]:
from gensim.models import word2vec
import gensim.downloader as api

def _load_or_download_vectors(cache_path,dataset_name,label):
  """Return word vectors from a local cache, downloading them if needed.

  Args:
    cache_path: file path passed to `KeyedVectors.load` / `save`.
    dataset_name: name passed to `gensim.downloader.load` on a cache miss.
    label: short name used in the "loaded/saved ... model" status message.

  Returns:
    The loaded model.
  """
  try:
    model=gensim.models.KeyedVectors.load(cache_path)
    print("loaded "+label+" model")
  except Exception:
    # Any load failure falls back to a fresh download, as before. Using
    # `Exception` instead of a bare `except:` lets Ctrl-C / kernel interrupt
    # stop the cell rather than kick off a large download.
    model=api.load(dataset_name)
    model.save(cache_path)
    print("saved "+label+" model")
  return model

glove_model=_load_or_download_vectors("./glovemodel.mod","glove-twitter-25","glove")
v2w_model=_load_or_download_vectors("./w2vecmodel.mod","word2vec-google-news-300","w2v")

# Embedding widths, read from the models instead of probing a sample word.
w2vec_embedding_size=v2w_model.vector_size
glove_embedding_size=glove_model.vector_size

HANDLING WORDS THAT ARE NOT IN THE EMBEDDING MODEL'S VOCABULARY

In [None]:
def getWordVec(word,model):
  """Look up the vector for `word`, using all zeros for unknown words.

  Args:
    word: token to look up.
    model: mapping from token to vector (indexed as `model[word]`).

  Returns:
    `model[word]` when the lookup succeeds; otherwise a list of zeros with
    the model's vector length.
  """
  try:
    return model[word]
  except KeyError:
    # Only a missing key means "unknown word"; other errors are real
    # failures and are allowed to propagate.
    size=getattr(model,"vector_size",None)
    if size is None:
      # Fallback for models without `vector_size`: measure a sample word.
      size=len(model['computer'])
    return [0]*size

def getPhraseEmbedding(phrase,embeddingmodel):
  """Embed a phrase as the sum of its word vectors.

  The phrase is split on whitespace and each token is looked up with
  `getWordVec`. The vectors are summed, not averaged; the callers in this
  notebook compare embeddings with cosine similarity.

  Args:
    phrase: whitespace-separated text.
    embeddingmodel: word-vector model passed through to `getWordVec`.

  Returns:
    A float numpy array of shape (1, vector_length); all zeros for an
    empty phrase.
  """
  size=len(getWordVec('computer',embeddingmodel))
  # Float accumulator so the result dtype does not depend on whether any
  # word was found (the old int start value gave an int result otherwise).
  total=np.zeros(size)
  for word in phrase.split():
    total=total+np.array(getWordVec(word,embeddingmodel))

  return total.reshape(1,-1)

WITH W2VEC

In [None]:
# Embed every stopword-free complaint sentence with the word2vec model.
sent_embeddings=[getPhraseEmbedding(sent,v2w_model) for sent in cleaned_sentences]

# Embed the question the same way and print the closest complaint.
question_embedding=getPhraseEmbedding(question,v2w_model)
retrieveAndPrintFAQAnswer(question_embedding,sent_embeddings,df,cleaned_sentences)

0 0.34012017205464523 leakage observed alinity series concentrated wash buffer
1 0.7974641908005894 customer indicated product shipment received warm temperature stressed suitable use
2 0.545219782014277 shipment architect concentrated wash buffer list number 06c54 88 lot number 08381fn00 damaged transit fedex refused customer
3 0.34787633583491906 alinity ci series software v2 6 2 scm01493 westgard rules aren t triggered versions assay installed instrument
4 0.4312455821152203 customer observed bottles appeared distended architect concentrated wash buffer kit
5 0.42240744782387707 customer observed activated read failures error code 1007 architect reaction vessels rv
6 0.40230690079681913 customer observed microparticles bottle containing mold type deposits architect intact pth reagent
7 0.3512663053895849 customer reported observing falsely elevated ldl hdl results 5 patients direct ldl reagent ln 1e31 20 ultra hdl reagent ln 3k33 21 architect 1601094 sum ldl hdl results patient grea

In [None]:
pip install ktrain

Collecting ktrain
[?25l  Downloading https://files.pythonhosted.org/packages/e1/3c/8469632f3fa51f244ce35ac184de4c55a260dccfcb7386529faf82ebf60f/ktrain-0.25.4.tar.gz (25.3MB)
[K     |████████████████████████████████| 25.3MB 1.5MB/s 
[?25hCollecting scikit-learn==0.23.2
[?25l  Downloading https://files.pythonhosted.org/packages/5c/a1/273def87037a7fb010512bbc5901c31cfddfca8080bc63b42b26e3cc55b3/scikit_learn-0.23.2-cp36-cp36m-manylinux1_x86_64.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 49.8MB/s 
Collecting langdetect
[?25l  Downloading https://files.pythonhosted.org/packages/56/a3/8407c1e62d5980188b4acc45ef3d94b933d14a2ebc9ef3505f22cf772570/langdetect-1.0.8.tar.gz (981kB)
[K     |████████████████████████████████| 983kB 68.4MB/s 
Collecting cchardet
[?25l  Downloading https://files.pythonhosted.org/packages/a0/e5/a0b9edd8664ea3b0d3270c451ebbf86655ed9fc4c3e4c45b9afae9c2e382/cchardet-2.1.7-cp36-cp36m-manylinux2010_x86_64.whl (263kB)
[K     |██████████████████████████

BERT EMBEDDINGS

Known issue: the BERT server setup step is not working, so the `bert_serving` client call in the next cell does not complete.

In [None]:
from bert_serving.client import BertClient
# Without a timeout the client waits forever when no bert-serving server is
# listening (the cell had to be interrupted by hand). `timeout` is given in
# milliseconds, so the cell now fails after 10 s instead of hanging.
bc=BertClient(ip='localhost',timeout=10000)
res=bc.encode(['ml','ai'])
print(res)

KeyboardInterrupt: ignored

In [None]:
# Clean the raw question and the complaints the same way (stopwords kept).
# Fixes two typos: `question,orig` (NameError) and the misspelled
# `cleaned_snetences`, which left the loop below reading the
# stopword-stripped list built in an earlier cell.
question=clean_sentence(question_orig,stopwords=False)
bert_sentences=get_cleaned_sentences(df,stopwords=False)

# One BERT embedding per complaint, kept in row order.
sent_bertphrase_embeddings=[bc.encode([sent]) for sent in bert_sentences]

question_embedding=bc.encode([question])
retrieveAndPrintFAQAnswer(question_embedding,sent_bertphrase_embeddings,df,bert_sentences)

In [None]:
pip install bert-serving-server

[31mERROR: Operation cancelled by user[0m


In [None]:
pip install bert-serving-client



In [None]:
pip install bert-embedding

Collecting bert-embedding
  Downloading https://files.pythonhosted.org/packages/62/85/e0d56e29a055d8b3ba6da6e52afe404f209453057de95b90c01475c3ff75/bert_embedding-1.0.1-py3-none-any.whl
Collecting numpy==1.14.6
[?25l  Downloading https://files.pythonhosted.org/packages/e5/c4/395ebb218053ba44d64935b3729bc88241ec279915e72100c5979db10945/numpy-1.14.6-cp36-cp36m-manylinux1_x86_64.whl (13.8MB)
[K     |████████████████████████████████| 13.8MB 294kB/s 
[?25hCollecting typing==3.6.6
  Downloading https://files.pythonhosted.org/packages/4a/bd/eee1157fc2d8514970b345d69cb9975dcd1e42cd7e61146ed841f6e68309/typing-3.6.6-py3-none-any.whl
Collecting mxnet==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/c0/e9/241aadccc4522f99adee5b6043f730d58adb7c001e0a68865a3728c3b4ae/mxnet-1.4.0-py2.py3-none-manylinux1_x86_64.whl (29.6MB)
[K     |████████████████████████████████| 29.6MB 1.4MB/s 
[?25hCollecting gluonnlp==0.6.0
[?25l  Downloading https://files.pythonhosted.org/packages/e2/07/03