In [0]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.pipeline import FeatureUnion
from nltk.corpus import stopwords 
import os
import string

import heapq 

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import io
from google.colab import files

uploaded = files.upload()
df = pd.read_csv(io.BytesIO(uploaded['data_hack.csv']))
df.head()

Saving data_hack.csv to data_hack.csv


Unnamed: 0,spec,dental clinic,treatment,insurance
0,Endodontist,"1355 15th St, Fort Lee, NJ 07024",root canal,healthfirst
1,Endodontist,"2999 Princeton Pike #4, Lawrenceville, NJ 08648",root canal,unitedhealthcare
2,Endodontist,"1357 15th St, Fort Lee, NJ 07024",root canal,healthfirst
3,Endodontist,"12 Roszel Rd, Princeton, NJ 08540",root canal,unitedhealthcare
4,Endodontist,"1359 15th St, Fort Lee, NJ 07024",root canal,healthfirst


In [0]:
doctorSpecAr = df['spec'].values
doctorTreatAr = df['treatment'].values
doctorInsuranceAr = df['insurance'].values
doctorLocationAr = df['dental clinic'].values

In [5]:
i = 4
resStr = doctorSpecAr[i] + ' ' + doctorTreatAr[i] + ' '+ doctorInsuranceAr[i]
resStr

'Endodontist root canal healthfirst'

In [0]:
def removeStopWords(wordList):
    return [word for word in wordList if word not in stopwords.words('english')]

def calcCosSim(a,b):
    # Calculate cosine simularity
    return np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))

def removeStem(sentence):
    ps = PorterStemmer()
    words = word_tokenize(sentence)
    tmpStr = ''
    for w in words:
        tmpStr += ps.stem(w) + ' '
    return tmpStr

In [0]:
customerStr = 'I am suffering from root canal	and need to be covered with UnitedHealthcare'

customerVec = removeStopWords([removeStem(customerStr.lower())])

# Preparing customer vector
vec = CountVectorizer()
customerFreq = vec.fit_transform(customerVec)
customerDf = pd.DataFrame(customerFreq.toarray(),columns=vec.get_feature_names())

resScore = []
resPara = []

for i in range(0,len(doctorSpecAr)):
    doctorStr = doctorSpecAr[i] + ' ' + doctorTreatAr[i] + ' ' + doctorInsuranceAr[i]
    doctorVec = removeStopWords([removeStem(doctorStr.lower())])
    
    # Prepare to doctor vector to combine vector
    doctorFreq = vec.fit_transform(doctorVec)
    doctorDf = pd.DataFrame(doctorFreq.toarray(),columns=vec.get_feature_names())
    
    # Combine vectors
    combinedDf = pd.concat([customerDf, doctorDf],sort=False).fillna(value=0.0)
    customerVec = combinedDf.iloc[0].values
    doctorVec = combinedDf.iloc[1].values
    
    # Preform cosine simularity
    simRes = calcCosSim(customerVec,doctorVec)
    
    # Appending results
    resScore.append(simRes)
    resPara.append(doctorStr)



In [34]:
rankAr = np.asarray(resScore).argsort()[::-1][:5]
heapLi = []

for ind in rankAr:    
    heapq.heappush(heapLi,[-resScore[ind],ind])
    
while len(heapLi) > 0:
    score,ind = heapq.heappop(heapLi)
    score = score*-1
    
    print(resPara[ind])
    print(doctorLocationAr[ind])
    print('')
    print(score)
    print('--------------------------\n')

Endodontist root canal unitedhealthcare
2999 Princeton Pike #4, Lawrenceville, NJ 08648

0.43301270189221935
--------------------------

Endodontist root canal unitedhealthcare
12 Roszel Rd, Princeton, NJ 08540

0.43301270189221935
--------------------------

Endodontist root canal healthfirst
1355 15th St, Fort Lee, NJ 07024

0.2886751345948129
--------------------------

Endodontist root canal metroplus
1354 15th St, Fort Lee, NJ 07024

0.2886751345948129
--------------------------

Endodontist root canal metroplus
601 Ewing St, Princeton, NJ 08540

0.2886751345948129
--------------------------

