### Import libraries and the review file

In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/teohangxanh/Practice-Data-Science/master/FarLandMD/zocdoc%20reviews.csv', encoding = "ISO-8859-1")
pd.set_option('max_colwidth', 200)
print(df.shape)

(71, 2)


### Clean the dataset

In [2]:
df.head()

Unnamed: 0,Doctor,Reviews
0,"Dr. Jon Biorkman, MD",['This review is an overdue huge thank you for an extraordinary medical doctor whose care and professionalism helped me even in his absence: I was Dr. Biorkman\'s patient while in Irvine until 20...
1,,
2,"Richard McConkie, FNP-C","[""I was so pleased with the kindness and care of the staff at west valley. This was my first visit and it felt like I'd been going there forever. Dr Richard was very empathic and understanding. Li..."
3,,
4,"Dr. Crystal Song, NMD","[""I go monthly to Dr Song to strengthen my abdomen which has a hernia caused by a botched surgery 2 years ago. I am extremely satisfied by the high expertise and genuine care that Dr Song gives m..."


In [3]:
# Remove rows that have missing data in all columns: Doctor and Reviews
df.dropna(how='all', inplace=True)

In [4]:
df.head()

Unnamed: 0,Doctor,Reviews
0,"Dr. Jon Biorkman, MD",['This review is an overdue huge thank you for an extraordinary medical doctor whose care and professionalism helped me even in his absence: I was Dr. Biorkman\'s patient while in Irvine until 20...
2,"Richard McConkie, FNP-C","[""I was so pleased with the kindness and care of the staff at west valley. This was my first visit and it felt like I'd been going there forever. Dr Richard was very empathic and understanding. Li..."
4,"Dr. Crystal Song, NMD","[""I go monthly to Dr Song to strengthen my abdomen which has a hernia caused by a botched surgery 2 years ago. I am extremely satisfied by the high expertise and genuine care that Dr Song gives m..."
6,"Dr. Christopher Ciccone, MD",['Excellent service and advice as well as a perscrition for spider bite infection\n\nDr. Ciccone has been my doc for 25 years and is now the family doc for my children and husband too! His offic...
8,"Dr. Martin Maag, MD","['Very positive and encouraging, time well spent.\n\nFriendly, professional, and knowledgeable.\n\nits not often that the Doctor is the one to greet you and sit you down in the consultation room, ..."


In [5]:
pattern = '|'.join(['\[', '\]', '\\\\\\', '\'', "\'", '\\n', '\\\\', '  '])
df.Reviews = df.Reviews.str.replace(pattern, '')

In [6]:
df.iloc[1, 1]

'"I was so pleased with the kindness and care of the staff at west valley. This was my first visit and it felt like Id been going there forever. Dr Richard was very empathic and understanding. Listened to my issues and was a wonderful breath of fresh air in this crazy world we find ourselves in. Thank you again.", Excellent! Staff was helpful and considerate. Richard was amazingly kind and patient with my handicapped daughter! Would definitely recommend their office!Office was super clean and staff was very friendly! The provider was great! I would recommend to all my friends and family.My son and I are new patients of Dr. McConkie, at our hometown West Valley Med. Clinic in Middleton. From the moment you walk in the door they made us both feel like we were right at home, and Dr. McConkie was very thorough with addressing all of our concerns, nThank you Dr. McConkie!Always pleasant.Richard does a great job and as long as he is in the Middleton Clinic, I will travel there for my appoint

In [7]:
import spacy
from spacy.lang.en import English

nlp = spacy.load('en_core_web_lg')
doc = nlp(df.iloc[0, 1])

In [8]:
# Create a list of word tokens
filtered_words = []
for token in doc:
    # Remove stop words
    if token.is_stop == False and token.is_punct == False:
        filtered_words.append(token.text)
print(filtered_words)

['review', 'overdue', 'huge', 'thank', 'extraordinary', 'medical', 'doctor', 'care', 'professionalism', 'helped', 'absence', 'Dr.', 'Biorkmans', 'patient', 'Irvine', '2006', 'moved', 'N.', 'Cal', 'diagnosed', '7', 'years', 'ago', 'mass', 'feared', 'life', 'looked', 'mirror', 'asked', 'afraid?"The', 'answer', 'came', 'promptly', 'saw', 'doctor', 'Dr.', 'Biorkman', 'know', 'doctor', 'trusted', 'professional', 'accurate', 'answer', 'drama', 'got', 'right', 'single', 'time', 'helped', 'stay', 'healthy', 'Having', 'doctor', 'trusts', 'essential', 'Dr', 'Biorkman', 'doctor', '30', 'years', 'knows', 'family', 'cares', 'health', 'takes', 'time', 'listen', 'provides', 'thoughtful', 'excellent', 'care', 'member', 'family', 'years', 'sure', 'insurance', 'plan', 'choose', 'gives', 'access', 'Dr.', 'Biorkman', 'Dr.', 'Biorkman', 'mean', 'listened', 'carefully', 'concerns', 'advised', 'sagely', 'cared', 'person', 'nt', 'matter', 'large', 'small', 'health', 'issue', 'provides', 'comfort', 'care', 'Pr

### Can we create a filtered adjectives which have positive or negative meanings of the doctor, not irrelevant ones?

In [9]:
# Create a list of sentence tokens

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

['This review is an overdue huge thank you for an extraordinary medical doctor whose care and professionalism helped me even in his absence:I was Dr. Biorkmans patient while in Irvine until 2006, when I moved to N. Cal.', 'When I was diagnosed 7 years ago with a mass, and I feared for my life, I looked in the mirror and asked: "When would you not be afraid?"The answer came promptly:', '"If I saw MY doctor" (Dr. Biorkman).', 'So even if he does not know, he is still MY doctor - the one I always trusted,had a professional, accurate answer with no drama, got it right every single time, and helped me stay healthy.', 'Having a doctor one trusts is essential.', 'Dr.', 'Biorkman has been our doctor for almost 30 years.', 'He knows our family and cares about our health.', 'He takes his time to listen and provides thoughtful and excellent care to every member of our family.', 'Over the years we have made sure any insurance plan we choose gives us access to Dr. Biorkman., "Dr. Biorkman has alway

In [10]:
# for chunk in doc.noun_chunks:
#     print('%40s' % chunk.text, '%10s' % chunk.root.text, '%10s' % chunk.root.dep_, '%10s' % chunk.root.head.text, sep='\t')

In [18]:
from spacy.matcher import Matcher
doc = nlp(df.iloc[0, 1])
matcher = Matcher(nlp.vocab)
p1 = [{'ORTH': 'Dr.', 'OP': '*'},
           {'ENT_TYPE': 'PERSON'},
           {'POS': 'ADV', 'OP': '*'},
           {'LEMMA': 'be'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'}]
p2 = [{'LOWER':{'IN': ['he', 'she']}},
           {'POS': 'ADV', 'OP': '*'},
           {'LEMMA': 'be'},
           {'POS': 'ADV', 'OP': '*'},
           {'DEP': 'CARDINAL', 'OP': '*'},
           {'POS': 'ADJ'},
           {'POS': 'NOUN', 'OP': '*'}]
p3 = [{'LOWER':{'IN': ['he', 'she']}},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'VERB'},
           {'DEP': 'TRUE', 'OP': '*'},
           {'POS': 'ADJ', 'OP': '*'},
           {'POS': 'NOUN'}]
p4 = [{'ORTH': 'Dr.', 'OP': '*'},
           {'ENT_TYPE': 'PERSON'},
           {'POS': 'ADV', 'OP': '*'},
           {'LEMMA': 'be'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ', 'OP': '+'},
           {'ORTH': 'and', 'OP': '*'},
           {}]
p5 = [{'POS': 'NOUN'},
           {'LEMMA': 'be'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'},
           {'ORTH': ',', 'OP': '*'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'},
           {'ORTH': ',', 'OP': '*'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'},
           {'ORTH': ', and', 'OP': '*'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'}]
p6 = [{'LOWER':{'IN': ['he', 'she']}},
           {'LEMMA': 'be'},
           {'POS': 'ADJ'},
           {'ORTH': ',', 'OP': '?'},
           {'POS': 'ADJ', 'OP': '?'},
           {'ORTH': ',', 'OP': '?'}, 
           {'ORTH': 'and', 'OP': '?'},
           {'POS': 'ADJ', 'OP': '?'}]
p7 = [{'LOWER':{'IN': ['he', 'she']}},
      {'POS': 'VERB'}
     ]
patterns = [p1, p2, p3, p4, p5, p6, p7]
matcher.add("review", None, *patterns)

for match_id, start, end in matcher(doc):
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(span.text)

He knows
He takes
he provides
he lined
he take
he take care
he communicated
Biorkman is awesome
Biorkman is awesome!
He is pleasant
He is pleasant,
He is pleasant, funny
He is pleasant, funny and
He is pleasant, funny and caring
hes
Dr. Biorkman is very thorough
Biorkman is very thorough
Dr. Biorkman is very thorough and
Biorkman is very thorough and
Dr. Biorkman is very thorough and listens
Biorkman is very thorough and listens
He got
He may
he is very personable


In [12]:
# text1 = nlp('This cake is very good')
# text2 = nlp('This cup cake is really good')
# print(text1.similarity(text2))

In [13]:
# text1 = nlp('good')
# text2 = nlp('bad')
# print(text1.similarity(text2))