## This project's main purpose is to extract reviews from patients or their families on doctors. Then, reviews on the same doctor will be processed such that similar opinions are retained and become a brief and general reviews from multiple reviews on a specific doctor. Specifically, a graphical window will show up when our site user hovers on that doctor's info.

### Import libraries and the review file

In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/teohangxanh/Practice-Data-Science/master/FarLandMD/zocdoc%20reviews.csv', encoding = "ISO-8859-1")
pd.set_option('max_colwidth', 200)
pd.set_option("display.max_rows", 6)
print(df.shape)

(71, 2)


### Clean the dataset

In [2]:
df.head()

Unnamed: 0,Doctor,Reviews
0,"Dr. Jon Biorkman, MD",['This review is an overdue huge thank you for an extraordinary medical doctor whose care and professionalism helped me even in his absence: I was Dr. Biorkman\'s patient while in Irvine until 20...
1,,
2,"Richard McConkie, FNP-C","[""I was so pleased with the kindness and care of the staff at west valley. This was my first visit and it felt like I'd been going there forever. Dr Richard was very empathic and understanding. Li..."
3,,
4,"Dr. Crystal Song, NMD","[""I go monthly to Dr Song to strengthen my abdomen which has a hernia caused by a botched surgery 2 years ago. I am extremely satisfied by the high expertise and genuine care that Dr Song gives m..."


In [3]:
# Remove rows that have missing data in all columns: Doctor and Reviews
df.dropna(how='all', inplace=True)
df.reset_index(inplace=True)

In [4]:
df.head()

Unnamed: 0,index,Doctor,Reviews
0,0,"Dr. Jon Biorkman, MD",['This review is an overdue huge thank you for an extraordinary medical doctor whose care and professionalism helped me even in his absence: I was Dr. Biorkman\'s patient while in Irvine until 20...
1,2,"Richard McConkie, FNP-C","[""I was so pleased with the kindness and care of the staff at west valley. This was my first visit and it felt like I'd been going there forever. Dr Richard was very empathic and understanding. Li..."
2,4,"Dr. Crystal Song, NMD","[""I go monthly to Dr Song to strengthen my abdomen which has a hernia caused by a botched surgery 2 years ago. I am extremely satisfied by the high expertise and genuine care that Dr Song gives m..."
3,6,"Dr. Christopher Ciccone, MD",['Excellent service and advice as well as a perscrition for spider bite infection\n\nDr. Ciccone has been my doc for 25 years and is now the family doc for my children and husband too! His offic...
4,8,"Dr. Martin Maag, MD","['Very positive and encouraging, time well spent.\n\nFriendly, professional, and knowledgeable.\n\nits not often that the Doctor is the one to greet you and sit you down in the consultation room, ..."


In [5]:
pattern = '|'.join(['\[', '\]', '\\\\\\', '\'', "\'", '\\n', '\\\\', '  '])
df.Reviews = df.Reviews.str.replace(pattern, '')

In [6]:
df.iloc[1, 1]

'Richard McConkie, FNP-C'

In [7]:
import spacy
from spacy.lang.en import English

nlp = spacy.load('en_core_web_lg')
doc = nlp(df.iloc[1, 1])

In [8]:
# Create a list of word tokens
filtered_words = []
for token in doc:
    # Remove stop words
    if token.is_stop == False and token.is_punct == False:
        filtered_words.append(token.text)
print(filtered_words)

['Richard', 'McConkie', 'FNP', 'C']


### Can we create a filtered adjectives which have positive or negative meanings of the doctor, not irrelevant ones?

In [9]:
# Create a list of sentence tokens

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)

['Richard McConkie, FNP-C']


In [10]:
# for chunk in doc.noun_chunks:
#     print('%40s' % chunk.text, '%10s' % chunk.root.text, '%10s' % chunk.root.dep_, '%10s' % chunk.root.head.text, sep='\t')

In [11]:
from spacy.matcher import Matcher
doc = nlp(df.iloc[1, 1])
matcher = Matcher(nlp.vocab)
p1 = [{'ORTH': 'Dr.', 'OP': '*'},
           {'ENT_TYPE': 'PERSON'},
           {'POS': 'ADV', 'OP': '*'},
           {'LEMMA': 'be'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'}]
p2 = [{'LOWER':{'IN': ['he', 'she']}},
           {'POS': 'ADV', 'OP': '*'},
           {'LEMMA': 'be'},
           {'POS': 'ADV', 'OP': '*'},
           {'DEP': 'CARDINAL', 'OP': '*'},
           {'POS': 'ADJ'},
           {'POS': 'NOUN', 'OP': '*'}]
p3 = [{'LOWER':{'IN': ['he', 'she']}},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'VERB'},
           {'DEP': 'TRUE', 'OP': '*'},
           {'POS': 'ADJ', 'OP': '*'},
           {'POS': 'NOUN'}]
p4 = [{'ORTH': 'Dr.', 'OP': '*'},
           {'ENT_TYPE': 'PERSON'},
           {'POS': 'ADV', 'OP': '*'},
           {'LEMMA': 'be'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ', 'OP': '+'},
           {'ORTH': 'and', 'OP': '*'},
           {}]
p5 = [{'POS': 'NOUN'},
           {'LEMMA': 'be'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'},
           {'ORTH': ',', 'OP': '*'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'},
           {'ORTH': ',', 'OP': '*'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'},
           {'ORTH': ', and', 'OP': '*'},
           {'POS': 'ADV', 'OP': '*'},
           {'POS': 'ADJ'}]
p6 = [{'LOWER':{'IN': ['he', 'she']}},
           {'LEMMA': 'be'},
           {'POS': 'ADJ'},
           {'ORTH': ',', 'OP': '?'},
           {'POS': 'ADJ', 'OP': '?'},
           {'ORTH': ',', 'OP': '?'}, 
           {'ORTH': 'and', 'OP': '?'},
           {'POS': 'ADJ', 'OP': '?'}]
patterns = [p1, p2, p3, p4, p5, p6]
matcher.add("review", None, *patterns)
span_storage = []
for match_id, start, end in matcher(doc):
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    span_storage.append(span.text)

In [12]:
import numpy as np
sim_table = []
for doc1 in span_storage:
    for doc2 in span_storage:
        sim_table.append(nlp(doc1).similarity(nlp(doc2)))
sim_table = np.array(sim_table).astype(float).reshape(len(span_storage), len(span_storage))       

In [13]:
import pandas as pd
sim_table = pd.DataFrame(data=sim_table, columns=span_storage, index=span_storage)

In [14]:
sim_table

### From my observation, all extracted reviews that are from a same sentence (for example: 'Richard was amazingly kind', 'Richard was amazingly kind and', and 'Richard was amazingly kind and patient' have at least 90% semantic similarity. Thus, I will group those with at least 90% semantic similarity together and only keep the longest one among them.

In [15]:
span_storage

[]

In [16]:
sim_table = []
for doc1 in span_storage:
    sub = []
    for doc2 in span_storage:
        sub.append(nlp(doc1).similarity(nlp(doc2)))
    sim_table.append(sub)

In [17]:
def similar_phrases(j, a_list, threshold=.91):
    group = set()
    for i in range(len(a_list)):
        if a_list[i] >= threshold:
            group.add(i)
    return group
j = 0
my_list = []
for i in sim_table:
    if similar_phrases(j, i) not in my_list:
        my_list.append(similar_phrases(j, i))
        j += 1

In [18]:
my_list

[]

In [19]:
remove_list = []
for i in range(len(my_list)-1):
    if i < len(my_list) and my_list[i].issubset(my_list[i+1]):
        remove_list.append(my_list[i])
for i in range(len(my_list)-1, -1, -1):
    if i > 0 and my_list[i].issubset(my_list[i-1]):
        remove_list.append(my_list[i])       

In [20]:
remove_list

[]

In [21]:
for i in remove_list:
    my_list.remove(i)

In [22]:
my_list

[]

In [23]:
meaningful_phrases = []
for sublist in my_list:
    max_length = 0
    max_str = ''
    for phrase_index in sublist:
        if max_length < len(span_storage[phrase_index]):
            max_length = len(span_storage[phrase_index])
            max_str = span_storage[phrase_index]
    meaningful_phrases.append(max_str)

In [24]:
meaningful_phrases

[]