In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from transformers import BertForSequenceClassification, AutoTokenizer

from nltk.corpus import stopwords
import re

import emoji
from transformers import pipeline


In [None]:
review_df = pd.read_csv("hospital_review_afterpreprocessed.csv",index_col=[0])

### Word count, Character count, Emoji count

In [32]:
def wordcount(x):
    return len(re.findall(r'\w+',x))

In [33]:
review_df['word_count']= review_df['review_content'].apply(wordcount)

In [34]:
review_df['char_count']= review_df['review_content'].apply(lambda x:len(x))

In [35]:
review_df['emoji_count']= review_df['review_content'].apply(lambda x: emoji.emoji_count(x))

In [36]:
review_df.isna().sum()

star                   0
review_content         0
review_preprocessed    0
word_count             0
char_count             0
emoji_count            0
dtype: int64

### Pre-trained Sentiment Analysis Result

#### Vader

In [37]:
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\y_tat\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [38]:
# vader_sentiment_result("It is bad")
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores("They are excellent")['compound']

0.5719

In [39]:
analyzer = SentimentIntensityAnalyzer()
review_df['vader_result_c']=review_df["review_content"].apply(lambda x: analyzer.polarity_scores(x)['compound'])

In [40]:

review_df.head()

Unnamed: 0,star,review_content,review_preprocessed,word_count,char_count,emoji_count,vader_result_c
0,5,In for a minor operation. Not long to wait bef...,minor operation not long wait operation care s...,30,175,0,0.624
1,5,Fantastic team at York stroke clinic really lo...,fantastic team york stroke clinic really looke...,61,348,0,0.9423
2,5,Huge thanks to the opthalmology team on this m...,huge thanks opthalmology team morning,9,52,0,0.6369
3,1,Underfunded by the Conservatives.,underfunded conservative,4,33,0,0.0
4,5,Good experience in ED,good experience ed,4,21,0,0.4404


#### Bert (distilbert-base-uncased)

In [41]:
classifer= pipeline("sentiment-analysis")
def distillbert_sentiment_result(text): 
    label = classifer(text, truncation=True)[0]['label']
    score =classifer(text, truncation=True)[0]['score']
    return [score if label == "POSITIVE" else -(score) if label == "NEGATIVE" else "NA", label, score]

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [42]:

review_df["bert_result"]=review_df['review_content'].apply(lambda x: distillbert_sentiment_result(x)[0])
review_df["bert_label"]=review_df['review_content'].apply(lambda x: distillbert_sentiment_result(x)[1])
review_df["bert_score"]=review_df['review_content'].apply(lambda x: distillbert_sentiment_result(x)[2])

In [43]:
review_df['bert_label'].value_counts()

bert_label
POSITIVE    142
NEGATIVE     63
Name: count, dtype: int64

In [44]:
review_df[review_df["bert_result"]<0]

Unnamed: 0,star,review_content,review_preprocessed,word_count,char_count,emoji_count,vader_result_c,bert_result,bert_label,bert_score
3,1,Underfunded by the Conservatives.,underfunded conservative,4,33,0,0.0000,-0.997657,NEGATIVE,0.997657
5,1,Consultant told me l needed my right ear clea...,consultant told l needed right ear cleaning mi...,179,925,0,-0.9712,-0.999741,NEGATIVE,0.999741
8,3,Disabled parking is always full during my moth...,disabled parking always full mother's appointm...,28,155,0,-0.4168,-0.999591,NEGATIVE,0.999591
10,5,"I had a great experience, my issue was efficie...",great experience issue efficiently investigate...,30,183,0,0.8807,-0.993488,NEGATIVE,0.993488
19,1,I not sure this hospital is so kind as I good ...,not sure hospital kind good good si friend two...,87,386,0,0.5262,-0.995061,NEGATIVE,0.995061
...,...,...,...,...,...,...,...,...,...,...
286,1,Bad service left dirty blood stained items wit...,bad service left dirty blood stained item with...,37,235,0,-0.6950,-0.998005,NEGATIVE,0.998005
290,1,You are better off sticking a big stick in you...,better sticking big stick back hole get wonder...,53,274,0,0.8211,-0.950339,NEGATIVE,0.950339
301,4,Great staff as is found throughout NHS.\nNot e...,great staff found throughout nh not enough vis...,27,155,0,0.6249,-0.998258,NEGATIVE,0.998258
305,5,Went to A&E at 09.30hrs with swollen face thin...,went e 09.30 hr swollen face thinking infectio...,62,338,0,0.7572,-0.983240,NEGATIVE,0.983240


In [45]:
distillbert_sentiment_result("it was bad")

[-0.999790608882904, 'NEGATIVE', 0.999790608882904]

In [46]:
review_df

Unnamed: 0,star,review_content,review_preprocessed,word_count,char_count,emoji_count,vader_result_c,bert_result,bert_label,bert_score
0,5,In for a minor operation. Not long to wait bef...,minor operation not long wait operation care s...,30,175,0,0.6240,0.951596,POSITIVE,0.951596
1,5,Fantastic team at York stroke clinic really lo...,fantastic team york stroke clinic really looke...,61,348,0,0.9423,0.999813,POSITIVE,0.999813
2,5,Huge thanks to the opthalmology team on this m...,huge thanks opthalmology team morning,9,52,0,0.6369,0.999368,POSITIVE,0.999368
3,1,Underfunded by the Conservatives.,underfunded conservative,4,33,0,0.0000,-0.997657,NEGATIVE,0.997657
4,5,Good experience in ED,good experience ed,4,21,0,0.4404,0.999857,POSITIVE,0.999857
...,...,...,...,...,...,...,...,...,...,...
300,5,Amazing hospital. I had major stomach surgery ...,amazing hospital major stomach surgery york ho...,110,596,0,0.8290,0.994829,POSITIVE,0.994829
301,4,Great staff as is found throughout NHS.\nNot e...,great staff found throughout nh not enough vis...,27,155,0,0.6249,-0.998258,NEGATIVE,0.998258
305,5,Went to A&E at 09.30hrs with swollen face thin...,went e 09.30 hr swollen face thinking infectio...,62,338,0,0.7572,-0.983240,NEGATIVE,0.983240
307,5,never been but its a hospital and it sounds li...,never hospital sound like favourite cocolate b...,29,144,0,0.5023,-0.992069,NEGATIVE,0.992069


In [47]:
col_to_drop =['bert_label','bert_score']
review_df = review_df.drop(col_to_drop,axis=1)

In [48]:
review_df.head()

Unnamed: 0,star,review_content,review_preprocessed,word_count,char_count,emoji_count,vader_result_c,bert_result
0,5,In for a minor operation. Not long to wait bef...,minor operation not long wait operation care s...,30,175,0,0.624,0.951596
1,5,Fantastic team at York stroke clinic really lo...,fantastic team york stroke clinic really looke...,61,348,0,0.9423,0.999813
2,5,Huge thanks to the opthalmology team on this m...,huge thanks opthalmology team morning,9,52,0,0.6369,0.999368
3,1,Underfunded by the Conservatives.,underfunded conservative,4,33,0,0.0,-0.997657
4,5,Good experience in ED,good experience ed,4,21,0,0.4404,0.999857


In [49]:
review_df.isna().sum()

star                   0
review_content         0
review_preprocessed    0
word_count             0
char_count             0
emoji_count            0
vader_result_c         0
bert_result            0
dtype: int64

In [50]:
review_df.to_csv("hospital_review_sent.csv")