In [1]:
import catboost as cb
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
from wordcloud import WordCloud,STOPWORDS
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import re


In [2]:
# Load the raw hospital reviews; the first CSV column becomes the row index.
review_df = pd.read_csv(
    "hospital_review.csv",
    index_col=[0],
)

In [3]:
# Peek at the first five raw reviews.
review_df.head(n=5)

Unnamed: 0,star,review_content
0,5,In for a minor operation. Not long to wait bef...
1,5,Fantastic team at York stroke clinic really lo...
2,5,Huge thanks to the opthalmology team on this m...
3,1,Underfunded by the Conservatives.
4,5,Good experience in ED


#### Text Cleaning, Tokenization & Lemmatization

In [4]:
# Negative contractions: a word stem followed by "n't". Both the ASCII
# apostrophe and the typographic one (U+2019) are accepted.
pattern = re.compile(r"(\w+)n['\u2019]t")

# Stems whose expansion is not simply "<stem> not"
# (won't -> will not, can't -> can not, shan't -> shall not).
_IRREGULAR_STEMS = {"wo": "will", "ca": "can", "sha": "shall"}


def expand_contractions(match):
    """Return the expanded form of one matched "n't" contraction.

    Parameters
    ----------
    match : re.Match
        A match produced by ``pattern``; group 1 holds the stem that
        precedes "n't".

    Returns
    -------
    str
        The stem (or its irregular replacement) followed by " not".
    """
    stem = match.group(1)
    replacement = _IRREGULAR_STEMS.get(stem.lower())
    if replacement is None:
        # Regular contraction: "didn't" -> "did not".
        return stem + " not"
    if stem[:1].isupper():
        # Keep the capitalisation of the original word ("Won't" -> "Will not").
        replacement = replacement.capitalize()
    return replacement + " not"


def expanded_text(text):
    """Expand every "n't" contraction found in ``text``.

    Parameters
    ----------
    text : str
        Text that may contain contractions such as "didn't" or "won't".

    Returns
    -------
    str
        ``text`` with each contraction replaced by its "<word> not" form;
        text without contractions is returned unchanged.
    """
    return pattern.sub(expand_contractions, text)



In [5]:
# Fetch every NLTK corpus the preprocessing relies on: the English stop-word
# list and the WordNet data that WordNetLemmatizer looks words up in.
# NOTE(review): some NLTK releases may additionally need 'omw-1.4' - confirm
# against the installed version.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\y_tat\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Shared helpers for preprocess(), filled on first use so that the stop-word
# corpus is read once instead of once per review.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Build the stop-word set, tokenizer, lemmatizer and "dr" pattern once and reuse them."""
    if not _PREPROCESS_RESOURCES:
        _PREPROCESS_RESOURCES.update(
            # English stop words plus single punctuation characters; "not" is
            # kept because it carries the negation.
            stop=set(stopwords.words('english') + list(string.punctuation)) - set(["not"]),
            tokenizer=nltk.TweetTokenizer(),
            lemmatizer=WordNetLemmatizer(),
            doctor=re.compile(r"\bdr\b"),
        )
    return _PREPROCESS_RESOURCES


def preprocess(x):
    """Clean one review and return its lemmatized tokens joined by spaces.

    Steps, in order: lowercase the text, normalise a few frequent forms
    ("can't" -> "can not", "waiting" -> "wait", "caring" -> "care"), expand the
    remaining "n't" contractions, tokenize, drop stop words and single
    punctuation characters (keeping "not"), replace the token "dr" with
    "doctor", and lemmatize each remaining token.

    Parameters
    ----------
    x : str
        Raw review text. Any non-string value (for example NaN from a missing
        CSV field) is treated as an empty review.

    Returns
    -------
    str
        The cleaned tokens separated by single spaces; "" when nothing remains.
    """
    if not isinstance(x, str):
        # Missing values have no text to clean.
        return ""

    resources = _get_preprocess_resources()
    stop = resources["stop"]
    lemmatizer = resources["lemmatizer"]
    doctor = resources["doctor"]

    x = x.lower()
    x = x.replace("can't", "can not")
    x = x.replace("waiting", "wait")
    x = x.replace("caring", "care")
    x = expanded_text(x)

    word_list = resources["tokenizer"].tokenize(x)
    return ' '.join(
        lemmatizer.lemmatize(doctor.sub("doctor", w))
        for w in word_list
        if w not in stop
    )


In [7]:
# Run every raw review through preprocess() and keep the result next to the original text.
cleaned_reviews = review_df['review_content'].apply(preprocess)
review_df['review_preprocessed'] = cleaned_reviews

In [8]:
# Compare the raw text with its preprocessed form for the first five reviews.
review_df.head(n=5)

Unnamed: 0,star,review_content,review_preprocessed
0,5,In for a minor operation. Not long to wait bef...,minor operation not long wait operation care s...
1,5,Fantastic team at York stroke clinic really lo...,fantastic team york stroke clinic really looke...
2,5,Huge thanks to the opthalmology team on this m...,huge thanks opthalmology team morning
3,1,Underfunded by the Conservatives.,underfunded conservative
4,5,Good experience in ED,good experience ed


In [9]:
# Count missing values per column.
review_df.isna().sum(axis=0)

star                   0
review_content         0
review_preprocessed    0
dtype: int64

In [17]:
# Show only the reviews that still contain text after preprocessing.
has_text = review_df["review_preprocessed"] != ""
review_df[has_text]

Unnamed: 0,star,review_content,review_preprocessed
0,5,In for a minor operation. Not long to wait bef...,minor operation not long wait operation care s...
1,5,Fantastic team at York stroke clinic really lo...,fantastic team york stroke clinic really looke...
2,5,Huge thanks to the opthalmology team on this m...,huge thanks opthalmology team morning
3,1,Underfunded by the Conservatives.,underfunded conservative
4,5,Good experience in ED,good experience ed
...,...,...,...
300,5,Amazing hospital. I had major stomach surgery ...,amazing hospital major stomach surgery york ho...
301,4,Great staff as is found throughout NHS.\nNot e...,great staff found throughout nh not enough vis...
305,5,Went to A&E at 09.30hrs with swollen face thin...,went e 09.30 hr swollen face thinking infectio...
307,5,never been but its a hospital and it sounds li...,never hospital sound like favourite cocolate b...


In [21]:
# Save only the reviews that still contain text after preprocessing. An empty
# string is written as a blank CSV field, which pandas.read_csv loads back as
# NaN by default, so empty rows would resurface as missing values downstream.
review_nonempty_df = review_df[review_df["review_preprocessed"] != ""]
review_nonempty_df.to_csv("hospital_review_afterpreprocessed.csv")