# IMPORTING LIBRARIES

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


#  Data Loading

In [11]:
df = pd.read_csv("R:/IIT_MADRAS_DATA_SCINCE/DataSet/Healthcare Reviews/healthcare_reviews.csv")

In [12]:
df.loc[df['Rating'] > 3, 'Sentiment'] = '1'
df.loc[df['Rating'] == 3, 'Sentiment'] = '0'
df.loc[df['Rating'] < 3, 'Sentiment'] = '-1'

In [14]:
df

Unnamed: 0,Review_Text,Rating,Sentiment
0,I have mixed feelings about my experience.,4,1
1,The staff was caring and attentive. I couldn't...,5,1
2,I have mixed feelings about my experience.,5,1
3,I have mixed feelings about my experience.,5,1
4,The healthcare provider was excellent. I had a...,3,0
...,...,...,...
995,My experience was terrible. I would not recomm...,5,1
996,The service was disappointing. I won't be comi...,4,1
997,"The service was okay, but nothing exceptional.",3,0
998,I have mixed feelings about my experience.,5,1


#  Handling Missing Data

In [16]:
df.isnull().sum()

Review_Text    100
Rating           0
Sentiment        0
dtype: int64

In [17]:
df.shape                                                                                

(1000, 3)

In [18]:
df = df.dropna()
df.shape

(900, 3)

# Lowercasing


In [19]:
df['Review_Text'] = df['Review_Text'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Review_Text'] = df['Review_Text'].str.lower()


In [20]:
df

Unnamed: 0,Review_Text,Rating,Sentiment
0,i have mixed feelings about my experience.,4,1
1,the staff was caring and attentive. i couldn't...,5,1
2,i have mixed feelings about my experience.,5,1
3,i have mixed feelings about my experience.,5,1
4,the healthcare provider was excellent. i had a...,3,0
...,...,...,...
995,my experience was terrible. i would not recomm...,5,1
996,the service was disappointing. i won't be comi...,4,1
997,"the service was okay, but nothing exceptional.",3,0
998,i have mixed feelings about my experience.,5,1


# Removing Special Characters

In [21]:
import re
df['cleaned_text'] = df['Review_Text'].apply(lambda x: re.sub(r'[^\w\s]','',x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['Review_Text'].apply(lambda x: re.sub(r'[^\w\s]','',x))


In [22]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...
...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience


# Tokenization

In [25]:
df['tokenized_reviews'] = df.apply(lambda row: nltk.word_tokenize(row['Review_Text']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokenized_reviews'] = df.apply(lambda row: nltk.word_tokenize(row['Review_Text']), axis=1)


In [26]:
# df = df.drop('tokenized_reviwes', axis=1)

In [27]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[the, staff, was, caring, and, attentive, ., i..."
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[the, healthcare, provider, was, excellent, .,..."
...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[my, experience, was, terrible, ., i, would, n..."
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[the, service, was, disappointing, ., i, wo, n..."
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[the, service, was, okay, ,, but, nothing, exc..."
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."


In [28]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[the, staff, was, caring, and, attentive, ., i..."
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[the, healthcare, provider, was, excellent, .,..."
...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[my, experience, was, terrible, ., i, would, n..."
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[the, service, was, disappointing, ., i, wo, n..."
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[the, service, was, okay, ,, but, nothing, exc..."
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."


#  Removing Stopwords

In [34]:
stop_words = set(stopwords.words('english'))

df['stopwords_reviews'] = df['tokenized_reviews'].apply(lambda x: [word for word in x if word not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stopwords_reviews'] = df['tokenized_reviews'].apply(lambda x: [word for word in x if word not in stop_words])


In [35]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews,lemmatized,stopwords_reviews
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]"
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ..."
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]"
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]"
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[healthcare, provider, excellent, ., great, ex...","[healthcare, provider, excellent, ., great, ex...","[healthcare, provider, excellent, ., great, ex..."
...,...,...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr..."
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b..."
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]"
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]"


# Lemmatization 

In [38]:
lemmatizer = WordNetLemmatizer()

df['lemmatized'] = df['stopwords_reviews'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemmatized'] = df['stopwords_reviews'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [39]:
df


Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews,lemmatized,stopwords_reviews
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]"
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ..."
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]"
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]"
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[healthcare, provider, excellent, ., great, ex...","[healthcare, provider, excellent, ., great, ex...","[healthcare, provider, excellent, ., great, ex..."
...,...,...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr..."
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b..."
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]"
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]"


#  Rejoin Processed Tokens

In [40]:
df['processed_text'] = df['lemmatized'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_text'] = df['lemmatized'].apply(lambda x: ' '.join(x))


In [41]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews,lemmatized,stopwords_reviews,processed_text
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]",mixed feeling experience .
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ...",staff caring attentive . could n't happier .
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]",mixed feeling experience .
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]",mixed feeling experience .
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[healthcare, provider, excellent, ., great, ex...","[healthcare, provider, excellent, ., great, ex...","[healthcare, provider, excellent, ., great, ex...",healthcare provider excellent . great experien...
...,...,...,...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr...",experience terrible . would recommend provider .
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b...",service disappointing . wo n't coming back .
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]","service okay , nothing exceptional ."
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[mixed, feelings, experience, .]","[mixed, feeling, experience, .]","[mixed, feelings, experience, .]",mixed feeling experience .


#  Encoding Labels

In [56]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
df