# IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


#  Data Loading

In [2]:
df = pd.read_csv("R:/IIT_MADRAS_DATA_SCINCE/DataSet/Healthcare Reviews/healthcare_reviews.csv")

In [3]:
df.loc[df['Rating'] > 3, 'Sentiment'] = '1'
df.loc[df['Rating'] == 3, 'Sentiment'] = '0'
df.loc[df['Rating'] < 3, 'Sentiment'] = '-1'

In [4]:
df

Unnamed: 0,Review_Text,Rating,Sentiment
0,I have mixed feelings about my experience.,4,1
1,The staff was caring and attentive. I couldn't...,5,1
2,I have mixed feelings about my experience.,5,1
3,I have mixed feelings about my experience.,5,1
4,The healthcare provider was excellent. I had a...,3,0
...,...,...,...
995,My experience was terrible. I would not recomm...,5,1
996,The service was disappointing. I won't be comi...,4,1
997,"The service was okay, but nothing exceptional.",3,0
998,I have mixed feelings about my experience.,5,1


#  Handling Missing Data

In [5]:
df.isnull().sum()

Review_Text    100
Rating           0
Sentiment        0
dtype: int64

In [6]:
df.shape                                                                                

(1000, 3)

In [7]:
df = df.dropna()
df.shape

(900, 3)

# Lowercasing


In [8]:
df['Review_Text'] = df['Review_Text'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Review_Text'] = df['Review_Text'].str.lower()


In [9]:
df

Unnamed: 0,Review_Text,Rating,Sentiment
0,i have mixed feelings about my experience.,4,1
1,the staff was caring and attentive. i couldn't...,5,1
2,i have mixed feelings about my experience.,5,1
3,i have mixed feelings about my experience.,5,1
4,the healthcare provider was excellent. i had a...,3,0
...,...,...,...
995,my experience was terrible. i would not recomm...,5,1
996,the service was disappointing. i won't be comi...,4,1
997,"the service was okay, but nothing exceptional.",3,0
998,i have mixed feelings about my experience.,5,1


# Removing Special Characters

In [10]:
import re
df['cleaned_text'] = df['Review_Text'].apply(lambda x: re.sub(r'[^\w\s]','',x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['Review_Text'].apply(lambda x: re.sub(r'[^\w\s]','',x))


In [11]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...
...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience


# Tokenization

In [12]:
df['tokenized_reviews'] = df.apply(lambda row: nltk.word_tokenize(row['Review_Text']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tokenized_reviews'] = df.apply(lambda row: nltk.word_tokenize(row['Review_Text']), axis=1)


In [26]:
# df = df.drop('tokenized_reviwes', axis=1)

In [13]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[the, staff, was, caring, and, attentive, ., i..."
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[the, healthcare, provider, was, excellent, .,..."
...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[my, experience, was, terrible, ., i, would, n..."
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[the, service, was, disappointing, ., i, wo, n..."
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[the, service, was, okay, ,, but, nothing, exc..."
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."


In [28]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[the, staff, was, caring, and, attentive, ., i..."
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[the, healthcare, provider, was, excellent, .,..."
...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[my, experience, was, terrible, ., i, would, n..."
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[the, service, was, disappointing, ., i, wo, n..."
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[the, service, was, okay, ,, but, nothing, exc..."
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien..."


#  Removing Stopwords

In [14]:
stop_words = set(stopwords.words('english'))

df['stopwords_reviews'] = df['tokenized_reviews'].apply(lambda x: [word for word in x if word not in stop_words])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['stopwords_reviews'] = df['tokenized_reviews'].apply(lambda x: [word for word in x if word not in stop_words])


In [15]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews,stopwords_reviews
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]"
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[the, staff, was, caring, and, attentive, ., i...","[staff, caring, attentive, ., could, n't, happ..."
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]"
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]"
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[the, healthcare, provider, was, excellent, .,...","[healthcare, provider, excellent, ., great, ex..."
...,...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[my, experience, was, terrible, ., i, would, n...","[experience, terrible, ., would, recommend, pr..."
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[the, service, was, disappointing, ., i, wo, n...","[service, disappointing, ., wo, n't, coming, b..."
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[the, service, was, okay, ,, but, nothing, exc...","[service, okay, ,, nothing, exceptional, .]"
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]"


# Lemmatization 

In [16]:
lemmatizer = WordNetLemmatizer()

df['lemmatized'] = df['stopwords_reviews'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemmatized'] = df['stopwords_reviews'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [17]:
df


Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews,stopwords_reviews,lemmatized
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]"
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[the, staff, was, caring, and, attentive, ., i...","[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ..."
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]"
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]"
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[the, healthcare, provider, was, excellent, .,...","[healthcare, provider, excellent, ., great, ex...","[healthcare, provider, excellent, ., great, ex..."
...,...,...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[my, experience, was, terrible, ., i, would, n...","[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr..."
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[the, service, was, disappointing, ., i, wo, n...","[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b..."
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[the, service, was, okay, ,, but, nothing, exc...","[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]"
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]"


#  Rejoin Processed Tokens

In [18]:
df['processed_text'] = df['lemmatized'].apply(lambda x: ' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_text'] = df['lemmatized'].apply(lambda x: ' '.join(x))


In [19]:
df

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews,stopwords_reviews,lemmatized,processed_text
0,i have mixed feelings about my experience.,4,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]",mixed feeling experience .
1,the staff was caring and attentive. i couldn't...,5,1,the staff was caring and attentive i couldnt b...,"[the, staff, was, caring, and, attentive, ., i...","[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ...",staff caring attentive . could n't happier .
2,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]",mixed feeling experience .
3,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]",mixed feeling experience .
4,the healthcare provider was excellent. i had a...,3,0,the healthcare provider was excellent i had a ...,"[the, healthcare, provider, was, excellent, .,...","[healthcare, provider, excellent, ., great, ex...","[healthcare, provider, excellent, ., great, ex...",healthcare provider excellent . great experien...
...,...,...,...,...,...,...,...,...
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[my, experience, was, terrible, ., i, would, n...","[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr...",experience terrible . would recommend provider .
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[the, service, was, disappointing, ., i, wo, n...","[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b...",service disappointing . wo n't coming back .
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[the, service, was, okay, ,, but, nothing, exc...","[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]","service okay , nothing exceptional ."
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]",mixed feeling experience .


#  Encoding Labels

In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['sentiment_lable'] = le.fit_transform(df['Sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_lable'] = le.fit_transform(df['Sentiment'])


In [25]:
df.tail()

Unnamed: 0,Review_Text,Rating,Sentiment,cleaned_text,tokenized_reviews,stopwords_reviews,lemmatized,processed_text,sentiment_lable
995,my experience was terrible. i would not recomm...,5,1,my experience was terrible i would not recomme...,"[my, experience, was, terrible, ., i, would, n...","[experience, terrible, ., would, recommend, pr...","[experience, terrible, ., would, recommend, pr...",experience terrible . would recommend provider .,2
996,the service was disappointing. i won't be comi...,4,1,the service was disappointing i wont be coming...,"[the, service, was, disappointing, ., i, wo, n...","[service, disappointing, ., wo, n't, coming, b...","[service, disappointing, ., wo, n't, coming, b...",service disappointing . wo n't coming back .,2
997,"the service was okay, but nothing exceptional.",3,0,the service was okay but nothing exceptional,"[the, service, was, okay, ,, but, nothing, exc...","[service, okay, ,, nothing, exceptional, .]","[service, okay, ,, nothing, exceptional, .]","service okay , nothing exceptional .",1
998,i have mixed feelings about my experience.,5,1,i have mixed feelings about my experience,"[i, have, mixed, feelings, about, my, experien...","[mixed, feelings, experience, .]","[mixed, feeling, experience, .]",mixed feeling experience .,2
999,the staff was caring and attentive. i couldn't...,4,1,the staff was caring and attentive i couldnt b...,"[the, staff, was, caring, and, attentive, ., i...","[staff, caring, attentive, ., could, n't, happ...","[staff, caring, attentive, ., could, n't, happ...",staff caring attentive . could n't happier .,2


# Splitting Data

In [34]:
from sklearn.model_selection import train_test_split

X = df['processed_text'] # Features: the cleaned reviews
y = df['Sentiment'] # Lables: sentiment scores

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.25, random_state=42)

In [35]:
print('X_train= ' + str(len(y_train)))
print('X_test= ' + str(len(X_test)))

X_train= 675
X_test= 225


# Feature Extraction (Vectorization)
Bag of Words (BoW): Represents text as a frequency of words.

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=1000)
X_train_vactorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.fit_transform(X_test)

In [51]:
X_test_vectorized

<225x33 sparse matrix of type '<class 'numpy.int64'>'
	with 1061 stored elements in Compressed Sparse Row format>

TF-IDF (Term Frequency-Inverse Document Frequency): 
Adjusts word frequency by how commonly it appears across documents.

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.fit_transform(X_test)

# Word Embeddings (Optional)
More advanced models like Word2Vec or BERT use word embeddings to capture semantic meaning. I can train embeddings or use pre-trained ones for better results.

In [45]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl.metadata (8.2 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.0.5-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.3.3-cp312-cp312-win_amd64.whl (24.0 MB)
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/24.0 MB ? eta -:--:--
   ---------------------------------------- 0.1/24.0 MB 1.1 MB/s eta 0:00:22
   ---------------------------------------- 0.2/24.0 MB 2.8 MB/s eta 0:00:09
    --------------------------------------- 0.5/24.0 MB 3.3 MB/s eta 0:00:08
   - -------------------------------------- 0.9/24.0 MB 5.2 MB/s eta 0:00:05
   - -------------------------------------- 1.2/24.0 MB 5.0 MB/s eta 0:00:05
   -- ------------------------------------- 1.3/24.0 MB 4.6 MB/s eta 0:00:05
   -- ------------------------------------- 1.4/24.0 MB 4.7 MB/s eta 0:00:05
   -- ------------------------------------- 1.7/24.0 MB 4.7 MB/


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [46]:
# Example with Word2Vec (requires more setup and time)

from gensim.models import Word2Vec

model = Word2Vec(df['lemmatized'], vector_size=100, window=5, min_count=1, workers=4)


# Building the Sentiment Analysis Model

After preprocessing and vectorizing the data, I can use various machine learning models, such as Logistic Regression, Naive Bayes
for sentiment classification.

# Logistic Regression

In [47]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = model.predict(X_test_tfidf)

In [48]:
y_pred

array(['1', '1', '-1', '-1', '1', '1', '-1', '-1', '1', '1', '1', '-1',
       '-1', '-1', '-1', '1', '1', '1', '-1', '1', '-1', '1', '1', '1',
       '-1', '-1', '-1', '1', '-1', '-1', '1', '1', '-1', '1', '-1', '1',
       '1', '-1', '1', '-1', '1', '1', '-1', '-1', '-1', '1', '1', '-1',
       '1', '1', '-1', '1', '1', '1', '1', '-1', '1', '-1', '-1', '-1',
       '1', '1', '-1', '1', '1', '1', '1', '-1', '-1', '1', '1', '-1',
       '-1', '1', '-1', '-1', '-1', '-1', '1', '1', '1', '1', '-1', '-1',
       '-1', '1', '-1', '-1', '1', '1', '1', '-1', '-1', '1', '1', '1',
       '-1', '1', '-1', '1', '1', '-1', '1', '1', '1', '-1', '1', '-1',
       '-1', '1', '1', '-1', '1', '1', '1', '-1', '1', '-1', '-1', '1',
       '1', '1', '-1', '-1', '1', '1', '1', '1', '1', '-1', '-1', '1',
       '1', '1', '1', '1', '1', '-1', '1', '1', '1', '-1', '1', '1', '-1',
       '1', '1', '1', '-1', '-1', '-1', '-1', '1', '1', '1', '1', '1',
       '-1', '-1', '1', '1', '1', '-1', '-1', '1', '-1', '-

# Naive Bayes

In [49]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = nb_model.predict(X_test_tfidf)

# Evaluating the Model
Evaluate the model's performance using metrics like accuracy, precision, recall, and F1-score.

In [53]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

          -1       0.44      0.44      0.44        96
           0       1.00      0.00      0.00        31
           1       0.41      0.54      0.46        98

    accuracy                           0.42       225
   macro avg       0.62      0.33      0.30       225
weighted avg       0.50      0.42      0.39       225

