<a href="https://colab.research.google.com/github/winnie2qui0/Wellnest_frontend/blob/main/sentimentalAnalyzing_bertModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Importing Libraries and Reading Data

In [None]:
pip install tensorflow==2.15.0 tensorflow-text==2.15.0 tensorflow-hub==0.16.1



In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from wordcloud import WordCloud
import spacy
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.utils import resample
import os
os.environ['TF_USE_LEGACY_KERAS'] = '1'
import warnings
warnings.filterwarnings('ignore')

In [None]:

# Read the labelled statements; the path is relative to the working directory.
DATA_PATH = 'Sentiment Data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [None]:
# Quick structural overview: dtypes / null counts, shape and column names.
separator = '*' * 50

print('The datatype and other basic info about the dataset are as follows:')
df.info()
print(separator)
print(f'The shape of the dataset is: {df.shape}')
print(separator)
print(f'The columns in the dataset are: {df.columns}')

The datatype and other basic info about the dataset are as follows:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93043 entries, 0 to 93042
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   statement  92681 non-null  object
 1   status     93043 non-null  object
dtypes: object(2)
memory usage: 1.4+ MB
**************************************************
The shape of the dataset is: (93043, 2)
**************************************************
The columns in the dataset are: Index(['statement', 'status'], dtype='object')


1. Both 'statement' and 'status' have the datatype 'object' ('O').
2. Column 'Unnamed: 0' seems unwanted.
3. There seem to be null values in 'statement'.

In [None]:
# Remove every row labelled 'Normal', then renumber the remaining rows from 0.
# reset_index returns a new frame, so its result must be assigned back;
# calling it on its own line without assignment leaves the old index in place.
normal_rows = df[df['status'] == 'Normal'].index
df = df.drop(normal_rows).reset_index(drop=True)
df.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


# 2. EDA and Data Preparation

In [None]:
# Report every column that contains at least one missing value.
for feature in df.columns:
    null_mask = df[feature].isnull()
    null_count = null_mask.sum()
    # `> 0` so that a column with exactly one null is reported as well.
    if null_count > 0:
        print(feature)
        print(f'The count of null values for is: {null_count}')
        # mean() of the boolean mask is a fraction in [0, 1]; scale it to an actual percentage.
        print(f'The percentage of null values is: {np.round(null_mask.mean() * 100, 4)}%')
        print('*'*50)

statement
The count of null values for is: 354
The percentage of null values is: 0.0046%
**************************************************


From the analysis it's clear that NaN values don't have much impact and can be dropped.

In [None]:
# Drop (in place) every row that has a missing value in any column,
# then show the per-column null counts to confirm none are left.
df.dropna(axis=0, how='any', inplace=True)
df.isna().sum()

Unnamed: 0,0
statement,0
status,0


1. The majority of the population does not suffer from mental health issues.
2. Among the people suffering from mental health issues, depression and suicidal thoughts are the most common.
3. There may be a relationship between depression and suicide, as most people suffering from depression tend to have suicidal thoughts.

In [None]:
# Simple length-based descriptors of each statement.
statement_col = df['statement']
word_lists = statement_col.str.split()  # whitespace-separated tokens per statement

df['statement_length'] = statement_col.str.len()   # number of characters
df['num_words'] = word_lists.str.len()             # number of tokens
# Characters per token (whitespace included in the character count), rounded to an integer.
chars_per_word = np.round(df['statement_length'] / df['num_words'])
df['avg_word_length'] = chars_per_word.map(int)
df['vocabulary_size'] = word_lists.apply(lambda words: len(set(words)))  # distinct tokens
df.head()

Unnamed: 0,statement,status,statement_length,num_words,avg_word_length,vocabulary_size
0,oh my gosh,Anxiety,10,3,3,3
1,"trouble sleeping, confused mind, restless hear...",Anxiety,64,10,6,10
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,78,14,6,13
3,I've shifted my focus to something else but I'...,Anxiety,61,11,6,11
4,"I'm restless and restless, it's been a month n...",Anxiety,72,14,5,14


In [None]:
# Collect the names of all columns whose dtype is not 'object'.
num_feature = []
for column_name, column_dtype in df.dtypes.items():
    if column_dtype != 'O':
        num_feature.append(column_name)

num_feature

['statement_length', 'num_words', 'avg_word_length', 'vocabulary_size']

It clearly seems like normal people speak the least since they don't have any mental health issues to explain.

In [None]:
# Replace every numeric feature with its natural logarithm.
# Re-running this cell applies the log a second time.
df[num_feature] = np.log(df[num_feature])

df.head()

Unnamed: 0,statement,status,statement_length,num_words,avg_word_length,vocabulary_size
0,oh my gosh,Anxiety,2.302585,1.098612,1.098612,1.098612
1,"trouble sleeping, confused mind, restless hear...",Anxiety,4.158883,2.302585,1.791759,2.302585
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,4.356709,2.639057,1.791759,2.564949
3,I've shifted my focus to something else but I'...,Anxiety,4.110874,2.397895,1.791759,2.397895
4,"I'm restless and restless, it's been a month n...",Anxiety,4.276666,2.639057,1.609438,2.639057


In [None]:
# Converting labels into numbers: the most frequent status becomes 0,
# the next most frequent 1, and so on (value_counts order).
statuses_by_frequency = df['status'].value_counts().index

label_ordered = dict(zip(statuses_by_frequency, range(len(statuses_by_frequency))))

df['status'] = df['status'].map(label_ordered)
df.head()

Unnamed: 0,statement,status,statement_length,num_words,avg_word_length,vocabulary_size
0,oh my gosh,7,2.302585,1.098612,1.098612,1.098612
1,"trouble sleeping, confused mind, restless hear...",7,4.158883,2.302585,1.791759,2.302585
2,"All wrong, back off dear, forward doubt. Stay ...",7,4.356709,2.639057,1.791759,2.564949
3,I've shifted my focus to something else but I'...,7,4.110874,2.397895,1.791759,2.397895
4,"I'm restless and restless, it's been a month n...",7,4.276666,2.639057,1.609438,2.639057


# 3. Modelling

In [None]:
# Work on a fixed-seed random subset of 20,000 rows.
df_sample = df.sample(n=20000, random_state=2024)
# Label 0 is the most frequent class in `df` (labels were assigned in value_counts
# order), so its row count in the sample is used as the balancing target.
majority_size = len(df_sample[df_sample['status']==0])

def resampling(df, n_samples=None, random_state=22):
  """Draw rows from `df` with replacement.

  Parameters
  ----------
  df : DataFrame holding the rows of a single class.
  n_samples : number of rows to draw; defaults to the global `majority_size`,
      which is what every existing call relies on.
  random_state : seed handed to `resample`; defaults to the original 22.

  Returns
  -------
  A DataFrame with `n_samples` rows drawn from `df`, or `df` itself when it has
  no rows (there is nothing to draw from).
  """
  if n_samples is None:
    n_samples = majority_size
  if len(df) == 0:
    return df
  minority_resample = resample(df, replace=True, n_samples=n_samples, random_state=random_state)
  return minority_resample

In [None]:
# Labels 1..18 are oversampled to the size of class 0; class 0 is kept unchanged.
MINORITY_LABELS = range(1, 19)

balanced_parts = [df_sample[df_sample['status']==0]]
for label in MINORITY_LABELS:
    class_rows = df_sample[df_sample['status']==label]
    if class_rows.empty:
        # A label that did not make it into the sample has no rows to draw from.
        continue
    balanced_parts.append(resampling(class_rows))

# Same concatenation order as before: class 0 first, then labels in ascending order.
df_new = pd.concat(balanced_parts, axis=0).reset_index(drop=True)
df_new['status'].value_counts()


Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
0,4026
10,4026
17,4026
16,4026
15,4026
14,4026
13,4026
12,4026
11,4026
9,4026


In [None]:
# TF Hub handles: the text preprocessing model and the BERT encoder it feeds.
PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

bert_preprocess = hub.KerasLayer(PREPROCESS_URL)
bert_encoder = hub.KerasLayer(ENCODER_URL)

# Pipeline: raw string input -> preprocessing layer -> encoder layer.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# The model exposes only the encoder's 'pooled_output' entry.
embedding_model = tf.keras.Model(
    inputs=[text_input],
    outputs=[outputs['pooled_output']],
)

In [None]:
# NOTE(review): df_new was built by drawing rows with replacement *before* this
# split, so copies of the same statement can land in both the train and the test
# part, which would make the test scores optimistic. Consider splitting first and
# oversampling only the training part.
X_train, X_test, y_train, y_test = train_test_split(
    df_new['statement'], df_new['status'], test_size=0.2, random_state=42
)


# Create tf.data.Dataset
batch_size = 64  # Adjust this based on your memory constraints
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(batch_size)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(batch_size)


def compute_embeddings(dataset):
    """Embed every statement of a batched (text, label) dataset.

    The labels are dropped and the texts are passed to `embedding_model.predict`
    in a single call, which iterates over the batches itself. This replaces one
    `predict` call per batch inside a Python loop.

    Returns the array produced by `predict`, one row per statement, in dataset order.
    """
    texts_only = dataset.map(lambda texts, labels: texts)
    return embedding_model.predict(texts_only)


# Compute embeddings in batches
train_embeddings = compute_embeddings(train_dataset)
test_embeddings = compute_embeddings(test_dataset)

In [None]:
# Gradient-boosted classifier on top of the BERT embeddings.
# reg_alpha / reg_lambda are the wrapper's names for the L1 / L2 regularisation
# strengths; the former `lambda_` keyword is not a parameter XGBoost knows.
xgb = XGBClassifier(reg_alpha=0.5, reg_lambda=1.0, learning_rate=0.05, n_estimators=700)
# The test split is passed as eval_set, so the evaluation metric is reported on it during fitting.
model = xgb.fit(train_embeddings, y_train,
                eval_set=[(test_embeddings, y_test)])
y_pred = model.predict(test_embeddings)

# 4. Results

In [None]:
# Confusion matrix as an annotated heatmap (integer cell labels),
# followed by the per-class precision / recall / F1 report.
confusion = confusion_matrix(y_test, y_pred)
sns.heatmap(confusion, annot=True, fmt='d')
print(classification_report(y_test, y_pred))

In [None]:
from joblib import dump, load

# Persist the fitted classifier to the working directory.
MODEL_PATH = 'model.joblib'
dump(model, MODEL_PATH)

# The browser download only exists on Google Colab; elsewhere the saved file is
# left in place instead of failing the last cell with an import error.
try:
    from google.colab import files
except ImportError:
    print(f'google.colab is not available; model saved to {MODEL_PATH}')
else:
    files.download(MODEL_PATH)
