In [20]:
#!pip install nlpaug

In [21]:
import pandas as pd
import random
import nlpaug.augmenter.word as naw

# Load your dataset
df = pd.read_csv('emotions_data_expanded.csv')


In [22]:
df.head()

Unnamed: 0,PatientID,Question No,Emotion1,Emotion2,Emotion3,Emotion4,Emotion5,Emotion6,Emotion7,Emotion8,Emotion9,Emotion10,Sentiment,GADscore,HeartRate,OxygenLevel
0,65d73565463b536e04d9a325,1,neutral,neutral,neutral,sad,neutral,neutral,neutral,neutral,neutral,neutral,neutral,7,90,119
1,65d73565463b536e04d9a325,2,neutral,fear,neutral,neutral,neutral,neutral,sad,neutral,neutral,sad,neutral,7,97,81
2,65d73565463b536e04d9a325,3,neutral,neutral,neutral,neutral,neutral,neutral,neutral,neutral,happy,happy,neutral,7,111,83
3,65d73565463b536e04d9a325,4,neutral,neutral,neutral,neutral,fear,angry,fear,neutral,neutral,neutral,neutral,7,89,92
4,65d73565463b536e04d9a325,5,happy,neutral,angry,angry,happy,happy,angry,neutral,neutral,neutral,neutral,7,105,94


In [23]:

emotions = ['Emotion1', 'Emotion2', 'Emotion3', 'Emotion4', 'Emotion5', 
            'Emotion6', 'Emotion7', 'Emotion8', 'Emotion9', 'Emotion10']

gad_min, gad_max = 70, 130  
heart_rate_min, heart_rate_max = 70, 120  
oxygen_level_min, oxygen_level_max = 80, 100 

augmented_datasets = []

for _ in range(25):
    augmented_rows = []
    for _, row in df.iterrows():
        # Shuffle emotions
        shuffled_emotions = row[emotions].tolist()
        random.shuffle(shuffled_emotions)
        
        gad_score = random.randint(gad_min, gad_max)
        heart_rate = random.randint(heart_rate_min, heart_rate_max)
        oxygen_level = random.randint(oxygen_level_min, oxygen_level_max)
        
        # Update row with augmented emotions and numerical features
        row.update(pd.Series(shuffled_emotions, index=emotions))
        row['GADscore'] = gad_score
        row['HeartRate'] = heart_rate
        row['OxygenLevel'] = oxygen_level
        
        augmented_rows.append(row)
    
    # Create a new DataFrame with augmented emotions and numerical features
    augmented_df = pd.DataFrame(augmented_rows)
    
    # Append the augmented dataset to the list
    augmented_datasets.append(augmented_df)

# Concatenate all augmented datasets
final_dataset = pd.concat(augmented_datasets, ignore_index=True)

# Save the final dataset to a new CSV file
final_dataset.to_csv('augmented_dataset.csv', index=False)


In [24]:
df = pd.read_csv('augmented_dataset.csv')
df.head()
df.describe()

Unnamed: 0,Question No,GADscore,HeartRate,OxygenLevel
count,6000.0,6000.0,6000.0,6000.0
mean,5.5,100.0075,95.043167,89.949
std,2.872521,17.696515,14.677586,6.014241
min,1.0,70.0,70.0,80.0
25%,3.0,84.0,82.75,85.0
50%,5.5,100.0,95.0,90.0
75%,8.0,115.0,108.0,95.0
max,10.0,130.0,120.0,100.0


In [25]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5965 entries, 0 to 5998
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PatientID    5965 non-null   object
 1   Question No  5965 non-null   int64 
 2   Emotion1     5965 non-null   object
 3   Emotion2     5965 non-null   object
 4   Emotion3     5965 non-null   object
 5   Emotion4     5965 non-null   object
 6   Emotion5     5965 non-null   object
 7   Emotion6     5965 non-null   object
 8   Emotion7     5965 non-null   object
 9   Emotion8     5965 non-null   object
 10  Emotion9     5965 non-null   object
 11  Emotion10    5965 non-null   object
 12  Sentiment    5965 non-null   object
 13  GADscore     5965 non-null   int64 
 14  HeartRate    5965 non-null   int64 
 15  OxygenLevel  5965 non-null   int64 
dtypes: int64(4), object(12)
memory usage: 792.2+ KB


In [27]:
import pandas as pd

# Load your dataset
df = pd.read_csv('augmented_dataset.csv')

# Define weights for each feature
weights = {
    'Emotion1': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion2': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion3': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion4': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion5': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion6': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion7': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion8': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion9': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Emotion10': {'fear': 2, 'angry': 1, 'neutral': 0, 'happy': 0, 'sad': 0, 'surprise' : 1},
    'Sentiment': {'positive': 1, 'negative': 1, 'neutral': 0},
    'GADscore': {'low': 0, 'high': 2},  
    'HeartRate': {'low': 0, 'high': 2}, 
    'OxygenLevel': {'low': 0, 'high': 2}
}

# Define a function to determine if a person has anxiety based on emotions and other features
def has_anxiety(row):
    total_weight = sum(weights[col].get(row[col], 0) for col in weights.keys())
    return total_weight > 0  # Considered anxious if total weight is greater than 0


# Apply the function to each row to determine anxiety status
df['Anxiety'] = df.apply(has_anxiety, axis=1)

# Convert boolean values to 'Yes' and 'No'
df['Anxiety'] = df['Anxiety'].map({True: 'Yes', False: 'No'})

# Save the dataset with the anxiety column to a new CSV file
df.to_csv('dataset_with_anxiety.csv', index=False)



In [28]:
df = pd.read_csv('dataset_with_anxiety.csv')
df.head()
df.describe()

Unnamed: 0,Question No,GADscore,HeartRate,OxygenLevel
count,6000.0,6000.0,6000.0,6000.0
mean,5.5,100.0075,95.043167,89.949
std,2.872521,17.696515,14.677586,6.014241
min,1.0,70.0,70.0,80.0
25%,3.0,84.0,82.75,85.0
50%,5.5,100.0,95.0,90.0
75%,8.0,115.0,108.0,95.0
max,10.0,130.0,120.0,100.0


In [29]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)
df.info()

# Convert to csv

df.to_csv('final_dataset.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 5965 entries, 0 to 5998
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   PatientID    5965 non-null   object
 1   Question No  5965 non-null   int64 
 2   Emotion1     5965 non-null   object
 3   Emotion2     5965 non-null   object
 4   Emotion3     5965 non-null   object
 5   Emotion4     5965 non-null   object
 6   Emotion5     5965 non-null   object
 7   Emotion6     5965 non-null   object
 8   Emotion7     5965 non-null   object
 9   Emotion8     5965 non-null   object
 10  Emotion9     5965 non-null   object
 11  Emotion10    5965 non-null   object
 12  Sentiment    5965 non-null   object
 13  GADscore     5965 non-null   int64 
 14  HeartRate    5965 non-null   int64 
 15  OxygenLevel  5965 non-null   int64 
 16  Anxiety      5965 non-null   object
dtypes: int64(4), object(13)
memory usage: 838.8+ KB


In [30]:
import pandas as pd

# Load your dataset
df = pd.read_csv('final_dataset.csv')

# Define the columns to check for neutral values
columns_to_check = ['Emotion1', 'Emotion2', 'Emotion3', 'Emotion4', 'Emotion5', 'Emotion6', 'Emotion7', 'Emotion8', 'Emotion9', 'Emotion10', 'Sentiment']

# Find rows where all specified columns have the value 'neutral'
neutral_rows = df[(df[columns_to_check] == 'neutral').all(axis=1)]

# Remove the neutral rows from the DataFrame
df_cleaned = df.drop(neutral_rows.index)

# Save the cleaned DataFrame if needed
df_cleaned.to_csv('cleaned_dataset.csv', index=False)


In [31]:
df = pd.read_csv('cleaned_dataset.csv')
#tell how much time the emotion is repeated in this csv
df['Emotion1'].value_counts()
df['Emotion2'].value_counts()
df['Emotion3'].value_counts()
df['Emotion4'].value_counts()
df['Emotion5'].value_counts()
df['Emotion6'].value_counts()
df['Emotion7'].value_counts()
df['Emotion8'].value_counts()
df['Emotion9'].value_counts()
df['Emotion10'].value_counts()



Emotion10
neutral     2173
sad          685
fear         364
angry        340
happy        331
surprise      74
Name: count, dtype: int64

In [32]:
df = pd.read_csv('cleaned_dataset.csv')
df.head()

#check frequecy of yes or no
df['Anxiety'].value_counts()



Anxiety
Yes    2627
No     1340
Name: count, dtype: int64

In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv('cleaned_dataset.csv')

# Drop non-numeric columns and target column
X = df.drop(columns=['PatientID', 'Question No', 'Anxiety'])
y = df['Anxiety']

# Perform one-hot encoding on categorical columns
encoder = OneHotEncoder(drop='first')
X_encoded = encoder.fit_transform(X.select_dtypes(include=['object']))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [34]:


# Initialize the Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)




Accuracy: 0.8438287153652393


In [35]:
#Training the model
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Initialize the Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, pos_label='Yes')
print("F1 Score:", f1)

# Calculate the precision
precision = precision_score(y_test, y_pred, pos_label='Yes')
print("Precision:", precision)

# Generate classification report
class_report = classification_report(y_test, y_pred)
print("Classification Report:")
print(class_report)

# Assuming df contains your dataset
X = df.drop(columns=['PatientID', 'Question No', 'Anxiety'])  # Features
y = df['Anxiety']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer for one-hot encoding
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), categorical_cols)],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Initialize the Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model on the training set
model.fit(X_train_encoded, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test_encoded)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Validation Accuracy:", accuracy)


Accuracy: 0.8438287153652393
Confusion Matrix:
[[197  71]
 [ 53 473]]
F1 Score: 0.8841121495327102
Precision: 0.8694852941176471
Classification Report:
              precision    recall  f1-score   support

          No       0.79      0.74      0.76       268
         Yes       0.87      0.90      0.88       526

    accuracy                           0.84       794
   macro avg       0.83      0.82      0.82       794
weighted avg       0.84      0.84      0.84       794

Validation Accuracy: 0.9685138539042821


In [36]:
#Testing the model
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import pandas as pd

# Load your dataset
df = pd.read_csv('cleaned_dataset.csv')

# Assuming df contains your dataset
X = df.drop(columns=['PatientID', 'Question No', 'Anxiety'])  # Features
y = df['Anxiety']  # Target variable

# Split the dataset into training, validation, and testing sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

# Define the column transformer for one-hot encoding
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(), categorical_cols)],
    remainder='passthrough'
)

# Fit and transform the training data
X_train_encoded = preprocessor.fit_transform(X_train)
X_val_encoded = preprocessor.transform(X_val)
X_test_encoded = preprocessor.transform(X_test)

# Initialize the Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model on the training set
model.fit(X_train_encoded, y_train)

# Make predictions on the training set
y_pred_train = model.predict(X_train_encoded)

# Calculate the training accuracy
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", train_accuracy)

# Make predictions on the validation set
y_pred_val = model.predict(X_val_encoded)

# Calculate the validation accuracy
val_accuracy = accuracy_score(y_val, y_pred_val)
print("Validation Accuracy:", val_accuracy)

# Make predictions on the testing set
y_pred_test = model.predict(X_test_encoded)

# Calculate the testing accuracy
test_accuracy = accuracy_score(y_test, y_pred_test)
print("Testing Accuracy:", test_accuracy)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred_test)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred_test, pos_label='Yes')
print("F1 Score:", f1)

# Calculate the precision
precision = precision_score(y_test, y_pred_test, pos_label='Yes')
print("Precision:", precision)

# Generate classification report
class_report = classification_report(y_test, y_pred_test)
print("Classification Report:")
print(class_report)


Training Accuracy: 0.9676334594367382
Validation Accuracy: 0.9760705289672544
Testing Accuracy: 0.9659949622166247
Confusion Matrix:
[[268   0]
 [ 27 499]]
F1 Score: 0.9736585365853657
Precision: 1.0
Classification Report:
              precision    recall  f1-score   support

          No       0.91      1.00      0.95       268
         Yes       1.00      0.95      0.97       526

    accuracy                           0.97       794
   macro avg       0.95      0.97      0.96       794
weighted avg       0.97      0.97      0.97       794



In [37]:
# Collect input data from the user
input_data = {
    'Emotion1': 'sad',
    'Emotion2': 'happy',
    'Emotion3': 'happy',
    'Emotion4': 'sad',
    'Emotion5': 'happy',
    'Emotion6': 'happy',
    'Emotion7': 'happy',
    'Emotion8': 'happy',
    'Emotion9': 'neutral',
    'Emotion10': 'happy',
    'Sentiment': 'neutral',
    'GADscore': 104,
    'HeartRate': 76,
    'OxygenLevel': 84
}

# Create a DataFrame from the input data
input_df = pd.DataFrame([input_data])

# Preprocess the input data
input_encoded = preprocessor.transform(input_df)

# Make predictions using the trained model
prediction = model.predict(input_encoded)

# Display the prediction
print("Prediction:", prediction[0])


Prediction: No
