# Data variables

In [None]:
# Notebook-wide settings, read by the cells below.
pathdf = r'content/combined_lagEDA.csv' # Path to the main dataset (CSV file)
Yvar = 'Stress' # Name of the target column (Y) to predict
aggtype = ['mean', 'std'] # Aggregations applied per target class in the grouped summary
palette = 'flare' # Name of the seaborn colour palette used for the plots
seed = 49 # Random seed intended for reproducible results
test_size = 0.30 # Fraction of the data intended for the test split

## libraries   

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [None]:
# Build a 10-colour list from the configured palette; as the last expression
# of the cell, its value is what the notebook displays.
sns.color_palette(palette, 10)

## Overview

In [None]:
# Load the main dataset from `pathdf` into a DataFrame and print it.
# NOTE(review): the original note reports 12445 rows x 49 columns; confirm against the current CSV.
df = pd.read_csv(pathdf)  
print(df)

In [None]:
# Preview the first rows of the full dataset (all columns, before any column selection).
df.head()  

## Secondary Data for this project

In [None]:
# Keep only the columns used in this project: seven feature columns plus the 'Stress' column.
# `df` is rebound to this subset, so every later cell works on the reduced frame.
selected_columns = ['EDAR_Mean', 'EDAR_Std', 'Num_PeaksR', 'HRR_Mean', 'HRR_Std', 'TEMPR_Mean', 'TEMPR_Std', 'Stress']
df = df[selected_columns]
df.head()

In [None]:
# (number of rows, number of columns) of the DataFrame
df.shape 

## Descriptive statistics of the data

### Univariate

In [None]:
# Summary statistics for each column, transposed so that every column becomes one row.
df.describe().T

In [None]:
# Count the missing (NaN) values in each column of the DataFrame.
df.isna().sum()

In [None]:
# Print a concise summary of the DataFrame (columns, non-null counts, dtypes).
df.info()

In [None]:
# Per-class summary: group the rows by the target column (`Yvar`) and apply
# every aggregation listed in `aggtype` to each remaining column.
df.groupby(Yvar).agg(aggtype)

### Bivariate

In [None]:
# Pairwise correlation matrix of all columns in `df`.
df.corr()

In [None]:
# Correlation matrix computed separately within each group of the target column (`Yvar`).
df.groupby(Yvar).corr() 

## Visualization step 

In [None]:
# Correlation heatmap drawn on an explicit 15x8 inch figure/axes pair.
fig, ax = plt.subplots(figsize=(15, 8))

# Annotate every (square) cell with its correlation value, rounded to one decimal place.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, square=True, fmt='.1f', ax=ax)

# Title in font size 20, then render the figure.
ax.set_title('Correlation', fontsize=20)
plt.show()


In [None]:
# Pairwise scatter plots of all columns, coloured by the target class (`Yvar`) with the configured palette.
# `sns.pairplot` is a figure-level function that creates its own figure, so no `plt.figure(...)`
# is opened beforehand: that call only produced an extra empty figure and did not size the pair plot.
sns.pairplot(df, hue=Yvar, palette=palette)
plt.show()

In [None]:
# Histograms (with a KDE overlay) of every feature column, one subplot per feature.
# One colour per feature; `zip` below stops at the shorter of the two sequences.
colors = ['#eab889', '#8cd0eb', '#e28389', '#7ebf88', '#d989e2', '#bfe288', '#89d0e2']

# Feature-only copy of the data: `drop` returns a new DataFrame, so `df` itself
# (including its column names) is left untouched. `data` is reused by the box-plot cell.
data = df.drop(Yvar, axis=1)

# 25x15 inch figure with font size 18.
# Note: rcParams is a global setting, so the font size also applies to later plots.
plt.figure(figsize=(25, 15))
plt.rcParams['font.size'] = 18

# Place the histograms on a 3x3 grid, in column order.
for idx, (column, color) in enumerate(zip(data.columns, colors)):
    plt.subplot(3, 3, idx + 1)
    sns.histplot(data[column], bins=15, color=color, kde=True)
    plt.title(column)
plt.tight_layout()
plt.show()


In [None]:
# Box plot of every feature column in `data`; separate subplots with independent
# axes so that each feature keeps its own scale.
# The layout is a single row with exactly one slot per column, so all axes stay
# inside the figure bounds and nothing is clipped when the figure is saved.
data.plot(kind='box', subplots=True, layout=(1, len(data.columns)), figsize=(14, 7), sharex=False, sharey=False)

# Let matplotlib space the subplots within the figure, then render it.
plt.tight_layout()
plt.show()


In [None]:
# Number of samples per target class.
# The column is passed by keyword (`x=..., data=...`): in current seaborn the first
# positional parameter of `countplot` is `data`, so a positionally passed Series
# is not counted per class.
sns.countplot(x=Yvar, data=df)
plt.show()

## The Classification

### Exploration of this model 

Simply attempting to implement an ML algorithm (without optimization or an ideal selection)

#### K-NN
- **Keyword** : straightforward, comprehensible, flexible, ML
- **utilization** : detecting handwriting, recognizing images, identifying video content
- **Strengths**: valuable when labeled data is costly or unattainable, provides high accuracy
- **Weaknesses**: struggles with a large number of features (requires more data, leading to a risk of overfitting)

In this case, it's suitable since we don't have an excessive number of features.

In [None]:
# Build the feature matrix X (every column except the target) and the target vector Y.
X = df.drop(Yvar, axis=1)
Y = df[Yvar]

# Stratified hold-out split: class proportions of Y are preserved in both parts.
# The test fraction and the random seed come from the notebook-level settings
# (`test_size`, `seed`) instead of separate hard-coded values, so the configuration
# cell is the single place that controls the split.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, stratify=Y, random_state=seed)

In [None]:
# Create a K-Nearest Neighbours (KNN) classifier that uses the 3 nearest neighbours.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Fit the KNN classifier on the training split.
knn.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline KNN model on the test split.
accuracy_score(y_test,knn.predict(X_test))

#### KNN with Feature Selection
KNN often performs better with fewer features, especially since the features in this dataset appear to be correlated. To reduce the dimensionality, we will use Recursive Feature Elimination (RFE) to decrease the number of features to 2.

In [None]:
# Recursive Feature Elimination (RFE) that keeps the top 2 features, ranked by a decision tree.
# The tree gets a fixed `random_state` (the notebook-level `seed`) so that the
# selected features are the same on every run.
rfe = RFE(estimator=DecisionTreeClassifier(random_state=seed), n_features_to_select=2)

In [None]:
# Pipeline: RFE feature selection followed by KNN classification.
# The pipeline gets its own, unfitted KNN with the same k as `knn`. Fitting a pipeline
# fits its step objects in place, so reusing `knn` would overwrite the baseline model
# with one trained on the selected features only.
pipeline = Pipeline(steps=[('RFE', rfe), ('KNN', KNeighborsClassifier(n_neighbors=knn.n_neighbors))])

In [None]:
# Fit the RFE + KNN pipeline on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Accuracy of the RFE + KNN pipeline on the test split.
accuracy_score(y_test,pipeline.predict(X_test))

## Cross-validated comparison of KNN, Naive Bayes and Decision Tree models

In [None]:
# Compare KNN, Gaussian Naive Bayes and a decision tree with 5-fold stratified
# cross-validation on the training split.
# The two lists are paired element-wise (k=5 with max_depth=20, k=10 with 40, ...);
# this is NOT a full grid over all combinations.
k = [5, 10, 15]
max_depth = [20, 40, 60]

# One shared splitter: with a fixed random_state every model is scored on the same folds.
kfold = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)

for n_neighbors, depth in zip(k, max_depth):
    # Naive Bayes has no parameter varied here; it is re-scored in each round for side-by-side output.
    models = [
        ('KNN', KNeighborsClassifier(n_neighbors=n_neighbors)),
        ('NB', GaussianNB()),
        ('DT', DecisionTreeClassifier(max_depth=depth, random_state=101)),
    ]
    # Scores and names of the current round (reset for every parameter pair).
    results = []
    names = []
    print("K:", n_neighbors, "and max_depth:", depth)
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
        results.append(cv_results)
        names.append(name)
        # Mean accuracy over the folds, with the standard deviation in parentheses.
        print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
    print("")

In [None]:
# KNN classifier with 5 neighbours, trained on the training split
# (`fit` returns the estimator itself, so construction and training are chained).
model_KNN = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predicted labels for the test split.
predict_KNN = model_KNN.predict(X_test)


In [None]:
# Gaussian Naive Bayes classifier, trained on the training split
# (`fit` returns the estimator itself, so construction and training are chained).
model_NB = GaussianNB().fit(X_train, y_train)

# Predicted labels for the test split.
predict_NB = model_NB.predict(X_test)


In [None]:
# Decision tree (max_depth=40, random_state=101), trained on the training split
# (`fit` returns the estimator itself, so construction and training are chained).
model_DT = DecisionTreeClassifier(max_depth=40, random_state=101).fit(X_train, y_train)

# Predicted labels for the test split.
predict_DT = model_DT.predict(X_test)


In [None]:
# Confusion matrix of the KNN predictions against the true test labels.
# Stored as `cm_knn` so the `confusion_matrix` function imported from sklearn.metrics
# is not shadowed. Rows/columns follow the classes the model was trained on, and the
# same classes label the plot instead of a hard-coded list.
cm_knn = metrics.confusion_matrix(y_test, predict_KNN, labels=model_KNN.classes_)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_knn, display_labels=model_KNN.classes_)

# Plot the confusion matrix.
cm_display.plot()
plt.show()


In [None]:
# Print the classification report for the KNN predictions on the test split.
print(classification_report(y_test, predict_KNN))

In [None]:
# Confusion matrix of the Naive Bayes predictions against the true test labels.
# Stored as `cm_nb` so the `confusion_matrix` function imported from sklearn.metrics
# is not shadowed; the classes seen by the model order the matrix and label the plot.
cm_nb = metrics.confusion_matrix(y_test, predict_NB, labels=model_NB.classes_)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_nb, display_labels=model_NB.classes_)
cm_display.plot()
plt.show()

In [None]:
# Print the classification report for the Naive Bayes predictions on the test split.
print(classification_report(y_test, predict_NB))

In [None]:
# Confusion matrix of the decision tree predictions against the true test labels.
# Stored as `cm_dt` so the `confusion_matrix` function imported from sklearn.metrics
# is not shadowed; the classes seen by the model order the matrix and label the plot.
cm_dt = metrics.confusion_matrix(y_test, predict_DT, labels=model_DT.classes_)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm_dt, display_labels=model_DT.classes_)
cm_display.plot()
plt.show()

In [None]:
# Print the classification report for the decision tree predictions on the test split.
print(classification_report(y_test, predict_DT))

###

### Thank you