# Empathy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
import glob
import sys

from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, LeaveOneOut, cross_val_score, GroupKFold, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, median_absolute_error, explained_variance_score

import pickle

warnings.filterwarnings("ignore")


### Import the helper functions from `utils`

In [2]:
import utils

In [None]:
from utils import process_input, summarized_eye_tracking_data
from utils import display_diameter_certainty_visuals, visualize_comparison_scores
from utils import  display_corr_heatmap, train_evaluate_model, display_score_visualization

# Read the files

In [None]:
# Collect the paths of every CSV file directly inside 'EyeTrackingData/'.
# NOTE(review): this list is not read again in the visible notebook, and the
# pipeline cell globs 'EyeT/*.csv' instead — confirm which directory is the
# real data location.
data_dir = glob.glob('EyeTrackingData/*.csv')

## Overall Pipeline

In [None]:
# Overall pipeline: read every recording, preprocess it, summarise it together
# with its experiment-group label, and stack all summaries into one DataFrame.

# File-name prefix -> experiment group label.
GROUP_LABEL_BY_PREFIX = {
    'EyeT_group_dataset_III_': 'Test group experiment',
    'EyeT_group_dataset_II_': 'Control group experiment',
}


def group_label_for_file(file_base_name):
    """Return the experiment group label encoded in a recording's file name.

    Parameters:
        file_base_name: file name without its directory part.

    Returns:
        'Test group experiment' or 'Control group experiment'.

    Raises:
        ValueError: if the name starts with none of the known prefixes.
            Failing loudly avoids labelling a file with whatever group the
            previous file happened to have.
    """
    for prefix, label in GROUP_LABEL_BY_PREFIX.items():
        if file_base_name.startswith(prefix):
            return label
    raise ValueError(f'Cannot derive the experiment group from file name: {file_base_name!r}')


# One summary DataFrame per recording file
summary_list = []
iteration_count = 0

# Sorted so that the row order of the final frame is reproducible between runs
data_files = sorted(glob.glob('EyeT/*.csv'))

# A plain for-loop also copes with an empty file list
for current_filename in data_files:
    # Resolve the label first so an unrecognised file fails before any heavy work
    group_label = group_label_for_file(os.path.basename(current_filename))

    # NOTE(review): the callable is evaluated against column *names* (strings),
    # so `col != 0` is always true and no column is dropped. Presumably meant to
    # skip a leading index column — confirm before changing.
    raw_data = pd.read_csv(current_filename, usecols=lambda col: col != 0, low_memory=True)

    # Apply preprocessing to the data
    processed_data = process_input(raw_data)

    # Summarise the *preprocessed data* (not the preprocessing function itself)
    summarized_data = summarized_eye_tracking_data(processed_data, group_label)
    summary_list.append(summarized_data)

    iteration_count += 1

# Concatenate all the DataFrames into a single DataFrame; pd.concat raises on an
# empty list, so fall back to an empty frame when no file was found.
summary_dataframe = pd.concat(summary_list, ignore_index=True) if summary_list else pd.DataFrame()

In [None]:
summary_dataframe.head()

# Exploratory Data Analysis

In [None]:
summary_dataframe.head()

###### Get the count for Control group experiment and Test group experiment

In [None]:
# Tally how many summary rows belong to each experiment group
participants_per_project = summary_dataframe['Group_Name'].value_counts()

# A group that never occurs is reported as 0 instead of raising
control_group_count, test_group_count = (
    participants_per_project.get(label, 0)
    for label in ('Control group experiment', 'Test group experiment')
)

for title, count in (('Control', control_group_count), ('Test', test_group_count)):
    print(f'Total participants in {title} group: {count}')


In [None]:
summary_dataframe.describe()

In [None]:
summary_dataframe.shape

In [None]:
summary_dataframe.info()

In [None]:
# Load the questionnaire results; the file is decoded as Windows-1252 (cp1252).
empathy_score = pd.read_csv('empathy_score/Questionnaire_datasetIB.csv', encoding='cp1252')

In [None]:
empathy_score.describe()

In [None]:
empathy_score.head()

In [None]:
# Attach the questionnaire score to every summary row: 'Participant' on the
# eye-tracking side is matched against 'Participant nr' on the questionnaire
# side. The left join keeps summary rows that have no questionnaire entry.
score_lookup = empathy_score[['Participant nr', 'Total Score extended']]
final_data = summary_dataframe.merge(
    score_lookup,
    how='left',
    left_on='Participant',
    right_on='Participant nr',
).drop(columns=['Participant nr'])  # the join key from the right side is redundant afterwards

In [None]:
final_data.describe()

In [None]:
final_data.shape

In [None]:
final_data.info()

In [None]:
# Save the DataFrame
final_data.to_csv('output_data.csv', index=False)

In [None]:
final_data.head()

In [None]:
# Scatter plot of average pupil diameter against empathy score, one colour per
# experiment group.
# NOTE(review): the original cell read the groups from 'Study Group' but filtered
# on 'Study_Group', and plotted 'Total_Empathy_Score'. The names below are the
# ones this notebook demonstrably uses elsewhere ('Group_Name' in the group-count
# cell, 'Total Score extended' in the merge) — confirm against the real frame.
group_column = 'Group_Name'
diameter_column = 'Avg_Pupil_Diameter'  # NOTE(review): not verifiable from this notebook — confirm
score_column = 'Total Score extended'

# Get unique study group names
unique_groups = final_data[group_column].unique()

# Define a color palette for the groups
color_palette = ['blue', 'red', 'green']

# Create subplots
fig, ax = plt.subplots()

# One scatter series per group; a plain loop replaces the comprehension whose
# walrus-based `is not None` filter could never be false.
scatter_plots = []
for i, group_name in enumerate(unique_groups):
    group_data = final_data[final_data[group_column] == group_name]
    scatter_plots.append(
        ax.scatter(
            group_data[diameter_column],
            group_data[score_column],
            # Wrap around so more groups than colours cannot raise IndexError
            c=color_palette[i % len(color_palette)],
            label=group_name,
        )
    )

# Add labels and title
ax.set_xlabel('Average Pupil Diameter')
ax.set_ylabel('Empathy Score')
ax.set_title('Pupil Diameter vs Empathy Score by Study Group')

# Legend is placed outside the axes, to the right
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

# Display the plot
plt.show()


In [None]:
# Scatter plot of total fixations against empathy score, one colour per
# experiment group.
# NOTE(review): the original cell used 'Project Name' and 'Total Empathy Score'.
# The names below are the ones this notebook demonstrably uses elsewhere
# ('Group_Name' in the group-count cell, 'Total Score extended' in the merge) —
# confirm against the real frame.
group_column = 'Group_Name'
fixation_column = 'Total Fixations'  # NOTE(review): not verifiable from this notebook — confirm
score_column = 'Total Score extended'

# Get unique project names
distinct_projects = final_data[group_column].unique()

# List of colors for the projects
color_palette = ['blue', 'red']

# Create subplots
fig, ax = plt.subplots()

# One scatter series per project. The original comprehension plotted
# `project_data` without ever defining it; the rows are selected here.
scatter_plots = []
for index, project in enumerate(distinct_projects):
    project_data = final_data[final_data[group_column] == project]
    scatter_plots.append(
        ax.scatter(
            project_data[fixation_column],
            project_data[score_column],
            # Wrap around so more projects than colours cannot raise IndexError
            c=color_palette[index % len(color_palette)],
            label=project,
        )
    )

# Add labels and title
ax.set_xlabel('Total Fixations')
ax.set_ylabel('Empathy Score')
ax.set_title('Total Fixations vs Empathy Score by Project')

ax.legend()

# Display the plot
plt.show()


In [None]:
# Per-participant trend of mean and median pupil diameter across occurrences,
# drawn as one subplot per participant.
# NOTE(review): the 'Occurrence' and pupil-diameter column names below cannot be
# verified from this notebook and differ from the names used in other cells —
# confirm against the real frame.

# Get unique participant names
unique_individuals = final_data['Participant'].unique()

# Limit to the first 4 participants for the trend
selected_individuals = unique_individuals[:4]

# Create subplots
num_rows = len(selected_individuals)
fig, axs = plt.subplots(nrows=num_rows, figsize=(8, 5*num_rows))
# With a single row plt.subplots returns a bare Axes, not an array; normalise so
# that indexing below works for one participant as well.
axs = np.atleast_1d(axs)

# One subplot per selected participant
for index, participant in enumerate(selected_individuals):
    # Only the first 6 rows of each participant are used
    participant_data = final_data[final_data['Participant'] == participant].head(6)

    # Average the pupil statistics of rows that share the same occurrence
    grouped_data = participant_data.groupby('Occurrence').agg({'Average_Pupil_Diameter': 'mean', 'Median_Pupil_Diameter': 'mean', 'Pupil_Diameter_StdDev': 'mean'}).reset_index()

    ax = axs[index]

    # Mean with the standard deviation drawn as error bars
    ax.errorbar(grouped_data['Occurrence'], grouped_data['Average_Pupil_Diameter'], grouped_data['Pupil_Diameter_StdDev'], linestyle='-', marker='o', capsize=5, ecolor="green", elinewidth=0.5, label='Mean')

    # Median as a second line on the same axes
    ax.plot(grouped_data['Occurrence'], grouped_data['Median_Pupil_Diameter'], linestyle='-', marker='s', label='Median')

    # Add labels and title
    ax.set_xlabel('Occurrence')
    ax.set_ylabel('Avg Pupil Diameter (mm)')
    ax.set_title(f'Mean and Median for Participant: {participant}')

    # Add legend
    ax.legend()

# Display the plot
plt.tight_layout()
plt.show()


In [None]:
# Split final_data into one DataFrame per experiment group, keyed by the group
# label ('Control group experiment' / 'Test group experiment').
# NOTE(review): the original cell grouped on 'Project_Name'. The labels that the
# following cells look up are the ones the group-count cell reads from
# 'Group_Name', so that column is used here — confirm against the real frame.
unique_projects = final_data['Group_Name'].unique()

project_data_dict = {project: final_data[final_data['Group_Name'] == project] for project in unique_projects}


In [None]:
# Fetch the control-group frame from the per-group dictionary and preview it.
target_project = 'Control group experiment'

# Direct key lookup; None signals that the group is absent
control_data_frame = project_data_dict.get(target_project)

if control_data_frame is None:
    print(f"No data found for the project: {target_project}")
else:
    # A bare `.head()` inside a branch is never rendered by the notebook, so the
    # preview is printed explicitly.
    print(control_data_frame.head())


In [None]:
# Fetch the test-group frame from the per-group dictionary and preview it.
target_project = 'Test group experiment'

# The dictionary built earlier is `project_data_dict`; None signals that the
# group is absent (the original `[...][0]` raised IndexError in that case).
selected_df = project_data_dict.get(target_project)

if selected_df is None:
    print(f"No data found for the project: {target_project}")
else:
    selected_df_head = selected_df.head()

    print(f'First few rows of the {target_project} DataFrame:')
    print(selected_df_head)


In [None]:
# Pupil-only feature subset of the control group, consumed by the pupil model
# cell further down under the name `pupil_control_df`.
# The original indexed `target_project`, which is a string at this point, so the
# control-group DataFrame is used instead.
# NOTE(review): 'Participant', 'Group_Name' and 'Total Score extended' are the
# names this notebook uses in its merge and group-count cells; the pupil and
# session column names are kept as written and cannot be verified from here —
# confirm against the real frame.
pupil_control_df = control_data_frame[['Participant', 'Average Pupil Diameter', 'Median Pupil Diameter', 'Pupil Diameter Standard Deviation', 'Group_Name', 'Session Name', 'Total Score extended']].copy()
pupil_analysis_df = pupil_control_df  # original variable name kept as an alias
pupil_control_df.head()

In [None]:
# Pupil-only feature subset of the test group, consumed by the pupil model cell
# further down under the name `pupil_test_df`.
# The original indexed `target_project`, which is a string at this point, so the
# test-group DataFrame is used instead.
# NOTE(review): that this cell is the test-group counterpart is inferred from
# the later use of `pupil_test_df` — confirm. 'Participant', 'Group_Name' and
# 'Total Score extended' are the names this notebook uses in its merge and
# group-count cells; the pupil and session column names are kept as written and
# cannot be verified from here.
pupil_test_df = selected_df[['Participant', 'Average Pupil Size', 'Median Pupil Size', 'Pupil Size StdDev', 'Group_Name', 'Session', 'Total Score extended']].copy()
pupil_metrics_df = pupil_test_df  # original variable name kept as an alias
pupil_test_df.head()


In [None]:
# Correlation heatmap for the control group against the empathy score.
# `target_project` holds a group label string at this point, so the control
# DataFrame is passed instead.
display_corr_heatmap(control_data_frame, 'Total Score extended', top_n=15)

In [None]:
# Correlation heatmap for the test group against the empathy score.
# The full test-group frame is used; `selected_df_head` is only its 5-row preview.
display_corr_heatmap(selected_df, 'Total Score extended', top_n=15)

# Modeling

In [None]:
# Train and evaluate the models on the control group.
# `target_project` holds a group label string at this point, so the control
# DataFrame is passed instead.
control_group_results = train_evaluate_model(control_data_frame, "Control Group")

In [None]:
control_group_results.info()

In [None]:
visualize_comparison_scores(control_group_results)

# Test Group

In [None]:
# Train and evaluate the models on the test group.
# The full test-group frame is used; `selected_df_head` is only its 5-row preview.
test_group_results = train_evaluate_model(selected_df, "Test Group")

In [None]:
visualize_comparison_scores(test_group_results)

In [None]:
# Compare model scores for the test group. `results_test` is never assigned in
# this notebook; the test-group results are stored in `test_group_results`.
visualize_comparison_scores(test_group_results)

In [None]:
# Visualise the scores for the control-group models.
# `visualize_empathy_scores` and `results_control` are never imported/assigned in
# this notebook; the control results are stored in `control_group_results`.
# NOTE(review): `display_score_visualization` is the imported-but-unused helper
# from utils and is presumed to be the intended function — confirm.
display_score_visualization(control_group_results)

In [None]:
# Train and evaluate the models on the control group's pupil-only features.
# NOTE(review): assumes `pupil_control_df` was built in the pupil-feature cell
# of the control group — confirm it is assigned before this cell runs.
control_group_pupil_result = train_evaluate_model(pupil_control_df, "Control Group")

In [None]:
# Visualise the scores for the control-group pupil-only models.
# `pupil_results_control` is never assigned; the preceding cell stores its result
# in `control_group_pupil_result`.
# NOTE(review): `visualize_empathy_scores` is never imported;
# `display_score_visualization` is the imported-but-unused helper from utils and
# is presumed to be the intended function — confirm.
display_score_visualization(control_group_pupil_result)

# Test Group

In [None]:
# Train and evaluate the models on the test group's pupil-only features.
# `train_and_evaluate` is never imported; the helper imported from utils and
# used by the other modeling cells is `train_evaluate_model`.
# NOTE(review): assumes `pupil_test_df` was built in the pupil-feature cell of
# the test group — confirm it is assigned before this cell runs.
pupil_results_test = train_evaluate_model(pupil_test_df, "Test Group")

In [None]:
# Visualise the scores for the test-group pupil-only models.
# NOTE(review): `visualize_empathy_scores` is never imported;
# `display_score_visualization` is the imported-but-unused helper from utils and
# is presumed to be the intended function — confirm.
display_score_visualization(pupil_results_test)