# Avatar questionnaire processing  

## Import packages 

In [1]:
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import ptitprince as pt
from statsmodels.stats.anova import *
from functools import reduce

## Function creation

1. extraRow: Creates a function that takes:
 - **df** = a data frame and checks if any of the elements of 
 - **yourlist** have a partial match with the content of the values of 
 - **variable** your variable of interest, if so it assigns the correspondedn matching value to 
 - **newvariable** the name of variable you want as an output 


In [2]:
def extraRow(df,yourlist,variable,newVariable):
    '''Creates a new variable that contains matching data from preexiting variable
    
Parameters
-------------
     
     df: your data frame,
     yourlist: List of values you want to target for the partial match search
     variable: Variable that may cointain the elements on yourlist
     newvariable: Name for  New Variable 
     
returns 
----------

    DataFrame
     '''
    for idx, row in df.iterrows():
        for l in yourlist:
            if l in row[variable]:
                df.loc[idx, newVariable] = l

## Preprocessing 

In [3]:
# Data set loading
df_file = '/Users/tracysanchezpacheco/Desktop/AvaQuest.csv'
df = pd.read_csv(df_file, sep= ',', header=None)
df.head()

Unnamed: 0,0,1,2,3
0,1.0,Remember_01_CmA,1.0,9627.0
1,1.0,Realistic_01_CmA,5.0,9627.0
2,1.0,Attractive_01_CmA,1.0,9627.0
3,2.0,Remember_29_Sa,1.0,9627.0
4,2.0,Realistic_29_Sa,4.0,9627.0


In [4]:
df = df.rename({0: 'order', 1: 'item', 2: 'likert', 3: 'ID'}, axis=1)
df.head()

Unnamed: 0,order,item,likert,ID
0,1.0,Remember_01_CmA,1.0,9627.0
1,1.0,Realistic_01_CmA,5.0,9627.0
2,1.0,Attractive_01_CmA,1.0,9627.0
3,2.0,Remember_29_Sa,1.0,9627.0
4,2.0,Realistic_29_Sa,4.0,9627.0


In [5]:
#Initial exploration of the data set 
df.item = df['item'].astype('category')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3720 entries, 0 to 3719
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   order   3696 non-null   float64 
 1   item    3696 non-null   category
 2   likert  3697 non-null   float64 
 3   ID      3696 non-null   float64 
dtypes: category(1), float64(3)
memory usage: 100.0 KB


In [6]:
#Creating Clarity Categorical variable 
df['avatarCat'] = df['item'].str.contains('Sa')
df.loc[:, 'avatarCategory'] = df['avatarCat'].replace({True: 'Passive', False: 'Active'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3720 entries, 0 to 3719
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   order           3696 non-null   float64 
 1   item            3696 non-null   category
 2   likert          3697 non-null   float64 
 3   ID              3696 non-null   float64 
 4   avatarCat       3696 non-null   object  
 5   avatarCategory  3696 non-null   object  
dtypes: category(1), float64(3), object(2)
memory usage: 158.1+ KB


In [7]:
df.head()

Unnamed: 0,order,item,likert,ID,avatarCat,avatarCategory
0,1.0,Remember_01_CmA,1.0,9627.0,False,Active
1,1.0,Realistic_01_CmA,5.0,9627.0,False,Active
2,1.0,Attractive_01_CmA,1.0,9627.0,False,Active
3,2.0,Remember_29_Sa,1.0,9627.0,True,Passive
4,2.0,Realistic_29_Sa,4.0,9627.0,True,Passive


In [8]:
#Checking that value counts for Action and Standing avatars corresponds with the number of participants
df.avatarCat.value_counts()

False    1848
True     1848
Name: avatarCat, dtype: int64

In [9]:
#List of values for iteration search 
itemCat = ['Remember','Realistic', 'Attractive']

#Searching for a partial string 'ItemCat' in variable 'Item' to creat a new variable 'ItemCat'
extraRow(df,itemCat,'item','itemCategory')
df.tail(5)

TypeError: argument of type 'float' is not iterable

In [None]:
#Extracting the Avatar number from the variable 'Item'
AvaIDs = []
for idx, row in df.iterrows():
    num = re.findall(r'\d+', row['item']) 
    AvaIDs.append(num)
    #Flatten the list: Every number was returned as a list of it's own with this we get one unified list
    avatarIDsf = [item for sublist in AvaIDs for item in sublist]


In [None]:
#Searching for a partial string contained in the lists "AvatarIDsf" in variable "Item" to creat a new variable "AvatarID"
extraRow(df,avatarIDsf,'item','avatarID')
df.head()

In [None]:
wide = df.pivot(index='ID', columns='item', values='likert')

In [None]:
os.chdir('/Volumes/Extreme SSD/HumanA/Data/Data_Tracy/Questionnaires/Adjusted')
df.to_csv('CompleteLargeAvatarQues.cvs')

# Descriptives 

In [None]:
#Average values Support and clarity 
df.groupby([ 'ID', 'avatarCategory','itemCategory'])['likert'].mean()

In [None]:
#Average values Support and clarity 
df.groupby(['ID','avatarCategory','itemCategory'])['likert'].describe().sort_values(by='mean')

In [None]:
AvatarIDsL = df.pivot_table('likert',  'avatarID').sort_values(by='likert')
AvatarIDsL

## Data Visualization

In [None]:
# General seaborn design settings
sns.set_palette("colorblind")
sns.set_style("white")
sns.set(rc={"figure.figsize":(12, 8)})
sns.set(font_scale = 1.3)

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16,4))

sns.violinplot(data= df, x='avatarCategory', y='likert', ax=axes[0])
axes[0].set_xticklabels(['Meaningful', 'Not meaningful'])
axes[0].set_xlabel('Agent Category')
axes[0].set_ylabel('Average likert rating across all items')

sns.violinplot(data= df, x='itemCategory', y='likert', hue= 'avatarCategory',  ax=axes[1])
#Getting the legend components in order to not lose color when reassigning labels
handles, labels = axes[1].get_legend_handles_labels()
labels = ['Meaningful', 'Not meaningful']
axes[1].legend(handles, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title='Agent Category')
axes[1].set_xticklabels(['Easy to remember', 'Realistic', 'Attractive'])
axes[1].set_ylabel('Average likert rating')
axes[1].set_xlabel('Question category')

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(16,4))

sns.boxplot(data= df, x='avatarCategory', y='likert',   medianprops=dict(color="black", alpha=0.7), ax=axes[0])
axes[0].set_xticklabels(['Active', 'Passive'])
axes[0].set_xlabel('Agent Category')
axes[0].set_ylabel('Average likert rating across all items')

sns.boxplot(data= df, x='itemCategory', y='likert', hue= 'avatarCategory', medianprops=dict(color="black", alpha=0.7), ax=axes[1])
#Getting the legend components in order to not lose color when reassigning labels
handles, labels = axes[1].get_legend_handles_labels()
labels = ['Active', 'Passive']
axes[1].legend(handles, labels, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0., title='Agent Category')
axes[1].set_xticklabels(['Easy to remember', 'Realistic', 'Attractive'])
axes[1].set_ylabel('Average likert rating')
axes[1].set_xlabel('Question category')

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(18,10))
sns.boxplot(data= df, x='avatarID', y='likert')
plt.yticks(rotation=90)
plt.show()

In [None]:
dy='itemCategory'; dx='likert'; ort='h'; pal = sns.color_palette(); sigma = .2
f, ax = plt.subplots(figsize=(7, 5))

ax=pt.half_violinplot( x = dx, y = dy, data = df, palette = pal, bw = .2, cut = 0.,
                      scale = 'area', width = .6, inner = None, orient = ort)
ax=sns.stripplot( x = dx, y = dy, data = df, palette = pal, edgecolor = 'white',
                 size = 3, jitter = 1, zorder = 0, orient = ort)
ax=sns.boxplot( x = dx, y = dy, data = df, color = 'black', width = .15, zorder = 10,\
            showcaps = True, boxprops = {'facecolor':'none', 'zorder':10},\
            showfliers=True, whiskerprops = {'linewidth':2, 'zorder':10},\
               saturation = 1, orient = ort) 

## Data analysis

In [None]:
df.head()

In [None]:
from sklearn.datasets import load_iris
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
from factor_analyzer.factor_analyzer import calculate_kmo

#Droppin Nas
wide.dropna(inplace=True);

#Adecuacy test

#Barlets sphericity: A significative p value indicates the date is different from an identity matrix
chi_square_value, p_value = calculate_bartlett_sphericity(wide)
print(chi_square_value, p_value)

# Kaiser-Meyer-Olkin (KMO): Estimates the proportion of variance among all observed variables, less than 0.6 is inadequate
kmo_all, kmo_model = calculate_kmo(wide)
print(kmo_model)
#Create factor analysis object and perform analysis
fa = FactorAnalyzer(rotation='varimax')
fa.fit(wide)

loadings = fa.loadings_

# Get the eigenvector
ev, v = fa.get_eigenvalues()

xvals = range(1, wide.shape[1] + 1)
plt.scatter(xvals, ev)
plt.plot(xvals, ev)
plt.title('Scree Plot')
plt.xlabel('Factor')
plt.ylabel('Eigenvalue')
plt.grid()
plt.show()

In [None]:
wide = pd.DataFrame(wide)
wide.describe()

In [None]:
kmo_model

In [None]:
from statsmodels.stats.anova import AnovaRM
help(AnovaRM)

In [None]:
df.boxplot(["likert"], by = ["avatarCategory", "itemCategory"],
           figsize = (16, 9),
           showmeans = True,
           notch = True)

In [None]:
# Separete data set into question category's
df1 = df[df['itemCategory']== 'Remember']
df2 = df[df['itemCategory']== 'Attractive']
df3 = df[df['itemCategory']== 'Realistic']
df1.head()

In [None]:
#Sum all likert punctuations for the question on how well they remember by participant and avatar category
Topivot = pd.DataFrame(df1.groupby([ 'ID', 'avatarCategory'])['likert'].mean())
#Reformat to a wide dataframe
AvatarSum = pd.DataFrame(Topivot.pivot_table(index='ID', columns='avatarCategory', values='likert'))
AvatarSum.reset_index(drop=True)

In [None]:
AvatarSum.rename(columns={
    AvatarSum.columns[0]: 'remember_action_likert',
    AvatarSum.columns[1]: 'remember_standing_likert'}, inplace=True)

In [None]:
#Sum all likert punctuations for the question on how well they remember by participant and avatar category
Topivot1 = pd.DataFrame(df2.groupby([ 'ID', 'avatarCategory'])['likert'].mean())
#Reformat to a wide dataframe
AvatarSum1 = pd.DataFrame(Topivot1.pivot_table(index='ID', columns='avatarCategory', values='likert'))
AvatarSum1.reset_index(drop=True)

In [None]:
AvatarSum1.rename(columns={
    AvatarSum1.columns[0]: 'attractive_action_likert',
    AvatarSum1.columns[1]: 'attractive_standing_likert'}, inplace=True)

In [None]:
#Sum all likert punctuations for the question on how well they remember by participant and avatar category
Topivot2 = pd.DataFrame(df3.groupby([ 'ID', 'avatarCategory'])['likert'].mean())
#Reformat to a wide dataframe
AvatarSum2 = pd.DataFrame(Topivot2.pivot_table(index='ID', columns='avatarCategory', values='likert'))
AvatarSum2.reset_index(drop=True)

In [None]:
AvatarSum2.rename(columns={
    AvatarSum2.columns[0]: 'realistic_action_likert',
    AvatarSum2.columns[1]: 'realistic_standing_likert'}, inplace=True)

In [None]:
data_frames = [AvatarSum, AvatarSum1, AvatarSum2]
AgentQuest = reduce(lambda  left,right: pd.merge(left,right,on=['ID'],
                                                         how='outer'), data_frames)
AgentQuest.head()

In [None]:
AgentQuest.to_csv('AgentQuest.csv')
wide.to_csv('AvatarQuestWide.csv')

In [None]:
print(AnovaRM(data=df, depvar='likert', subject='ID', within=['avatarCategory', 'itemCategory'], aggregate_func = 'median').fit())

In [None]:
from statsmodels.formula.api import ols, glm
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Define model formula
formula = 'likert  ~ C(avatarCategory, Treatment(reference="Passive")) + C(itemCategory, Treatment(reference="Realistic")) + C(avatarCategory, Treatment(reference="Passive")):C(itemCategory, Treatment(reference="Realistic"))'

# Define and fit a linear regression model
model_LM = smf.mixedlm(formula = formula, data = df, groups=df['ID']).fit()
print(model_LM.summary())
model_LM.t_test(np.array([[0,1,1,0,0,0]]))

In [None]:
ax = sns.boxplot(x = model_LM.model.groups, y = model_LM.resid)

ax.set_title("Distribution of Residuals per subject")
ax.set_ylabel("Residuals")
ax.set_xlabel("Subject")
plt.xticks(rotation = 45)

In [None]:
df['AgentCat_ItemCat'] =  df['avatarCategory'] + '_' + df['itemCategory']
df.head()

In [None]:
df_Dunn = df[['AgentCat_ItemCat', 'likert']]

In [None]:
import scikit_posthocs as sp
posthoc = sp.posthoc_dunn(df_Dunn, val_col='likert', group_col='AgentCat_ItemCat', p_adjust = 'bonferroni')
posthoc

def highlight_sig(s):
    is_sig = s < 0.0019
    return ['background-color: cyan' if v else '' for v in is_sig]
posthoc.style.apply(highlight_sig)