# Use of smartphones and People with Computer Anxiety

## Dataset

**Description:** Dataset containing metrics and scale results from a field study performed at @CRECI. The study aims at supporting personalization features for people with Computer Anxiety (PwCA). The analyses use data from interaction logs to identify levels of Computer Anxiety (CA).

**Goal:** Identify attributes to support regression for CARS values and classification using groups of PwCA.

In [None]:
import pandas as pd
import numpy as np
import imblearn

from matplotlib import pyplot as plt
from sklearn import preprocessing
from scipy.stats import ttest_ind
from scipy.stats import mannwhitneyu
from scipy.stats import shapiro

from ipyfilechooser import FileChooser
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import RandomOverSampler

## Loading Excel File

In [None]:
# Let the user pick the Excel workbook interactively.
# ipyfilechooser: https://pypi.org/project/ipyfilechooser/
import os

# The preferred start directory only exists on one author's machine; fall
# back to the current working directory so the chooser opens elsewhere too.
_default_dir = 'C:/Users/thiag/Documents/Doutorado/CHI 2021'
fc = FileChooser(_default_dir if os.path.isdir(_default_dir) else os.getcwd())
display(fc)
# Example workbook location on another machine:
# /Users/vagsant/Documents/supervision/current/thiago santos/data/elementos-analise-video-graficos2.xlsx

In [None]:
# Echo the path currently selected in the file chooser.
print(fc.selected)

In [None]:
# Load the 'Todos os dados' worksheet of the selected workbook into `df`.
if not fc.selected:
    raise ValueError(
        'No Excel file selected: pick one in the file chooser and re-run this cell.')

# The context manager closes the workbook handle once the sheet is read.
with pd.ExcelFile(fc.selected) as xlsx:
    df = pd.read_excel(xlsx, sheet_name='Todos os dados')

## Comparing Age vs. CARS

In [None]:
# Partition the rows by CARS score: low (< 34), moderate (34 to 47, both
# bounds included) and high (> 47).
cars_score = df['CARS']
df_low = df[cars_score < 34]
df_moderate = df[cars_score.between(34, 47)]
df_high = df[cars_score > 47]

In [None]:
# Number of rows in the low-CARS group.
len( df_low )

In [None]:
# Number of rows in the moderate-CARS group.
len( df_moderate )

In [None]:
# Number of rows in the high-CARS group.
len( df_high )

In [None]:
# Scatter plot of CARS score against age, one series per CARS group.
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
# `norm` is meant for a Normalize instance used with colour-mapped data; the
# original passed booleans and no colour array, so the argument is dropped.
for group, label in ((df_low, 'Low CARS'),
                     (df_moderate, 'Moderate CARS'),
                     (df_high, 'High CARS')):
    ax.scatter(group['Age'], group['CARS'], label=label)
ax.legend()
# Age is on the x axis; the original put this text in the title and left
# the x axis unlabeled.
ax.set_xlabel('Age (years)')
ax.set_ylabel('CARS')
ax.set_title('Age vs. CARS')
plt.show()

In [None]:
# Shapiro-Wilk normality test on Age and CARS; only the p-values are shown.
swtest, p_age = shapiro(df['Age'])
swtest, p_cars = shapiro(df['CARS'])

for column, p_value in (('Age', p_age), ('CARS', p_cars)):
    print('Shapiro-Wilk ({}) p-value: {:.5f}'.format(column, p_value))

In [None]:
# Pairwise Mann-Whitney U tests on Age between the three CARS groups.
k = 'Age'
# `alternative` is given explicitly: its default is not the same in every
# SciPy release, so relying on it makes the p-values version-dependent.
utest, p_lm = mannwhitneyu(df_low[k], df_moderate[k], alternative='two-sided')
print(utest)
utest, p_lh = mannwhitneyu(df_low[k], df_high[k], alternative='two-sided')
print(utest)
utest, p_mh = mannwhitneyu(df_moderate[k], df_high[k], alternative='two-sided')
print(utest)

# NOTE(review): three pairwise tests are reported without a
# multiple-comparison correction; consider one before interpreting them.
print('Low CARS vs. Moderate CARS p-value {:.5f}'.format(p_lm))
print('Low CARS vs. High CARS p-value {:.5f}'.format(p_lh))
print('Moderate CARS vs. High CARS p-value {:.5f}'.format(p_mh))


In [None]:
# Box plot of Age for each CARS group.
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.boxplot([df_low['Age'], df_moderate['Age'], df_high['Age']])
# No ax.legend() here: nothing drawn by boxplot carries a label, so the call
# only produced a warning. The tick labels identify the groups.
ax.set_xticklabels(['Low CARS', 'Moderate CARS', 'High CARS'])
ax.set_ylabel('Age')
ax.set_title('Age vs. CARS groups')
plt.show()

In [None]:
# Box plot of the STAI-T score for each CARS group.
k = 'STAI-T'
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.boxplot([df_low[k], df_moderate[k], df_high[k]])
# No ax.legend() here: nothing drawn by boxplot carries a label, so the call
# only produced a warning. The tick labels identify the groups.
ax.set_xticklabels(['Low CARS', 'Moderate CARS', 'High CARS'])
ax.set_ylabel(k)
ax.set_title(k + ' vs. CARS groups')
plt.show()

In [None]:
# Box plot of the STAI-E score for each CARS group.
k = 'STAI-E'
fig, ax = plt.subplots()
fig.set_size_inches(18.5, 10.5)
ax.boxplot([df_low[k], df_moderate[k], df_high[k]])
# No ax.legend() here: nothing drawn by boxplot carries a label, so the call
# only produced a warning. The tick labels identify the groups.
ax.set_xticklabels(['Low CARS', 'Moderate CARS', 'High CARS'])
ax.set_ylabel(k)
ax.set_title(k + ' vs. CARS groups')
plt.show()

In [None]:
# Shapiro-Wilk normality test on both STAI scales; only p-values are shown.
swtest, p_stai_t = shapiro(df['STAI-T'])
swtest, p_stai_e = shapiro(df['STAI-E'])

for column, p_value in (('STAI-T', p_stai_t), ('STAI-E', p_stai_e)):
    print('Shapiro-Wilk ({}) p-value: {:.5f}'.format(column, p_value))

In [None]:
# Pairwise Mann-Whitney U tests on STAI-T between the three CARS groups.
k = 'STAI-T'
# `alternative` is given explicitly: its default is not the same in every
# SciPy release, so relying on it makes the p-values version-dependent.
utest, p_lm = mannwhitneyu(df_low[k], df_moderate[k], alternative='two-sided')
utest, p_lh = mannwhitneyu(df_low[k], df_high[k], alternative='two-sided')
utest, p_mh = mannwhitneyu(df_moderate[k], df_high[k], alternative='two-sided')

# NOTE(review): no multiple-comparison correction is applied to these tests.
print('Low CARS vs. Moderate CARS p-value {:.5f}'.format(p_lm))
print('Low CARS vs. High CARS p-value {:.5f}'.format(p_lh))
print('Moderate CARS vs. High CARS p-value {:.5f}'.format(p_mh))

In [None]:
# Pairwise Mann-Whitney U tests on STAI-E between the three CARS groups.
k = 'STAI-E'
# `alternative` is given explicitly: its default is not the same in every
# SciPy release, so relying on it makes the p-values version-dependent.
utest, p_lm = mannwhitneyu(df_low[k], df_moderate[k], alternative='two-sided')
utest, p_lh = mannwhitneyu(df_low[k], df_high[k], alternative='two-sided')
utest, p_mh = mannwhitneyu(df_moderate[k], df_high[k], alternative='two-sided')

# NOTE(review): no multiple-comparison correction is applied to these tests.
print('Low CARS vs. Moderate CARS p-value {:.5f}'.format(p_lm))
print('Low CARS vs. High CARS p-value {:.5f}'.format(p_lh))
print('Moderate CARS vs. High CARS p-value {:.5f}'.format(p_mh))

# Building a classifier after resampling CARS groups

In [None]:
# Features for the classifier: columns that could also be recorded in a
# mobile setting, selected by position. Intended columns, in order
# (presumably matching the positions below; verify against the worksheet):
#   Task Time(sec), Clicks Number, DB Clicks Number,
#   MEAN CLICK DURATION (sec), TYPING VELOCITY (key/min),
#   TOTAL TIME TYPING (sec)
mobile_feature_positions = [14, 21, 22, 23, 32, 33]
X = df.iloc[:, mobile_feature_positions]

# Target: the CLASS column.
y = df['CLASS']

In [None]:
# Balance the classes of `y` by random oversampling.
# 'not majority' raises every class except the largest to the majority count
# in a single pass; the original ran 'minority' twice, which only balances a
# three-class target. random_state makes the drawn duplicates reproducible.
oversample = RandomOverSampler(sampling_strategy='not majority', random_state=0)
print(len(y))
X_over, y_over = oversample.fit_resample(X, y)
print(len(y_over))
# NOTE(review): a train/test split taken from X_over / y_over can place
# copies of the same row in both sets.

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Split the ORIGINAL rows first and oversample only the training part.
# Splitting the already-oversampled X_over / y_over puts duplicates of the
# same row in both sets, which inflates every metric computed on the test set.
X_train_raw, X_test_over, y_train_raw, y_test_over = train_test_split(
    X, y, test_size=0.3, random_state=0)
# X_test_over / y_test_over keep their names for the following cells but now
# hold untouched rows only.
X_train_over, y_train_over = RandomOverSampler(
    sampling_strategy='not majority', random_state=0
).fit_resample(X_train_raw, y_train_raw)

In [None]:
import xgboost as xgb
from xgboost import XGBClassifier

# XGBoost classifier with default hyper-parameters, fitted on the training
# split.
classifier = XGBClassifier()
classifier.fit(X_train_over, y_train_over)

In [None]:
# Predicted class for every row of the test split.
y_pred_over = classifier.predict(X_test_over)

In [None]:
# Element-wise comparison of true and predicted labels (True = correct).
y_test_over == y_pred_over

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of true vs. predicted labels on the test split; the bare
# `cm` on the last line displays it in the notebook.
cm = confusion_matrix(y_test_over, y_pred_over)
cm

In [None]:
# 3-fold cross-validation of the classifier on the training split; prints the
# mean and the standard deviation of the fold scores.
accuracies = cross_val_score(estimator = classifier, X = X_train_over, y = y_train_over, cv = 3)
print( accuracies.mean() )
print( accuracies.std() )

# Building a regressor for CARS values


In [None]:
# Importing the dataset

# Columns
# Age
# Education Levels (years)
# Minimental
# GDS
# CSE
# STAI-T
# STAI-E
# SUS
# Minimental - Result
# GDS - Result
# Aproved
# Task Completion
# Task End
# Task Time(sec)
# Events Number
# Nodes Number
# Eccentricity
# Incidentes Number
# Mean Degree
# MOUSE DOWN-UP
# Clicks Number
# DB Clicks Number
# MEAN CLICK DURATION (sec)
# MEAN PAUSE BEFORE CLICK (sec)
# MOUSE TOTAL DISTANCE (px)
# MOUSE MEAN DISTANCE (px)
# MOUSE MEAN VELOCITY (px/sec)
# MEAN STROKE LENGTH (px)
# MEAN STROKE DURATION (sec)
# MEAN STRAIGHTNESS
# KEYS
# TYPING VELOCITY (key/min)
# TOTAL TIME TYPING (sec)
# MEAN TIME TYPING (sec)
# DELETE
# BACKSPACE
# DELETE + BACKSPACE
# Gaze Total Distance (px)
# Gaze Mean Distance (px)
# Gaze Total Time (sec)
# Gaze Velocity (px/s)
# Mean Pupil Size (norm)
# Pupil standard deviation (norm)
# High Three Sigma
# Low three sigma
# High Outliers
# Low Outliers
# CARS


# Alternative feature selections, kept for reference:
# X = df.iloc[:, 1:47]
# X = X.drop( columns = ['Minimental - Result', 'GDS - Result', 'Aproved', 'Task Completion', 'Task End'] )

# X = df.iloc[:, 1:9]
# X = df.iloc[:, 21:25]
# Active selection: six columns chosen by position; the regression target is
# the CARS column.
X = df.iloc[:, [14, 21, 22, 23, 32, 33] ] # Columns that could be recorded in a mobile setting
y = df['CARS']


In [None]:
# Feature Scaling
sc = StandardScaler()

# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

# The scaler is fitted on the whole X (this section has no train/test split)
# and X is rebound to the transformed values.
X = sc.fit_transform(X)

In [None]:
# Finding good n_estimators
from sklearn.ensemble import RandomForestRegressor

# Doubling search over the forest size (10, 20, 40, ... while below 10000).
# Each candidate is fitted on X / y and scored on the same X / y, and the
# size with the lowest MSE is kept.
# NOTE(review): these are in-sample scores, so they do not measure how the
# model generalises.
i = 10
n_estimators = i
mse = 100**100  # huge starting value so the first candidate replaces it
r2 = 0.0
while i < 10000:
    regressor = RandomForestRegressor(n_estimators=i, random_state=0)
    regressor.fit(X, y)
    y_pred = regressor.predict(X)
    candidate_mse = mean_squared_error(y, y_pred)
    candidate_r2 = r2_score(y, y_pred)
    print('\t'.join(['R2: ' + str(candidate_r2),
                     'n_estimators = ' + str(i),
                     'MSE = ' + str(candidate_mse)]))
    if candidate_mse < mse:
        mse, n_estimators, r2 = candidate_mse, i, candidate_r2
    i *= 2

print(''.join(['--> Best n_estimators=', str(n_estimators),
               ' with MSE=', str(mse),
               ' and R2=', str(r2)]))

In [None]:
# Refit the random forest with the selected number of trees on the full X / y.
regressor = RandomForestRegressor(n_estimators = n_estimators, random_state = 0)
regressor.fit(X, y)

In [None]:
# Predictions on the data the model was fitted on, then the residuals
# (actual minus predicted), displayed as the cell output.
y_pred = regressor.predict(X)
y - y_pred

In [None]:
# Predicted vs. actual CARS for the random forest, coloured by absolute error.
padding = 10
y_pred = regressor.predict(X)  # predict once and reuse for limits and points
min_y = min(min(y), min(y_pred)) - padding
max_y = max(max(y), max(y_pred)) + padding
plt.figure(figsize=(20, 10))
# The colormap is passed by name: plt.cm.get_cmap() is deprecated and no
# longer available in recent Matplotlib releases.
plt.scatter(y, y_pred, c=abs(y_pred - y), alpha=1.0, cmap='RdYlGn_r')
# Identity line drawn from its two end points. range() only accepts integers
# and would fail if a bound came from a (float) prediction.
plt.plot([min_y, max_y], [min_y, max_y], color='green')
plt.xlim(min_y, max_y)
plt.ylim(min_y, max_y)
plt.title('Random Forest Regression')
plt.xlabel('CARS')
plt.ylabel('Prediction')
plt.show()

# Building a regressor after oversampling

In [None]:
# Features for the regressor: columns that could also be recorded in a
# mobile setting, selected by position. Intended columns, in order
# (presumably matching the positions below; verify against the worksheet):
#   Task Time(sec), Clicks Number, DB Clicks Number,
#   MEAN CLICK DURATION (sec), TYPING VELOCITY (key/min),
#   TOTAL TIME TYPING (sec)
mobile_feature_positions = [14, 21, 22, 23, 32, 33]
X = df.iloc[:, mobile_feature_positions]

# Target: the CARS column.
y = df['CARS']

In [None]:
# Histogram of the CARS target before resampling.
plt.hist( y )

In [None]:
# Number of target values before resampling.
print( len( y ) )

In [None]:
# Balance the CARS target by random oversampling.
# The sampler is a classification tool, so presumably every distinct CARS
# value acts as a class here (confirm this is the intent).
# 'not majority' raises every class except the largest to the majority count
# in one pass, replacing the original loop that re-ran 'minority' until the
# size stopped growing. random_state makes the drawn duplicates reproducible.
oversample = RandomOverSampler(sampling_strategy='not majority', random_state=0)
X_over, y_over = oversample.fit_resample(X, y)
size = len(y_over)
print(size)
# NOTE(review): a train/test split taken from X_over / y_over can place
# copies of the same row in both sets.

In [None]:
# Histogram of the CARS target after oversampling.
plt.hist( y_over )

In [None]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Split the ORIGINAL rows first and oversample only the training part.
# Splitting the already-oversampled X_over / y_over puts duplicates of the
# same row in both sets, which inflates every metric computed on the test set.
X_train_raw, X_test_over, y_train_raw, y_test_over = train_test_split(
    X, y, test_size=0.2, random_state=0)
# X_test_over / y_test_over keep their names for the following cells but now
# hold untouched rows only.
X_train_over, y_train_over = RandomOverSampler(
    sampling_strategy='not majority', random_state=0
).fit_resample(X_train_raw, y_train_raw)

In [None]:
# Feature Scaling
# The scaler is fitted on the training split only and then applied to the
# test split.
# NOTE(review): the model cells that follow fit and predict on
# X_train_over / X_test_over, so X_train / X_test appear to be unused; confirm.
sc = StandardScaler()

X_train = sc.fit_transform( X_train_over )
X_test = sc.transform( X_test_over )

In [None]:
# Finding good n_estimators
from sklearn.ensemble import RandomForestRegressor

# Doubling search over the forest size (10, 20, 40, ... while below 10000).
# Each candidate is fitted on the training split and scored on the test
# split, and the size with the lowest test MSE is kept.
# NOTE(review): the test split is used for model selection here, so scores
# later reported on it are optimistic.
i = 10
n_estimators = i
mse = 100**100  # huge starting value so the first candidate replaces it
r2 = 0.0
while i < 10000:
    regressor = RandomForestRegressor(n_estimators=i, random_state=0)
    regressor.fit(X_train_over, y_train_over)
    y_pred = regressor.predict(X_test_over)
    candidate_mse = mean_squared_error(y_test_over, y_pred)
    candidate_r2 = r2_score(y_test_over, y_pred)
    print('\t'.join(['R2: ' + str(candidate_r2),
                     'n_estimators = ' + str(i),
                     'MSE = ' + str(candidate_mse)]))
    if candidate_mse < mse:
        mse, n_estimators, r2 = candidate_mse, i, candidate_r2
    i *= 2

print(''.join(['--> Best n_estimators=', str(n_estimators),
               ' with MSE=', str(mse),
               ' and R2=', str(r2)]))

In [None]:
# Refit the random forest with the selected number of trees on the training
# split and predict the test split.
#regressor = RandomForestRegressor(n_estimators = i, random_state = 0)
#print('i = ' + str(i) + 'best i '+ str( n_estimators ) )
regressor = RandomForestRegressor(n_estimators = n_estimators, random_state = 0)
regressor.fit(X_train_over, y_train_over)
y_pred = regressor.predict(X_test_over)

In [None]:
# Absolute prediction error for every row of the test split.
abs( y_test_over - y_pred )

In [None]:
# Predicted vs. actual CARS for the random forest on the test split.
padding = 10
y_pred = regressor.predict(X_test_over)  # predict once, reuse below
min_y = min(min(y_test_over), min(y_pred)) - padding
max_y = max(max(y_test_over), max(y_pred)) + padding
plt.figure(figsize=(20, 10))
plt.scatter(y_test_over, y_pred, c='green')
# Identity line drawn from its two end points. The original built it with
# range(min_y, max_y), which raises TypeError whenever a bound comes from a
# (float) prediction instead of an integer target value.
plt.plot([min_y, max_y], [min_y, max_y], color='green')
plt.xlim(min_y, max_y)
plt.ylim(min_y, max_y)
plt.title('Random Forest Regression')
plt.xlabel('CARS')
plt.ylabel('Prediction')
plt.show()

# Building a Decision Tree

In [None]:
# Data preparation for the decision-tree classifier of the CLASS column.
X = df.iloc[:, [14, 21, 22, 23, 32, 33]]  # Columns that could be recorded in a mobile setting
y = df['CLASS']

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Split the ORIGINAL rows first and oversample only the training part.
# Oversampling before the split puts duplicates of the same row in both sets,
# which inflates every metric computed on the test set.
X_train_raw, X_test_over, y_train_raw, y_test_over = train_test_split(
    X, y, test_size=0.2, random_state=0)

# 'not majority' balances every class in a single pass (the original looped
# over 'minority' until the size stopped growing); random_state makes the
# drawn duplicates reproducible.
oversample = RandomOverSampler(sampling_strategy='not majority', random_state=0)
X_train_over, y_train_over = oversample.fit_resample(X_train_raw, y_train_raw)
print(len(y_train_over))

# Balanced version of the full data set, kept only so the names stay defined
# for any cell that inspects them; it is not used for training or testing.
X_over, y_over = oversample.fit_resample(X, y)

# Feature Scaling (fitted on the training rows only)
sc = StandardScaler()

X_train = sc.fit_transform(X_train_over)
X_test = sc.transform(X_test_over)

In [None]:
import pandas
from sklearn import tree
import pydotplus
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import matplotlib.image as pltimg

import os

# Make a Windows Graphviz install reachable for pydotplus. The directory is
# added only when it exists and is not on PATH yet, so re-running the cell
# does not keep growing PATH and other machines are left untouched.
_graphviz_bin = 'C:/Program Files/Graphviz/bin/'
_path_entries = os.environ.get("PATH", "").split(os.pathsep)
if os.path.isdir(_graphviz_bin) and _graphviz_bin not in _path_entries:
    os.environ["PATH"] = os.pathsep.join(_path_entries + [_graphviz_bin])

# random_state fixes the tie-breaking between equally good splits, so the
# fitted tree (and the scores below) are the same on every run.
dtree = DecisionTreeClassifier(random_state=0)
dtree = dtree.fit(X_train_over, y_train_over)

# Mean accuracy on the test split, the training split and the original data.
score1 = dtree.score(X_test_over, y_test_over)
score2 = dtree.score(X_train_over, y_train_over)
score3 = dtree.score(X, y)

print("Score 1 ", score1)
print("Score 2 ", score2)
print("Score 3 ", score3)


#new_series = pd.Series(y_pred2)
#print(new_series)


#abs( y_test_over - y_pred2 )



In [None]:
# Render the fitted decision tree with Graphviz, save it as PNG and show it.
# feature_names is passed as a plain list: some scikit-learn releases
# validate this argument as a list and reject a pandas Index.
col_names = list(X.columns)

data = tree.export_graphviz(dtree, out_file=None, feature_names=col_names,
                            filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(data)
graph.set_size('"20,20!"')
# Both files are written after set_size, as in the original cell.
graph.write_png('decisionTree.png')
graph.write_png('resized_tree.png')

# Show the saved image inline with matplotlib.
img = pltimg.imread('resized_tree.png')
imgplot = plt.imshow(img)
plt.show()

# Also display the graph through the graphviz package (cell output).
import graphviz
gvz_graph = graphviz.Source(graph.to_string())
gvz_graph

In [None]:
# Predicted vs. actual label for the decision tree on the test split.
# NOTE(review): the arithmetic on min/max assumes numeric class labels; confirm.
yp = dtree.predict(X_test_over)
padding = 10
min_y = min(min(y_test_over), min(yp)) - padding
max_y = max(max(y_test_over), max(yp)) + padding
plt.figure(figsize=(20, 10))
# Plot the tree's own predictions. The original scattered
# lreg.predict(X_test_over), but lreg is only created in a later cell
# (NameError on a top-to-bottom run) and is a different model.
plt.scatter(y_test_over, yp, c='green')
# Identity line drawn from its two end points (no integer-only range()).
plt.plot([min_y, max_y], [min_y, max_y], color='green')
plt.xlim(min_y, max_y)
plt.ylim(min_y, max_y)
# Title and x label now name what is plotted: the tree is fitted on CLASS.
plt.title('Decision Tree')
plt.xlabel('CLASS')
plt.ylabel('Prediction')
plt.show()

# Building a Linear Regression

In [None]:
# Create the linear regressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# NOTE(review): when the cells run top to bottom, X_train_over / y_train_over
# come from the decision-tree preparation cell, whose target is df['CLASS'],
# not df['CARS']; confirm which target this regression is meant to use.
lreg = LinearRegression()
lreg.fit(X_train_over, y_train_over)

# Make predictions using the testing set
y_pred = lreg.predict(X_test_over)

# Test-split metrics, computed once and reported below.
test_mse = mean_squared_error(y_test_over, y_pred)
test_r2 = r2_score(y_test_over, y_pred)

# The coefficients
print('Coefficients: \n', lreg.coef_)
# The mean squared error
print('Mean squared error: %.2f' % test_mse)
# The coefficient of determination: 1 is perfect prediction
print('Coefficient of determination: %.2f' % test_r2)

# R^2 of the model on the data it was fitted on.
scor = lreg.score(X_train_over, y_train_over)
print("score ", scor)
#yp = lreg.predict(X_test_over)
#print("yp ",yp )


#abs( y_test_over - yp )


In [None]:
# Predicted vs. actual values for the linear regression on the test split.
padding = 10
# Use the linear model's own predictions for both the points and the axis
# limits. The original took the limits from `yp`, which holds the
# decision-tree predictions of an earlier cell.
y_pred = lreg.predict(X_test_over)
min_y = min(min(y_test_over), min(y_pred)) - padding
max_y = max(max(y_test_over), max(y_pred)) + padding
plt.figure(figsize=(20, 10))
plt.scatter(y_test_over, y_pred, c='green')
# Identity line drawn from its two end points: range() only accepts integers
# and the regression predictions are floats.
plt.plot([min_y, max_y], [min_y, max_y], color='green')
plt.xlim(min_y, max_y)
plt.ylim(min_y, max_y)
plt.title('Linear Regression')
plt.xlabel('CARS')
plt.ylabel('Prediction')
plt.show()

   # Tree Classifier

In [None]:
from sklearn import metrics 
# Seeded decision tree fitted on the training split.
clf = DecisionTreeClassifier(max_depth=None, min_samples_split=2, random_state=0)
clf.fit(X_train_over, y_train_over) 


# Accuracy on the test split, displayed as the cell output.
test_pred_decision_tree = clf.predict(X_test_over)
metrics.accuracy_score(y_test_over, test_pred_decision_tree)

#y_pred = clf.predict(X_test_over)
#abs( y_test_over - yp )

In [None]:
from sklearn.ensemble import RandomForestClassifier
#print(X)
#print(y)
# Seeded random forest with trees limited to depth 2, fitted on the training
# split. `clf` is rebound from the decision tree of the previous cell.
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train_over, y_train_over) 


# Accuracy on the test split, displayed as the cell output. The variable keeps
# its "decision_tree" name although it holds the forest's predictions.
test_pred_decision_tree = clf.predict(X_test_over)
metrics.accuracy_score(y_test_over, test_pred_decision_tree)

#y_pred = clf.predict(X_test_over)
#abs( y_test_over - yp )