# Conduct Linear Regression Analysis on Identification Accuracy

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the manually cleaned survey responses from the local CSV file.
# References:
# https://neptune.ai/blog/google-colab-dealing-with-files
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
df = pd.read_csv('survey_cleaned.csv')
# Preview only the first rows instead of rendering the whole frame; the full
# dump bloats the notebook and exposes every respondent row in the saved output.
df.head()

In [None]:
# Drop the header-less first CSV column that pandas labels 'Unnamed: 0'
# (presumably the row index saved with the file -- verify).
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Show one row of the cleaned dataframe so it can be compared by hand against
# the same row in the original file, as a check that the cleaning steps were
# applied correctly.
# Reference for widening the pandas display limits:
# https://stackoverflow.com/questions/11707586/how-do-i-expand-the-output-display-to-see-more-columns-of-a-pandas-dataframe
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)
df.iloc[46]

In [None]:
# Append a constant column of ones that acts as the regression intercept.
# Despite its name, 'Coefficient' is the constant regressor itself, not a
# fitted coefficient.
# Reference:
# https://stackoverflow.com/questions/29517072/add-column-to-dataframe-with-constant-value
df = df.assign(Coefficient=1)

In [None]:
# List the available column labels (includes the constant column added above).
df.columns

Index(['IP_ Address', 'Duration', 'Colours_Painting1', 'Colours_Painting2',
       'Colours_Painting3', 'Colours_Painting4', 'Colours_Painting5',
       'Colours_Painting6', 'Colours_Painting7', 'Colours_Painting8',
       'Colours_Painting9', 'Colours_Painting10', 'Brushstrokes_Painting1',
       'Brushstrokes_Painting2', 'Brushstrokes_Painting3',
       'Brushstrokes_Painting4', 'Brushstrokes_Painting5',
       'Brushstrokes_Painting6', 'Brushstrokes_Painting7',
       'Brushstrokes_Painting8', 'Brushstrokes_Painting9',
       'Brushstrokes_Painting10', 'Structures_Painting1',
       'Structures_Painting2', 'Structures_Painting3', 'Structures_Painting4',
       'Structures_Painting5', 'Structures_Painting6', 'Structures_Painting7',
       'Structures_Painting8', 'Structures_Painting9', 'Structures_Painting10',
       'Details_Painting1', 'Details_Painting2', 'Details_Painting3',
       'Details_Painting4', 'Details_Painting5', 'Details_Painting6',
       'Details_Painting7', 'Details

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation. Rows are shuffled before splitting,
# and the fixed random_state makes the split reproducible across runs.
train, test = train_test_split(
    df,
    test_size=0.3,
    shuffle=True,
    random_state=0,
)

In [None]:
# Fit an ordinary least squares model of Accuracy on the survey predictors
# using the training split, then print the regression summary.
# References for interpreting the model summary:
# https://medium.com/swlh/interpreting-linear-regression-through-statsmodels-summary-4796d359035a
# https://datatofish.com/multiple-linear-regression-python/
# https://dev.to/alod83/3-different-approaches-for-traintest-splitting-of-a-pandas-dataframe-31p0
# https://www.statology.org/sklearn-linear-regression-summary/
import statsmodels.api as sm

# 'Coefficient' is the constant column of ones; sm.OLS does not add an
# intercept on its own, so it has to be listed explicitly.
# NOTE(review): the printed summary reports Df Residuals = 94 for 105
# observations, i.e. this 12-column design matrix only has rank 11. Presumably
# the three Condition_* indicators sum to the constant column (dummy-variable
# trap) -- verify. If so, the individual coefficients and p-values of the
# intercept and the condition indicators are not identified; consider dropping
# one indicator (e.g. Condition_Control) as the reference group, here AND in
# the test design matrix.
predictor_columns = [
    'Duration', 'Coefficient',
    'Identification_Colours', 'Identification_Brushstrokes',
    'Identification_Structures', 'Identification_Details',
    'Condition_Control', 'Condition_Treatment1', 'Condition_Treatment2',
    'Monthly_Art_Events', 'Monthly_Hours_Digital_Arts',
    'Artistic_Specialty_Self_Rating',
]
X_train = train[predictor_columns]
Y_train = train[['Accuracy']]

# Fit the linear regression model
model = sm.OLS(endog=Y_train, exog=X_train).fit()

# View the model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               Accuracy   R-squared:                       0.150
Model:                            OLS   Adj. R-squared:                  0.060
Method:                 Least Squares   F-statistic:                     1.659
Date:                Wed, 22 Nov 2023   Prob (F-statistic):              0.102
Time:                        01:03:28   Log-Likelihood:                 54.326
No. Observations:                 105   AIC:                            -86.65
Df Residuals:                      94   BIC:                            -57.46
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Duration    

In [None]:
# Extract the p-value of each regressor from the fitted OLS results
# Reference:
# https://www.statology.org/statsmodels-linear-regression-p-value/
model.pvalues

Duration                          7.563878e-02
Coefficient                       3.855223e-11
Identification_Colours            4.567299e-01
Identification_Brushstrokes       1.555894e-01
Identification_Structures         5.868610e-01
Identification_Details            1.403065e-01
Condition_Control                 7.968128e-04
Condition_Treatment1              1.511187e-04
Condition_Treatment2              2.359036e-08
Monthly_Art_Events                2.949878e-01
Monthly_Hours_Digital_Arts        1.621839e-01
Artistic_Specialty_Self_Rating    3.020268e-01
dtype: float64

In [None]:
# Build the test design matrix from exactly the columns (and column order) the
# model was fitted on, instead of repeating the feature list by hand; a
# hand-copied list can silently drift from the training columns.
X_test = test[X_train.columns]
Y_test = test[['Accuracy']]
# Predict accuracy for the held-out rows with the fitted OLS model.
Y_pred = model.predict(X_test)

In [None]:
# Convert the test targets to a flat 1-D NumPy array.
# Y_test is a single-column DataFrame, so a plain conversion has shape (n, 1);
# subtracting the 1-D predictions from that broadcasts to an (n, n) matrix and
# corrupts every error metric. ravel() yields shape (n,), and np.asarray also
# accepts an ndarray, so this cell is safe to re-run.
Y_test = np.asarray(Y_test).ravel()

In [None]:
# Evaluate on the testing dataset after prediction
# Calculate MSE, RMSE and MAE
# References:
# https://www.statology.org/mean-squared-error-python/
# https://datagy.io/mae-python/
def _flat_pair(actual, pred):
    """Return both inputs as flat float arrays that can be compared element-wise.

    Flattening matters because a column vector of shape (n, 1) minus a vector
    of shape (n,) broadcasts to an (n, n) matrix, which would make every metric
    average over all n*n (actual_i, pred_j) pairs instead of the n matched pairs.

    Raises:
        ValueError: if the inputs hold different numbers of values and neither
            is a single value (a single value is broadcast against the other).
    """
    actual = np.asarray(actual, dtype=float).ravel()
    pred = np.asarray(pred, dtype=float).ravel()
    if actual.size != pred.size and 1 not in (actual.size, pred.size):
        raise ValueError(
            "actual and pred must hold the same number of values, got "
            f"{actual.size} and {pred.size}"
        )
    return actual, pred


def mse(actual, pred):
    """Mean squared error between matched actual and predicted values.

    Args:
        actual: array-like of observed values (any shape; it is flattened).
        pred: array-like of predicted values (any shape; it is flattened).

    Returns:
        The mean of the squared element-wise differences.
    """
    actual, pred = _flat_pair(actual, pred)
    return np.square(actual - pred).mean()


def rmse(actual, pred):
    """Root mean squared error: the square root of ``mse(actual, pred)``."""
    return np.sqrt(mse(actual, pred))


def mae(y_true, predictions):
    """Mean absolute error between matched actual and predicted values.

    Args:
        y_true: array-like of observed values (any shape; it is flattened).
        predictions: array-like of predicted values (any shape; it is flattened).

    Returns:
        The mean of the absolute element-wise differences.
    """
    y_true, predictions = _flat_pair(y_true, predictions)
    return np.mean(np.abs(y_true - predictions))

In [None]:
# Mean squared error of the model predictions on the held-out test rows
mse(Y_test, Y_pred)

0.02705323803239297

In [None]:
# Root mean squared error of the model predictions on the held-out test rows
rmse(Y_test, Y_pred)

0.16447868564769408

In [None]:
# Mean absolute error of the model predictions on the held-out test rows
mae(Y_test, Y_pred)

0.12687945067138673

In [None]:
# Mean accuracy over the rows whose Condition_Treatment1 indicator equals 1
# (computed on the full dataframe, not only the training split).
# Reference:
# https://www.statology.org/conditional-mean-pandas/
in_treatment1 = df['Condition_Treatment1'].eq(1)
df.loc[in_treatment1, 'Accuracy'].mean()

0.46249999999999997

In [None]:
# Mean accuracy over the rows whose Condition_Treatment2 indicator equals 1
in_treatment2 = df['Condition_Treatment2'].eq(1)
df.loc[in_treatment2, 'Accuracy'].mean()

0.51

In [None]:
# Mean accuracy over the rows whose Condition_Control indicator equals 1
in_control = df['Condition_Control'].eq(1)
df.loc[in_control, 'Accuracy'].mean()

0.4576923076923076