In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
# logistic regression
from sklearn.linear_model import LogisticRegression
# support vector machine
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# classification report
from sklearn.metrics import classification_report


In [2]:
df = pd.read_csv('SharedResponsesSurvey_sample.csv')
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19464 entries, 0 to 23732
Data columns (total 28 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ResponseID               19464 non-null  object 
 1   ExtendedSessionID        19464 non-null  object 
 2   UserID                   19464 non-null  float64
 3   ScenarioOrder            19464 non-null  int64  
 4   Intervention             19464 non-null  int64  
 5   PedPed                   19464 non-null  int64  
 6   Barrier                  19464 non-null  int64  
 7   CrossingSignal           19464 non-null  int64  
 8   AttributeLevel           19464 non-null  object 
 9   ScenarioTypeStrict       19464 non-null  object 
 10  ScenarioType             19464 non-null  object 
 11  DefaultChoice            19464 non-null  object 
 12  NonDefaultChoice         19464 non-null  object 
 13  DefaultChoiceIsOmission  19464 non-null  float64
 14  NumberOfCharacters       19

In [3]:
# remove the first three columns and the last column
df_x = df.iloc[:, 3:-1]

In [4]:
# convert string to float
df_x = df_x.apply(lambda x: pd.to_numeric(x, errors='coerce'))

# deal with inf and -inf
df_x = df_x.replace([float('inf'), float('-inf')], float('nan'))

# deal with too large values
df_x = df_x.apply(lambda x: x.clip(lower=-1e10, upper=1e10))


In [6]:
df_x = df_x.dropna(axis=1)

In [5]:
# find the columns with strings
str_columns = df_x.select_dtypes(include=['object']).columns
str_columns

Index(['AttributeLevel', 'ScenarioTypeStrict', 'ScenarioType', 'DefaultChoice',
       'NonDefaultChoice', 'Template', 'UserCountry3', 'Review_education',
       'Review_gender', 'Review_income'],
      dtype='object')

In [8]:
# train machine learning model
# Saved column is the target column
df_y = df['Saved']
# drop the target column
df_x = df_x.drop('Saved', axis=1)

# split the data into training and testing data
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=0)

# random forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)
rf_predict = rf.predict(x_test)
rf_accuracy = accuracy_score(y_test, rf_predict)
print('Random Forest Accuracy: ', rf_accuracy)
print(classification_report(y_test, rf_predict))


Random Forest Accuracy:  0.6791677369637812
              precision    recall  f1-score   support

           0       0.68      0.68      0.68      1934
           1       0.68      0.68      0.68      1959

    accuracy                           0.68      3893
   macro avg       0.68      0.68      0.68      3893
weighted avg       0.68      0.68      0.68      3893



In [9]:
# try logistic regression
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_predict = lr.predict(x_test)
lr_accuracy = accuracy_score(y_test, lr_predict)
print('Logistic Regression Accuracy: ', lr_accuracy)
print(classification_report(y_test, lr_predict))

Logistic Regression Accuracy:  0.5032108913434369
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1934
           1       0.50      1.00      0.67      1959

    accuracy                           0.50      3893
   macro avg       0.25      0.50      0.33      3893
weighted avg       0.25      0.50      0.34      3893



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# try support vector machine
svm = SVC()
svm.fit(x_train, y_train)
svm_predict = svm.predict(x_test)
svm_accuracy = accuracy_score(y_test, svm_predict)
print('Support Vector Machine Accuracy: ', svm_accuracy)
print(classification_report(y_test, svm_predict))

Support Vector Machine Accuracy:  0.5032108913434369
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1934
           1       0.50      1.00      0.67      1959

    accuracy                           0.50      3893
   macro avg       0.25      0.50      0.33      3893
weighted avg       0.25      0.50      0.34      3893



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# confusion matrix
from sklearn.metrics import confusion_matrix



# try random forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)
rf_predict = rf.predict(x_test)

rf_accuracy = accuracy_score(y_test, rf_predict)
print('Random Forest Accuracy: ', rf_accuracy)
print(classification_report(y_test, rf_predict))

# confusion matrix
confusion_matrix(y_test, rf_predict)


Random Forest Accuracy:  0.6750577960441818
              precision    recall  f1-score   support

           0       0.67      0.68      0.67      1934
           1       0.68      0.67      0.68      1959

    accuracy                           0.68      3893
   macro avg       0.68      0.68      0.68      3893
weighted avg       0.68      0.68      0.68      3893



array([[1306,  628],
       [ 637, 1322]], dtype=int64)

In [None]:
# shap analysis

In [13]:
# decision tree
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt_predict = dt.predict(x_test)
dt_accuracy = accuracy_score(y_test, dt_predict)
print('Decision Tree Accuracy: ', dt_accuracy)
print(classification_report(y_test, dt_predict))

Decision Tree Accuracy:  0.6010788594913948
              precision    recall  f1-score   support

           0       0.60      0.59      0.60      1934
           1       0.60      0.61      0.61      1959

    accuracy                           0.60      3893
   macro avg       0.60      0.60      0.60      3893
weighted avg       0.60      0.60      0.60      3893

