In [32]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# Load and preprocess the data
data = pd.read_csv('Customer-survey-data.csv')

# Convert 'yes'/'no' in 'Was your order accurate?' column to binary 0/1
encoder = LabelEncoder()
data['Was your order accurate? Please respond yes or no.'] = encoder.fit_transform(
    data['Was your order accurate? Please respond yes or no.']
)

# Handle missing values by imputing with the mean
imputer = SimpleImputer(strategy='mean')
X = data.drop(['Customer', 'Was your order accurate? Please respond yes or no.'], axis=1)
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Define the target variable as binary
y = data['Was your order accurate? Please respond yes or no.'].apply(lambda x: 1 if x == 1 else 0)

# Feature Engineering: Select top features and add interaction terms
top_features = [
    'How satisfied were you with your overall delivery experience at Ali?                    1-5 where 1 = extremely dissatisfied and 5 = extremely satisfied',
    'How satisfied were you with the quality of the food at Alis?                             1-5 where 1 = extremely dissatisfied and 5 = extremely satisfied',
    'How satisfied were you with the speed of delivery at Alis?                                1-5 where 1 = extremely dissatisfied and 5 = extremely satisfied'
]
X = X[top_features]
X['delivery_quality_interaction'] = X[top_features[0]] * X[top_features[1]]
X['delivery_speed_interaction'] = X[top_features[0]] * X[top_features[2]]
X['quality_speed_interaction'] = X[top_features[1]] * X[top_features[2]]

# Standardize the data
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Apply SMOTE for balancing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Logistic Regression Model with Cross-Validation
log_reg_model = LogisticRegression(class_weight='balanced', max_iter=1000)
stratified_kfold = StratifiedKFold(n_splits=5)

# Evaluate with cross-validation
cv_scores = cross_val_score(log_reg_model, X_train_smote, y_train_smote, cv=stratified_kfold, scoring='accuracy')

# Display cross-validation results
print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean CV Accuracy:", cv_scores.mean())
print("Standard Deviation of CV Accuracy:", cv_scores.std())


Cross-Validation Accuracy Scores: [0.49473684 0.50676692 0.49648947 0.49047141 0.50651956]
Mean CV Accuracy: 0.49899684014449364
Standard Deviation of CV Accuracy: 0.006543436041434585
