In [1]:
# Import required libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Conv1D, Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import MaxPooling1D, Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import set_random_seed

In [2]:
# Load the churn dataset from a CSV file (path is relative to the working directory)
file_path = 'churnnotclean.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Derive two new columns: Duration and Surge_interation

# Parse both date columns; values that cannot be parsed become NaT
last_trip_ts = pd.to_datetime(df['last_trip_date'], errors='coerce')
signup_ts = pd.to_datetime(df['signup_date'], errors='coerce')
df['last_trip_date'] = last_trip_ts
df['signup_date'] = signup_ts

# Duration: whole days between signup_date and last_trip_date
df['Duration'] = last_trip_ts.sub(signup_ts).dt.days

# Surge_interation: element-wise product of the avg_surge and surge_pct columns
df['Surge_interation'] = df['avg_surge'].mul(df['surge_pct'])

  df['last_trip_date'] = pd.to_datetime(df['last_trip_date'], errors='coerce')
  df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')


In [4]:
# Split the columns to clean into numeric and categorical groups
numeric_columns = ['avg_rating_by_driver', 'avg_dist', 'avg_surge', 'surge_pct', 'trips_in_first_30_days', 'weekday_pct']  # numeric columns (adjust to the actual column names)
category_columns = ['city', 'phone', 'luxury_car_user']  # categorical columns (adjust to the actual column names)

# NOTE(review): the fill statistics below are computed on the full dataset, i.e. before
# the train/test split, so held-out rows influence the imputed values — confirm this
# is acceptable for the evaluation.

# Numeric columns: coerce to numbers (unparseable values become NaN), then fill with the mean.
# The result is assigned back rather than calling fillna(inplace=True) on df[column]:
# an in-place method on a column selection is chained assignment, which recent pandas
# warns about and which leaves df unchanged once Copy-on-Write is in effect.
for column in numeric_columns:
    numeric_values = pd.to_numeric(df[column], errors='coerce')
    df[column] = numeric_values.fillna(numeric_values.mean())

# Categorical columns: fill missing values with the most frequent value (the mode)
for column in category_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

# Label-encode the categorical columns (values -> integer codes).
# A separate encoder is fitted per column and kept in `label_encoders`, so every
# column's mapping stays available (e.g. for inverse_transform). `label_encoder` is
# still bound afterwards and, as before, refers to the encoder fitted on the last column.
label_encoders = {}
for column in category_columns:
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [5]:
# Keep only the columns used for modelling (inputs plus the 'churn' column);
# the order of this list is the column order of the resulting frame
selected_columns = [
    'avg_rating_by_driver',
    'city',
    'phone',
    'luxury_car_user',
    'surge_pct',
    'churn',
    'avg_dist',
    'weekday_pct',
    'Duration',
    'Surge_interation',
]
df_filtered = df[selected_columns]

In [6]:
# NOTE(review): everything in this cell is commented out. It looks superseded by the
# imputation and label encoding applied to `df` in an earlier cell — confirm and delete.
# # Handle missing values
# df_filtered.fillna(method='ffill', inplace=True)

# # Encode categorical variables
# label_encoder = LabelEncoder()
# df_filtered['city'] = label_encoder.fit_transform(df_filtered['city'])
# df_filtered['phone'] = label_encoder.fit_transform(df_filtered['phone'])


In [7]:
# Separate the target vector y ('churn') from the feature matrix X (all other columns)
y = df_filtered['churn']
X = df_filtered.drop('churn', axis=1)

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: fit the scaler on the training rows only,
# then apply that same fitted transform to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Append a trailing axis of length 1, turning (samples, features) into the 3-D
# (samples, features, 1) layout that the Conv1D model is built for
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)


In [8]:
# Build the CNN model

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# training-time shuffling are repeatable from run to run (some GPU kernels may
# still be non-deterministic).
set_random_seed(42)

# Each sample is a (n_features, 1) sequence: one channel per feature position
n_features = X_train.shape[1]

model = Sequential([
    # Declare the input shape with an explicit Input layer. Passing input_shape= to
    # the first layer is discouraged in current Keras and emits a UserWarning; the
    # stray warning line captured under this cell appears to be that warning.
    Input(shape=(n_features, 1)),
    # 32 filters, each spanning 2 adjacent feature positions
    Conv1D(32, kernel_size=2, activation='relu'),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    # Single sigmoid unit: output in (0, 1), used as the predicted probability
    Dense(1, activation='sigmoid'),
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
# Compile the model: Adam optimizer, binary cross-entropy loss, accuracy as the metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train on the training split for 10 epochs with a batch size of 32
model.fit(x=X_train, y=y_train, batch_size=32, epochs=10, verbose=1)

# Predict on the test split: flatten the model output to 1-D,
# then label a sample 1 when its predicted value exceeds 0.5
test_probabilities = model.predict(X_test)
y_pred_prob = test_probabilities.reshape(-1)
y_pred_class = np.greater(y_pred_prob, 0.5).astype(int)

# Evaluate: accuracy and the classification report use the thresholded labels,
# ROC-AUC uses the raw predicted values
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_class)
classification_rep = classification_report(y_true=y_test, y_pred=y_pred_class)
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)

Epoch 1/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step - accuracy: 0.7982 - loss: 0.4420
Epoch 2/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8354 - loss: 0.3701
Epoch 3/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8419 - loss: 0.3572
Epoch 4/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8470 - loss: 0.3480
Epoch 5/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8449 - loss: 0.3519
Epoch 6/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8486 - loss: 0.3443
Epoch 7/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8463 - loss: 0.3462
Epoch 8/10
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.8486 - loss: 0.3408
Epoch 9/10
[1m1250/1250

In [10]:
 # Print the evaluation results
print(f'Accuracy: {accuracy:.4f}')
# The heading and the report are separated by a real newline. The previous '\\n'
# literal printed the two characters "\n" verbatim, and print's default separator
# pushed the report's first line one space to the right.
print('Classification Report:', classification_rep, sep='\n')
print(f'ROC-AUC: {roc_auc:.4f}')

Accuracy: 0.8466
Classification Report:\n               precision    recall  f1-score   support

           0       0.91      0.75      0.82      4720
           1       0.81      0.93      0.86      5280

    accuracy                           0.85     10000
   macro avg       0.86      0.84      0.84     10000
weighted avg       0.85      0.85      0.84     10000

ROC-AUC: 0.9091
