In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Read the training and test data
train_data = pd.read_csv('YourCabs_training.csv')
test_data = pd.read_csv('YourCabs_score.csv')

# Concatenate the training and test data
data = pd.concat([train_data, test_data], ignore_index=True)

# Convert date columns to datetime
date_columns = ['from_date', 'to_date', 'booking_created']
data[date_columns] = data[date_columns].apply(pd.to_datetime)

# Impute missing values with the most frequent strategy
imputer = SimpleImputer(strategy='most_frequent')
columns_to_impute = ['from_area_id', 'to_area_id', 'from_city_id', 'to_city_id']
data[columns_to_impute] = imputer.fit_transform(data[columns_to_impute])

# Fill missing values in 'package_id' with 0
data['package_id'].fillna(0, inplace=True)

# Encode categorical columns using label encoding
categorical_cols = ['vehicle_model_id', 'travel_type_id', 'from_area_id', 'to_area_id', 'from_city_id', 'to_city_id']
label_encoder = LabelEncoder()
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col].astype(str))

# Split the data back into train and test sets
train_data = data[~data['Car_Cancellation'].isnull()]
test_data = data[data['Car_Cancellation'].isnull()]

# Define features and target variable
features = ['vehicle_model_id', 'travel_type_id', 'from_area_id', 'to_area_id', 'from_city_id', 'to_city_id', 'online_booking', 'mobile_site_booking']
target = 'Car_Cancellation'
X_train = train_data[features]
y_train = train_data[target]

# Split the training data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train a neural network classifier
nn_classifier = MLPClassifier(random_state=42)
nn_classifier.fit(X_train, y_train)
nn_predictions = nn_classifier.predict(X_val)
nn_accuracy = accuracy_score(y_val, nn_predictions)
print("Neural Network Accuracy:", nn_accuracy)

# Train a Logistic Regression classifier
lr_classifier = LogisticRegression(random_state=42)
lr_classifier.fit(X_train, y_train)
lr_predictions = lr_classifier.predict(X_val)
lr_accuracy = accuracy_score(y_val, lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)

# Evaluate the Logistic Regression classifier
cm = confusion_matrix(y_val, lr_predictions)
print("Confusion Matrix:")
print(cm)
report = classification_report(y_val, lr_predictions)
print("Classification Report:")
print(report)


  mode = stats.mode(array)


Neural Network Accuracy: 0.914354783009094
Logistic Regression Accuracy: 0.9269022677564176
Confusion Matrix:
[[8052    0]
 [ 635    0]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96      8052
         1.0       0.00      0.00      0.00       635

    accuracy                           0.93      8687
   macro avg       0.46      0.50      0.48      8687
weighted avg       0.86      0.93      0.89      8687



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
