In [13]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [4]:
car_data=pd.read_csv('car.csv')

In [5]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(car_data, test_size=0.2, random_state=42, stratify=car_data['Car_Acceptability'])

# Display the size of the training and testing sets
train_size = train_data.shape[0]
test_size = test_data.shape[0]

train_size, test_size

(1382, 346)

In [7]:
# Separate features (X) and target (y) for both training and testing sets
X_train = train_data.drop(columns=['Car_Acceptability'])
y_train = train_data['Car_Acceptability']
X_test = test_data.drop(columns=['Car_Acceptability'])
y_test = test_data['Car_Acceptability']

In [9]:
# Define categorical and ordinal columns (excluding the target column)
categorical_cols = ['Person_Capacity', 'Size_of_Luggage']
ordinal_cols = ['Buying_Price', 'Maintenance_Price', 'No_of_Doors', 'Safety']

# Define the ordering for ordinal encoding
ordinal_categories = [
    ['low', 'med', 'high', 'vhigh'],  # Buying_Price
    ['low', 'med', 'high', 'vhigh'],  # Maintenance_Price
    ['2', '3', '4', '5more'],         # No_of_Doors
    ['low', 'med', 'high']            # Safety
]

# Preprocessing pipelines for both categorical and ordinal columns
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_cols),
        ('ord', Pipeline([
            ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
            ('scaler', StandardScaler())
        ]), ordinal_cols)
    ],
    remainder='passthrough'
)

# Fit and transform the training features
X_train_processed = preprocessor.fit_transform(X_train)

# Transform the testing features
X_test_processed = preprocessor.transform(X_test)

X_train_processed.shape, X_test_processed.shape

((1382, 10), (346, 10))

In [12]:
X_train_processed

array([[ 0.        ,  1.        ,  0.        , ..., -0.41700877,
        -0.42542184, -0.0043986 ],
       [ 1.        ,  0.        ,  0.        , ...,  0.48206474,
        -0.42542184, -0.0043986 ],
       [ 0.        ,  0.        ,  1.        , ...,  0.48206474,
         1.35349793, -0.0043986 ],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.38113826,
        -1.31488173,  1.21137419],
       [ 1.        ,  0.        ,  0.        , ..., -1.31608229,
         0.46403805, -0.0043986 ],
       [ 0.        ,  1.        ,  0.        , ..., -1.31608229,
         0.46403805, -0.0043986 ]])

In [14]:
# Define models to train
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42)
}

In [15]:
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

# Function to evaluate models using cross-validation, accuracy, and precision
def evaluate_model_with_metrics(model, X_train, y_train, X_test, y_test):
    # Cross-validation for accuracy
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{model.__class__.__name__} CV Accuracy: {scores.mean():.4f} (+/- {scores.std():.4f})")

    # Train the model
    model.fit(X_train, y_train)

    # Predict on test set
    y_test_pred = model.predict(X_test)

    # Calculate metrics
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_precision = precision_score(y_test, y_test_pred, average='weighted', zero_division=0)

    print(f"{model.__class__.__name__} Test Accuracy: {test_accuracy:.4f}")
    print(f"{model.__class__.__name__} Test Precision: {test_precision:.4f}\n")

    return test_accuracy, test_precision



In [16]:
# Train and evaluate each model
model_performance = {}

for name, model in models.items():
    print(f"Training and evaluating {name}...")
    test_accuracy, test_precision = evaluate_model_with_metrics(model, X_train_processed, y_train, X_test_processed, y_test)

    # Store results
    model_performance[name] = {'Accuracy': test_accuracy, 'Precision': test_precision}

model_performance

Training and evaluating Logistic Regression...
LogisticRegression CV Accuracy: 0.8669 (+/- 0.0106)
LogisticRegression Test Accuracy: 0.8728
LogisticRegression Test Precision: 0.8678

Training and evaluating Decision Tree...
DecisionTreeClassifier CV Accuracy: 0.9768 (+/- 0.0109)
DecisionTreeClassifier Test Accuracy: 0.9913
DecisionTreeClassifier Test Precision: 0.9916

Training and evaluating Random Forest...
RandomForestClassifier CV Accuracy: 0.9638 (+/- 0.0138)
RandomForestClassifier Test Accuracy: 0.9942
RandomForestClassifier Test Precision: 0.9944

Training and evaluating Gradient Boosting...
GradientBoostingClassifier CV Accuracy: 0.9891 (+/- 0.0023)
GradientBoostingClassifier Test Accuracy: 0.9855
GradientBoostingClassifier Test Precision: 0.9864

Training and evaluating Support Vector Machine...
SVC CV Accuracy: 0.9602 (+/- 0.0033)
SVC Test Accuracy: 0.9711
SVC Test Precision: 0.9727



{'Logistic Regression': {'Accuracy': 0.8728323699421965,
  'Precision': 0.8677694901404769},
 'Decision Tree': {'Accuracy': 0.9913294797687862,
  'Precision': 0.9915962650066696},
 'Random Forest': {'Accuracy': 0.9942196531791907,
  'Precision': 0.9943659910733885},
 'Gradient Boosting': {'Accuracy': 0.9855491329479769,
  'Precision': 0.9864302833779783},
 'Support Vector Machine': {'Accuracy': 0.9710982658959537,
  'Precision': 0.9726971348280392}}

In [17]:
import joblib

# Identify the best model based on test accuracy
best_model_name = max(model_performance, key=lambda name: model_performance[name]['Accuracy'])
best_model = models[best_model_name]

# Train the best model on the entire training set
best_model.fit(X_train_processed, y_train)

# Save the best model to a file
model_filename = f"best_model_{best_model_name.replace(' ', '_').lower()}.pkl"
joblib.dump(best_model, model_filename)

model_filename

'best_model_random_forest.pkl'