# Goal
### To create calssifications models preedicting the car type customer likely to buy based on features like price range, type, car brand, cylinders etc. And compare the results and performance of various classification models.
Models used:
1. Logistic Regression
2. XGBoost
3. and K-Nearest Neighbors

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv(r"C:\Users\varun\Desktop\b tech\sem 7\DVT\preprocessed_cars_sale_data.csv")

print(df.head(5))



    Make           Model   Type Origin DriveTrain     MSRP  Invoice  \
0  Acura             MDX    SUV   Asia        All  36945.0  33337.0   
1  Acura  RSX Type S 2dr  Sedan   Asia      Front  23820.0  21761.0   
2  Acura         TSX 4dr  Sedan   Asia      Front  26990.0  24647.0   
3  Acura          TL 4dr  Sedan   Asia      Front  33195.0  30299.0   
4  Acura      3.5 RL 4dr  Sedan   Asia      Front  43755.0  39014.0   

   EngineSize  Cylinders  Horsepower  MPG_City  MPG_Highway  Weight  \
0         3.5        6.0         265        17           23    4451   
1         2.0        4.0         200        24           31    2778   
2         2.4        4.0         200        22           29    3230   
3         3.2        6.0         270        20           28    3575   
4         3.5        6.0         225        18           24    3880   

   Wheelbase  Length  
0        106     189  
1        101     172  
2        105     183  
3        108     186  
4        115     197  


In [2]:
print("Number of missing values:\n", df.isnull().sum())

Number of missing values:
 Make           0
Model          0
Type           0
Origin         0
DriveTrain     0
MSRP           0
Invoice        0
EngineSize     0
Cylinders      0
Horsepower     0
MPG_City       0
MPG_Highway    0
Weight         0
Wheelbase      0
Length         0
dtype: int64


In [3]:
print('Data types and other info:\n',df.info())
print('Size of dataset:\n',df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 428 entries, 0 to 427
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Make         428 non-null    object 
 1   Model        428 non-null    object 
 2   Type         428 non-null    object 
 3   Origin       428 non-null    object 
 4   DriveTrain   428 non-null    object 
 5   MSRP         428 non-null    float64
 6   Invoice      428 non-null    float64
 7   EngineSize   428 non-null    float64
 8   Cylinders    428 non-null    float64
 9   Horsepower   428 non-null    int64  
 10  MPG_City     428 non-null    int64  
 11  MPG_Highway  428 non-null    int64  
 12  Weight       428 non-null    int64  
 13  Wheelbase    428 non-null    int64  
 14  Length       428 non-null    int64  
dtypes: float64(4), int64(6), object(5)
memory usage: 50.3+ KB
Data types and other info:
 None
Size of dataset:
 (428, 15)


#### Data is not having any null values

In [4]:
# Encoding and scaling the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

categorical_f = ['Make', 'Model', 'Type', 'Origin', 'DriveTrain']
df = pd.get_dummies(df,columns=categorical_f)

In [5]:
# Scaling
scaler = StandardScaler()
num_features = ['MSRP', 'Invoice', 'EngineSize', 'Cylinders', 'Horsepower', 'MPG_City', 'MPG_Highway', 'Weight', 'Wheelbase', 'Length']
df[num_features] = scaler.fit_transform(df[num_features])


#### Note: Since we want to predict the likelihood of a customer buying a car, we would ideally need data that indicates actual customer purchases or preferences.
* We don't have actual target varible which indicates actual purchase or not so we will create a proxy
* Assuming low priced cars are more likely to be purchased.
  

In [7]:
# Target varible
threshold_price = df['MSRP'].median()
# 1 = below thresshold likely too buy, 0 = unlikely
df['target'] = (df['MSRP'] < threshold_price).astype(int)

In [8]:
# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

In [10]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# shape of the splits
print(f"Training set: {X_train.shape[0]}")
print(f"Test set: {X_test.shape[0]}")


Training set: 342
Test set: 86


In [11]:
# Model Training
# 1. Logistic regression
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train,y_train)

In [14]:
# 2. Xgboost
from xgboost import XGBClassifier
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train,y_train)

In [13]:
# 3. KNN
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors=5)  
knn_model.fit(X_train, y_train)

In [20]:
from sklearn.metrics import accuracy_score, precision_score,recall_score, f1_score
def evaluate_model(model, X_test,y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")
    

In [21]:
# Evaluate
print("Logistic Regression Performance:")
evaluate_model(lr_model, X_test, y_test)

print("\nXGBoost Performance:")
evaluate_model(xgb_model, X_test, y_test)

print("\nK-Nearest Neighbors Performance:")
evaluate_model(knn_model, X_test, y_test)

Logistic Regression Performance:
Accuracy: 0.94
Precision: 0.88
Recall: 1.00
F1-Score: 0.94

XGBoost Performance:
Accuracy: 0.99
Precision: 0.97
Recall: 1.00
F1-Score: 0.99

K-Nearest Neighbors Performance:
Accuracy: 0.93
Precision: 0.86
Recall: 1.00
F1-Score: 0.93


##### Evaluation results
1. Logistic regression accuracy 94%
2. XGBoost accuracy 99%
3. KNN accuracy 93%

##### Therefore XGBoost is a best model for usecase

In [23]:
# define new data with all required features
new_data = pd.DataFrame({
    'Make': ['Toyota'],
    'Model': ['Camry'],
    'Type': ['Sedan'],
    'Origin': ['Asia'],
    'DriveTrain': ['Front'],
    'MSRP': [25000],
    'Invoice': [23000],
    'EngineSize': [2.5],
    'Cylinders': [4],
    'Horsepower': [200],
    'MPG_City': [22],
    'MPG_Highway': [30],
    'Weight': [3200],
    'Wheelbase': [110],
    'Length': [190]
})


new_data = pd.get_dummies(new_data, columns=['Make', 'Model', 'Type', 'Origin', 'DriveTrain'])

# Align the new_data columns with the training data's columns
new_data = new_data.reindex(columns=df.columns.drop('target'), fill_value=0)


logistic_prediction = lr_model.predict(new_data)
print("Logistic Regression Prediction:", logistic_prediction)

xgboost_prediction = xgb_model.predict(new_data)
print("XGBoost Prediction:", xgboost_prediction)

knn_prediction = knn_model.predict(new_data)
print("KNN Prediction:", knn_prediction)


predictions = {
    "Logistic Regression": logistic_prediction[0],
    "XGBoost": xgboost_prediction[0],
    "K-Nearest Neighbors": knn_prediction[0]
}
print("Comparison of Model Predictions:", predictions)


Logistic Regression Prediction: [0]
XGBoost Prediction: [0]
KNN Prediction: [0]
Comparison of Model Predictions: {'Logistic Regression': 0, 'XGBoost': 0, 'K-Nearest Neighbors': 0}
