In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load the dataset
df = pd.read_csv("car_sales_data.csv")

In [2]:
df.loc[:,'make'] = df.loc[:,['make']].fillna(df['make'].mode()[0])
df.loc[:,'model'] = df.loc[:,['model']].fillna(df['model'].mode()[0])
df.loc[:,'body'] = df.loc[:,['body']].fillna(df['body'].mode()[0])
df.loc[:,'trim'] = df.loc[:,['trim']].fillna(df['trim'].mode()[0])
df.loc[:,'color'] = df.loc[:,['color']].fillna(df['color'].mode()[0])
df.loc[:,'interior'] = df.loc[:,['interior']].fillna(df['interior'].mode()[0])
df.loc[:,'transmission'] = df.loc[:,['transmission']].fillna(df['transmission'].mode()[0])

df.loc[:,'condition'] = df.loc[:,['condition']].fillna(df['condition'].mean())
df.loc[:,'odometer'] = df.loc[:,['odometer']].fillna(df['odometer'].mean())
df.loc[:,'mmr'] = df.loc[:,['mmr']].fillna(df['mmr'].mean())

df.dropna(subset=['sellingprice'],axis='rows',inplace=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59997 entries, 0 to 59996
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          59997 non-null  int64  
 1   make          59997 non-null  object 
 2   model         59997 non-null  object 
 3   trim          59997 non-null  object 
 4   body          59997 non-null  object 
 5   transmission  59997 non-null  object 
 6   vin           59997 non-null  object 
 7   state         59997 non-null  object 
 8   condition     59997 non-null  float64
 9   odometer      59997 non-null  float64
 10  color         59997 non-null  object 
 11  interior      59997 non-null  object 
 12  seller        59997 non-null  object 
 13  mmr           59997 non-null  float64
 14  sellingprice  59997 non-null  float64
 15  saledate      59997 non-null  object 
dtypes: float64(4), int64(1), object(11)
memory usage: 7.8+ MB


In [4]:
import numpy as np

df['is_automatic'] = np.where(df['transmission'] == 'automatic', 1, 0)


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59997 entries, 0 to 59996
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          59997 non-null  int64  
 1   make          59997 non-null  object 
 2   model         59997 non-null  object 
 3   trim          59997 non-null  object 
 4   body          59997 non-null  object 
 5   transmission  59997 non-null  object 
 6   vin           59997 non-null  object 
 7   state         59997 non-null  object 
 8   condition     59997 non-null  float64
 9   odometer      59997 non-null  float64
 10  color         59997 non-null  object 
 11  interior      59997 non-null  object 
 12  seller        59997 non-null  object 
 13  mmr           59997 non-null  float64
 14  sellingprice  59997 non-null  float64
 15  saledate      59997 non-null  object 
 16  is_automatic  59997 non-null  int32  
dtypes: float64(4), int32(1), int64(1), object(11)
memory usage: 8.0+ MB


In [6]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Data Preparation
# Assuming you've already defined X, y, X_train, X_test, y_train, y_test
X = df[['year', 'condition', 'odometer', 'mmr', 'sellingprice']]  # Features
y = df['is_automatic']  # Target variable (assuming 'District' is the classification label)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Choose Classifiers
classifiers = {
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machines (SVMs)": SVC(),
    "Decision Tree": DecisionTreeClassifier()
}

# Initialize a dictionary to store results
results_dict = {'Classifier': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': [], 'Confusion Matrix': []}

# Evaluate Performance
for name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluation Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Append results to dictionary
    results_dict['Classifier'].append(name)
    results_dict['Accuracy'].append(accuracy)
    results_dict['Precision'].append(precision)
    results_dict['Recall'].append(recall)
    results_dict['F1 Score'].append(f1)
    results_dict['Confusion Matrix'].append(conf_matrix)
    
    # Print results
    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Confusion Matrix:\n{conf_matrix}")

# Convert dictionary to DataFrame
results_df = pd.DataFrame(results_dict)




Classifier: K-Nearest Neighbors (KNN)
Accuracy: 0.9621666666666666
Precision: 0.9274968860344205
Recall: 0.9621666666666666
F1 Score: 0.9445137319856168
Confusion Matrix:
[[    0   443]
 [   11 11546]]
Classifier: Naive Bayes
Accuracy: 0.9630833333333333
Precision: 0.9275295069444445
Recall: 0.9630833333333333
F1 Score: 0.9449721172758274
Confusion Matrix:
[[    0   443]
 [    0 11557]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classifier: Support Vector Machines (SVMs)
Accuracy: 0.9630833333333333
Precision: 0.9275295069444445
Recall: 0.9630833333333333
F1 Score: 0.9449721172758274
Confusion Matrix:
[[    0   443]
 [    0 11557]]
Classifier: Decision Tree
Accuracy: 0.92175
Precision: 0.9290661814309317
Recall: 0.92175
F1 Score: 0.9253773935319753
Confusion Matrix:
[[   21   422]
 [  517 11040]]


In [8]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Data Preparation
# Assuming you've already defined X, y, X_train, X_test, y_train, y_test
X = df[['year', 'condition', 'odometer', 'mmr', 'sellingprice']]  # Features
y = df['is_automatic']  # Target variable (assuming 'District' is the classification label)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Choose Classifiers
classifiers = {
    "K-Nearest Neighbors (KNN)": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Support Vector Machines (SVMs)": SVC(),
    "Decision Tree": DecisionTreeClassifier()
}

# Initialize a dictionary to store results
results_dict = {'Classifier': [], 'Accuracy': [], 'Precision': [], 'Recall': [], 'F1 Score': [], 'Confusion Matrix': []}

# Evaluate Performance
for name, classifier in classifiers.items():
    # Train the classifier
    classifier.fit(X_train, y_train)
    
    # Predictions
    y_pred = classifier.predict(X_test)
    
    # Evaluation Metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    conf_matrix = confusion_matrix(y_test, y_pred)
    
    # Append results to dictionary
    results_dict['Classifier'].append(name)
    results_dict['Accuracy'].append(accuracy)
    results_dict['Precision'].append(precision)
    results_dict['Recall'].append(recall)
    results_dict['F1 Score'].append(f1)
    results_dict['Confusion Matrix'].append(conf_matrix)
    
    # Print results
    print(f"Classifier: {name}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"Confusion Matrix:\n{conf_matrix}")

# Convert dictionary to DataFrame
results_df = pd.DataFrame(results_dict)




Classifier: K-Nearest Neighbors (KNN)
Accuracy: 0.9621666666666666
Precision: 0.9274968860344205
Recall: 0.9621666666666666
F1 Score: 0.9445137319856168
Confusion Matrix:
[[    0   443]
 [   11 11546]]
Classifier: Naive Bayes
Accuracy: 0.9630833333333333
Precision: 0.9275295069444445
Recall: 0.9630833333333333
F1 Score: 0.9449721172758274
Confusion Matrix:
[[    0   443]
 [    0 11557]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classifier: Support Vector Machines (SVMs)
Accuracy: 0.9630833333333333
Precision: 0.9275295069444445
Recall: 0.9630833333333333
F1 Score: 0.9449721172758274
Confusion Matrix:
[[    0   443]
 [    0 11557]]
Classifier: Decision Tree
Accuracy: 0.9236666666666666
Precision: 0.9292015450010075
Recall: 0.9236666666666666
F1 Score: 0.926415882177455
Confusion Matrix:
[[   21   422]
 [  494 11063]]


In [9]:
# Display accuracy DataFrame
print("\nAccuracy DataFrame:")
results_df


Accuracy DataFrame:


Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score,Confusion Matrix
0,K-Nearest Neighbors (KNN),0.962167,0.927497,0.962167,0.944514,"[[0, 443], [11, 11546]]"
1,Naive Bayes,0.963083,0.92753,0.963083,0.944972,"[[0, 443], [0, 11557]]"
2,Support Vector Machines (SVMs),0.963083,0.92753,0.963083,0.944972,"[[0, 443], [0, 11557]]"
3,Decision Tree,0.923667,0.929202,0.923667,0.926416,"[[21, 422], [494, 11063]]"


In [10]:
results_dict

{'Classifier': ['K-Nearest Neighbors (KNN)',
  'Naive Bayes',
  'Support Vector Machines (SVMs)',
  'Decision Tree'],
 'Accuracy': [0.9621666666666666,
  0.9630833333333333,
  0.9630833333333333,
  0.9236666666666666],
 'Precision': [0.9274968860344205,
  0.9275295069444445,
  0.9275295069444445,
  0.9292015450010075],
 'Recall': [0.9621666666666666,
  0.9630833333333333,
  0.9630833333333333,
  0.9236666666666666],
 'F1 Score': [0.9445137319856168,
  0.9449721172758274,
  0.9449721172758274,
  0.926415882177455],
 'Confusion Matrix': [array([[    0,   443],
         [   11, 11546]], dtype=int64),
  array([[    0,   443],
         [    0, 11557]], dtype=int64),
  array([[    0,   443],
         [    0, 11557]], dtype=int64),
  array([[   21,   422],
         [  494, 11063]], dtype=int64)]}