# CAR PRICE PREDICTION (Source: kaggle)
A machine learning project to estimate car prices using features such as brand, year, engine size, mileage, fuel type, and transmission.  
It follows steps for preprocessing, model training with Support Vector Regression (SVR), and storing the model with Pickle.

---

## Libraries Used

---

### ▸ pandas (pd)
- Load and manage the dataset  
- Handle cleaning, missing values, and tabular data  

---

### ▸ scikit-learn (sklearn)
- **model_selection** → split dataset into train and test  
- **svm** → Support Vector Regression (SVR)  
- **preprocessing** → scaling numeric features, encoding categorical ones  
- **pipeline** → chain preprocessing and model into one workflow  

---

### ▸ pickle
- Save the trained model into `.pkl` format  
- Reload model later without retraining  
- Helps in deployment and reuse  




In [None]:
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


# CLASS 01 : DATA COLLECTION & PREPROCESSING


In [None]:
class Data:
    """
    Class: Data
    Purpose:
        - Step 1: Data Understanding
        - Step 2: Data Preprocessing
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self.data = None

    def load_data(self):
        """
        Function Name: load_data
        Purpose: Load dataset from CSV file
        """
        self.data = pd.read_csv(self.file_path)
        return self.data

    def head(self, n=5):
        """
        Function Name: head
        Purpose: Return top rows of dataset
        """
        return self.data.head(n)

    def describe(self):
        """
        Function Name: describe
        Purpose: Show summary statistics
        """
        return self.data.describe()

    def missing_values(self):
        """
        Function Name: missing_values
        Purpose: Show count of missing values
        """
        return self.data.isnull().sum()

    def duplicate_rows(self):
        """
        Function Name: duplicate_rows
        Purpose: Count duplicate rows
        """
        return self.data.duplicated().sum()

    def columns(self):
        """
        Function Name: columns
        Purpose: Return list of column names
        """
        return self.data.columns.tolist()

    def shape(self):
        """
        Function Name: shape
        Purpose: Return dataset shape
        """
        return self.data.shape

    def datatypes(self):
        """
        Function Name: datatypes
        Purpose: Return data types of columns
        """
        return self.data.dtypes

    def drop_duplicates(self):
        """
        Function Name: drop_duplicates
        Purpose: Remove duplicate rows
        """
        self.data.drop_duplicates(inplace=True)
        return self.data

    def drop_columns(self, cols):
        """
        Function Name: drop_columns
        Purpose: Drop unnecessary columns
        """
        self.data.drop(columns=cols, inplace=True, errors="ignore")
        return self.data

    def fill_missing_mean(self):
        """
        Function Name: fill_missing_mean
        Purpose: Fill numeric missing values with mean
        """
        self.data.fillna(self.data.mean(numeric_only=True), inplace=True)

    def fill_missing_mode(self):
        """
        Function Name: fill_missing_mode
        Purpose: Fill missing values with mode
        """
        self.data.fillna(self.data.mode().iloc[0], inplace=True)

    

    def get_data(self):
        """
        Function Name: get_data
        Purpose: Return processed dataset
        """
        return self.data



# CLASS 02 : DATA SPLITTING


In [12]:
class Model:
    """
    Class: Model
    Purpose: Splits the dataset into training and testing sets
    """

    def __init__(self, data, target_column):
        """
        Function Name: __init__
        Purpose: Initialize Model with dataset and target column
        """
        self.data = data
        self.target_column = target_column
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None

    def split(self, test_size=0.2, random_state=42):
        """
        Function Name: split
        Purpose: Split dataset into training and testing sets
        """
        X = self.data.drop(columns=[self.target_column])
        y = self.data[self.target_column]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )
        return self.X_train, self.X_test, self.y_train, self.y_test



# CLASS 03 : MODEL TRAINING (SVR)


In [None]:
class Price:
    """
    Class: Price
    Purpose: Train Support Vector Regression (SVR) model for car price prediction
    """

    def __init__(self, df):
        """
        Function Name: __init__
        Purpose: Initialize Price class and clean dataset
        """
        self.df = df.copy()
        self.model = None
        self.X_train = self.X_test = self.y_train = self.y_test = None

        # Drop unnecessary columns
        if "Car ID" in self.df.columns:
            self.df.drop("Car ID", axis=1, inplace=True)

    def split_data(self):
        """
        Function Name: split_data
        Purpose: Split dataset into training and testing sets
        """
        X = self.df.drop("Price", axis=1)
        y = self.df["Price"]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    def train_model(self):
        """
        Function Name: train_model
        Purpose: Train SVR model with StandardScaler pipeline
        """
        self.model = make_pipeline(StandardScaler(), SVR(kernel="rbf"))
        self.model.fit(self.X_train, self.y_train)
        print("Model R² Score:", self.model.score(self.X_test, self.y_test))
        return self.model

    def get_model(self):
        """
        Function Name: get_model
        Purpose: Return trained SVR model
        """
        return self.model

    def predict(self, X):
        """
        Function Name: predict
        Purpose: Predict car prices using trained SVR model
        """
        return self.model.predict(X)



# CLASS 04 : PICKLE SAVE & LOAD


In [14]:
import pickle

class Store:
    """
    Class: Store
    Purpose: Save and load trained model using pickle
    """

    def __init__(self, model):
        """
        Function Name: __init__
        Purpose: Initialize Store class with trained model
        """
        self.model = model

    def save(self, file_path="svm_car_model.pkl"):
        """
        Function Name: save
        Purpose: Save trained model to a pickle file
        """
        with open(file_path, "wb") as f:
            pickle.dump(self.model, f)

    def load(self, file_path="svm_car_model.pkl"):
        """
        Function Name: load
        Purpose: Load trained model from a pickle file
        """
        with open(file_path, "rb") as f:
            return pickle.load(f)


# MAIN EXECUTION :

# DATA LOAD

In [10]:


# Load the uploaded car price dataset
df = pd.read_csv("carpriceEXCEL.csv")

# Show first few rows and column details
df_head = df.head()
df_info = df.info()
df_columns = df.columns.tolist()

df_head, df_columns


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        2500 non-null   int64  
 1   Brand         2500 non-null   object 
 2   Year          2500 non-null   int64  
 3   Engine Size   2500 non-null   float64
 4   Fuel Type     2500 non-null   object 
 5   Transmission  2500 non-null   object 
 6   Mileage       2500 non-null   int64  
 7   Condition     2500 non-null   object 
 8   Price         2500 non-null   float64
 9   Model         2500 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 195.4+ KB


(   Car ID  Brand  Year  Engine Size Fuel Type Transmission  Mileage Condition  \
 0       1  Tesla  2016          2.3    Petrol       Manual   114832       New   
 1       2    BMW  2018          4.4  Electric       Manual   143190      Used   
 2       3   Audi  2013          4.5  Electric       Manual   181601       New   
 3       4  Tesla  2011          4.1    Diesel    Automatic    68682       New   
 4       5   Ford  2009          2.6    Diesel       Manual   223009  Like New   
 
       Price     Model  
 0  26613.92   Model X  
 1  14679.61  5 Series  
 2  44402.61        A4  
 3  86374.33   Model Y  
 4  73577.10   Mustang  ,
 ['Car ID',
  'Brand',
  'Year',
  'Engine Size',
  'Fuel Type',
  'Transmission',
  'Mileage',
  'Condition',
  'Price',
  'Model'])

# STEP 01 : DATA UNDERSTANDING & PREPROCESSING

In [None]:
h = Data("carpriceEXCEL.csv")
df = h.load_data()
print("First rows:\n", h.head())
print("Missing values:\n", h.missing_values())
print("Duplicates:", h.duplicate_rows())
h.drop_columns(["Car ID"]) # Drop unused column
h.fill_missing_mean() 
h.fill_missing_mode() # Fill missing values if any
clean_data = h.encode_categorical()# Encode categorical columns
print("Processed Data:\n", clean_data.head())

First rows:
    Car ID  Brand  Year  Engine Size Fuel Type Transmission  Mileage Condition  \
0       1  Tesla  2016          2.3    Petrol       Manual   114832       New   
1       2    BMW  2018          4.4  Electric       Manual   143190      Used   
2       3   Audi  2013          4.5  Electric       Manual   181601       New   
3       4  Tesla  2011          4.1    Diesel    Automatic    68682       New   
4       5   Ford  2009          2.6    Diesel       Manual   223009  Like New   

      Price     Model  
0  26613.92   Model X  
1  14679.61  5 Series  
2  44402.61        A4  
3  86374.33   Model Y  
4  73577.10   Mustang  
Missing values:
 Car ID          0
Brand           0
Year            0
Engine Size     0
Fuel Type       0
Transmission    0
Mileage         0
Condition       0
Price           0
Model           0
dtype: int64
Duplicates: 0
Processed Data:
    Brand  Year  Engine Size  Fuel Type  Transmission  Mileage  Condition  \
0      5  2016          2.3          3 

  # STEP 02 : DATA SPLITTING

In [15]:
splitter = Model(clean_data, target_column="Price")
X_train, X_test, y_train, y_test = splitter.split()
print("X_train data:", X_train.shape)
print("X_test data:", X_test.shape)

X_train data: (2000, 8)
X_test data: (500, 8)


  # STEP 03 : MODEL TRAINING

In [30]:
model_trainer = Price(clean_data)
model_trainer.split_data()
trained_model = model_trainer.train_model()

Model R² Score: -1.539635878922141e-05


# STEP 04 : SAVE & LOAD MODEL

In [31]:
store_obj = Store(trained_model)
store_obj.save("svm_car_model.pkl")
loaded_model = store_obj.load("svm_car_model.pkl")
print("Model save successfully!")

Model save successfully!
