In [14]:
import pandas as pd           
import numpy as np
import seaborn as sns                       
import matplotlib.pyplot as plt             
import plotly.express as px                 
import plotly.graph_objects as go
import missingno as msno                          
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler  
from sklearn.model_selection import train_test_split 
import joblib


from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's styling to the plots produced later in the notebook.
sns.set(color_codes=True)

In [15]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df=pd.read_csv('data.csv')

In [16]:
import pandas as pd
import joblib

def preprocess_data(df, current_year=2024):
    """Clean the raw car dataset and derive the ``Age_of_Car`` feature.

    The caller's frame is left untouched; all work happens on the renamed copy.

    Steps, in order:
      1. Rename verbose raw columns to short names (e.g. "Engine HP" -> "HP",
         "MSRP" -> "Price").
      2. Fill missing values: column mode for "Cylinders", "Engine Fuel Type"
         and "Number of Doors"; column mean for "HP".
      3. Drop "Market Category" when it is present.
      4. Collapse fuel-type, transmission, drive-mode and vehicle-style labels
         into coarser groups.
      5. Mark DIRECT_DRIVE rows as 'Electric' with zero cylinders.
      6. Add ``Age_of_Car = current_year - Year``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data using the original column names listed in the rename map.
    current_year : int, default 2024
        Reference year used to compute ``Age_of_Car``.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with renamed columns and the extra ``Age_of_Car`` column.
    """
    df = df.rename(columns={
        "Engine HP": "HP",
        "Engine Cylinders": "Cylinders",
        "Transmission Type": "Transmission",
        "Driven_Wheels": "Drive Mode",
        "highway MPG": "MPG-H",
        "city mpg": "MPG-C",
        "MSRP": "Price"
    })

    # Mode imputation for the discrete columns. A column with no observed
    # values has no mode, so it is left as-is instead of raising.
    for column in ('Cylinders', 'Engine Fuel Type', 'Number of Doors'):
        modes = df[column].mode()
        if not modes.empty:
            df[column] = df[column].fillna(modes.iloc[0])
    df['HP'] = df['HP'].fillna(df['HP'].mean())

    # errors='ignore' lets frames that never carried this column pass through.
    df = df.drop(columns=['Market Category'], errors='ignore')

    # Merge fine-grained fuel labels into coarse groups (target label -> raw labels).
    fuel_groups = {
        'Flex fuel': ['flex-fuel (unleaded/E85)', 'flex-fuel (premium unleaded recommended/E85)',
                      'flex-fuel (premium unleaded required/E85)', 'flex-fuel (unleaded/natural gas)'],
        'Premium': ['premium unleaded (recommended)', 'premium unleaded (required)'],
        'Regular': ['regular unleaded'],
    }
    for label, raw_labels in fuel_groups.items():
        df['Engine Fuel Type'] = df['Engine Fuel Type'].replace(raw_labels, label)

    df['Transmission'] = df['Transmission'].replace('UNKNOWN', 'AUTOMATIC')
    df['Drive Mode'] = df['Drive Mode'].replace('four wheel drive', 'all wheel drive')

    # Merge body-style variants into one label per family.
    style_groups = {
        'Hatchback': ['2dr Hatchback', '4dr Hatchback'],
        'Pickup': ['Crew Cab Pickup', 'Regular Cab Pickup', 'Extended Cab Pickup'],
        'SUV': ['4dr SUV', '2dr SUV', 'Convertible SUV'],
        'Van': ['Cargo Van', 'Passenger Van'],
        'Minivan': ['Passenger Minivan', 'Cargo Minivan'],
    }
    for label, raw_labels in style_groups.items():
        df['Vehicle Style'] = df['Vehicle Style'].replace(raw_labels, label)

    # DIRECT_DRIVE rows are overridden after imputation so that any mode-filled
    # fuel type / cylinder count on those rows is replaced.
    is_direct_drive = df['Transmission'] == 'DIRECT_DRIVE'
    df.loc[is_direct_drive, 'Engine Fuel Type'] = 'Electric'
    df.loc[is_direct_drive, 'Cylinders'] = 0

    df['Age_of_Car'] = current_year - df['Year']

    return df


In [17]:

# Run the cleaning step on the raw frame, then inspect the resulting columns,
# dtypes and non-null counts.
cleaned_data = preprocess_data(df)
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Make              11914 non-null  object 
 1   Model             11914 non-null  object 
 2   Year              11914 non-null  int64  
 3   Engine Fuel Type  11914 non-null  object 
 4   HP                11914 non-null  float64
 5   Cylinders         11914 non-null  float64
 6   Transmission      11914 non-null  object 
 7   Drive Mode        11914 non-null  object 
 8   Number of Doors   11914 non-null  float64
 9   Vehicle Size      11914 non-null  object 
 10  Vehicle Style     11914 non-null  object 
 11  MPG-H             11914 non-null  int64  
 12  MPG-C             11914 non-null  int64  
 13  Popularity        11914 non-null  int64  
 14  Price             11914 non-null  int64  
 15  Age_of_Car        11914 non-null  int64  
dtypes: float64(3), int64(6), object(7)
memor

In [18]:
# Preview the first rows of the cleaned frame.
cleaned_data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,HP,Cylinders,Transmission,Drive Mode,Number of Doors,Vehicle Size,Vehicle Style,MPG-H,MPG-C,Popularity,Price,Age_of_Car
0,BMW,1 Series M,2011,Premium,335.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Coupe,26,19,3916,46135,13
1,BMW,1 Series,2011,Premium,300.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Convertible,28,19,3916,40650,13
2,BMW,1 Series,2011,Premium,300.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Coupe,28,20,3916,36350,13
3,BMW,1 Series,2011,Premium,230.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Coupe,28,18,3916,29450,13
4,BMW,1 Series,2011,Premium,230.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Convertible,28,18,3916,34500,13


In [26]:
# NOTE(review): repeats the cleaned_data.info() call made right after
# preprocessing; the recorded output is identical, so this cell looks redundant.
cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11914 entries, 0 to 11913
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Make              11914 non-null  object 
 1   Model             11914 non-null  object 
 2   Year              11914 non-null  int64  
 3   Engine Fuel Type  11914 non-null  object 
 4   HP                11914 non-null  float64
 5   Cylinders         11914 non-null  float64
 6   Transmission      11914 non-null  object 
 7   Drive Mode        11914 non-null  object 
 8   Number of Doors   11914 non-null  float64
 9   Vehicle Size      11914 non-null  object 
 10  Vehicle Style     11914 non-null  object 
 11  MPG-H             11914 non-null  int64  
 12  MPG-C             11914 non-null  int64  
 13  Popularity        11914 non-null  int64  
 14  Price             11914 non-null  int64  
 15  Age_of_Car        11914 non-null  int64  
dtypes: float64(3), int64(6), object(7)
memor

In [20]:
# Separate the target column from the predictors.
y = cleaned_data['Price']
X = cleaned_data.drop(columns='Price')

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [117]:
# Report the dimensions of each split.
# NOTE(review): the recorded output for this cell shows 996 feature columns,
# whereas X as built in the split cell holds only the 15 non-target columns of
# cleaned_data. The saved output appears to come from an earlier kernel state;
# re-run to refresh it.
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (9531, 996)
Shape of X_test: (2383, 996)
Shape of y_train: (9531,)
Shape of y_test: (2383,)


In [27]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

def preprocessing_pipeline(df, output_path='preprocessing_pipeline.joblib'):
    """Fit the feature-encoding pipeline on ``df`` and persist it with joblib.

    Categorical columns are one-hot encoded and numeric columns are
    standardised. The encoder is created with ``handle_unknown='ignore'``:
    a category that was not present at fit time is encoded as all zeros for
    that feature at transform time instead of raising, so a new record with
    an unseen make or model can still be scored. Output for categories seen
    during fitting is unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data containing the categorical and numeric columns listed
        in the body. A "Price" column, if present, is dropped before fitting,
        so a feature-only frame (e.g. the training split) is accepted too.
    output_path : str, default 'preprocessing_pipeline.joblib'
        File the fitted pipeline is written to.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline, containing a single 'preprocessing' step.
    """
    categorical_cols = ['Make', 'Model', 'Engine Fuel Type', 'Transmission',
                        'Drive Mode', 'Vehicle Size', 'Vehicle Style']
    numeric_cols = ['Year', 'HP', 'Cylinders', 'Number of Doors',
                    'MPG-H', 'MPG-C', 'Popularity', 'Age_of_Car']

    preprocessing = ColumnTransformer(transformers=[
        # One-hot encode categoricals; unseen categories become all-zero rows.
        ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        # Standardise numeric columns.
        ('scaler', StandardScaler(), numeric_cols)
    ])

    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing)
    ])

    # The target must not be part of the fitted features; tolerate frames
    # that never contained it.
    X = df.drop(columns='Price', errors='ignore')
    pipeline.fit(X)

    joblib.dump(pipeline, output_path)

    return pipeline


In [28]:
# Fit the preprocessing pipeline and write it to disk.
# NOTE(review): this fits on all of cleaned_data, and rf_model_pipeline() later
# splits the same frame, so the encoder categories and scaler statistics are
# computed from rows that end up in the test split. Consider fitting on the
# training rows only.
preprocessing_pipeline(cleaned_data)

Pipeline(steps=[('preprocessing',
                 ColumnTransformer(transformers=[('ohe', OneHotEncoder(),
                                                  ['Make', 'Model',
                                                   'Engine Fuel Type',
                                                   'Transmission', 'Drive Mode',
                                                   'Vehicle Size',
                                                   'Vehicle Style']),
                                                 ('scaler', StandardScaler(),
                                                  ['Year', 'HP', 'Cylinders',
                                                   'Number of Doors', 'MPG-H',
                                                   'MPG-C', 'Popularity',
                                                   'Age_of_Car'])]))])

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

def rf_model_pipeline(df, n_estimators=50, test_size=0.2, random_state=42,
                      preprocessing_path='preprocessing_pipeline.joblib',
                      model_path='rf_model.joblib'):
    """Train a random-forest price model and return its held-out predictions.

    The already-fitted preprocessing pipeline is loaded from
    ``preprocessing_path`` (it is not refitted here), ``df`` is split into
    train and test parts, the forest is trained on the transformed training
    part and saved to ``model_path``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data including the "Price" target column.
    n_estimators : int, default 50
        Number of trees in the forest.
    test_size : float, default 0.2
        Fraction of rows held out by ``train_test_split``.
    random_state : int, default 42
        Seed used for both the split and the forest.
    preprocessing_path : str, default 'preprocessing_pipeline.joblib'
        Location of the fitted preprocessing pipeline.
    model_path : str, default 'rf_model.joblib'
        Where the trained model is written.

    Returns
    -------
    numpy.ndarray
        Predictions for the held-out rows. The matching true prices are not
        returned; a caller that needs them must reproduce the split with the
        same ``test_size`` and ``random_state``.
    """
    preprocessing = joblib.load(preprocessing_path)

    X = df.drop(columns='Price')
    y = df['Price']

    # The held-out targets are not used inside this function.
    X_train, X_test, y_train, _ = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf_model.fit(preprocessing.transform(X_train), y_train)

    joblib.dump(rf_model, model_path)

    return rf_model.predict(preprocessing.transform(X_test))



In [31]:
# Train the random forest on cleaned_data (this also writes the model file) and
# keep the predictions it returns for its internal held-out split.
y_pred = rf_model_pipeline(cleaned_data)

In [32]:
def preprocess_and_predict(new_input_df):
    """Score new records with the saved preprocessing pipeline and model.

    Both artifacts are loaded from the working directory on every call.
    ``new_input_df`` is handed directly to the saved pipeline's ``transform``;
    no renaming or imputation happens here, so it presumably must already
    carry the columns the pipeline was fitted on.

    Returns whatever the loaded model's ``predict`` produces for the
    transformed input.
    """
    encoder = joblib.load('preprocessing_pipeline.joblib')
    model = joblib.load('rf_model.joblib')
    return model.predict(encoder.transform(new_input_df))



In [33]:
import pandas as pd

# One example record, written as column -> value and then wrapped into
# single-element lists so it forms a one-row frame.
# Age_of_Car is entered by hand here rather than derived from Year.
new_input_data = {
    column: [value]
    for column, value in {
        'Make': 'BMW',
        'Model': '1 Series',
        'Year': 2012,
        'Engine Fuel Type': 'Premium',
        'HP': 320.0,
        'Cylinders': 6.0,
        'Transmission': 'MANUAL',
        'Drive Mode': 'rear wheel drive',
        'Number of Doors': 2.0,
        'Vehicle Size': 'Compact',
        'Vehicle Style': 'Coupe',
        'MPG-H': 30,
        'MPG-C': 22,
        'Popularity': 3916,
        'Age_of_Car': 12,
    }.items()
}

new_input_df = pd.DataFrame.from_dict(new_input_data)


In [34]:
# Score the example record with the saved artifacts and show the result.
prediction = preprocess_and_predict(new_input_df)
print("Predicted price:", prediction)

Predicted price: [46874.6]


In [103]:
# NOTE(review): no top-level `rf_model` is defined in the visible cells. The
# model object inside rf_model_pipeline() is local, and that function already
# writes 'rf_model.joblib' itself. Unless rf_model is defined elsewhere, this
# cell raises NameError on a fresh kernel and looks like a leftover — confirm
# and remove.
joblib.dump(rf_model, 'rf_model.joblib')

['rf_model.joblib']