# In summary,
this code loads a dataset of car information, preprocesses the data, builds a linear regression model to predict car prices, and evaluates the model's performance using metrics like R-squared and mean absolute error. The dataset is prepared with one-hot encoding for categorical variables, and numerical features are selected for modeling.

# Import Libraries:
The code begins by importing necessary libraries: pandas, matplotlib.pyplot, seaborn, and numpy.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Load the Dataset:

car_dataset = pd.read_csv("cars.csv"): This line reads a CSV file called "cars.csv" and stores it in a pandas DataFrame named car_dataset.

In [2]:
car_dataset = pd.read_csv("cars.csv")

#
car_dataset.head(10): It displays the first 10 rows of the dataset.

In [3]:
car_dataset.head(10)

Unnamed: 0,Id,year,brand,full_model_name,model_name,price,distance_travelled(kms),fuel_type,city,brand_rank,car_age
0,0,2016,Honda,Honda Brio S MT,Brio,425000.0,9680.0,Petrol,Mumbai,7,5.0
1,1,2012,Nissan,Nissan Sunny XV Diesel,Sunny,325000.0,119120.0,Diesel,Mumbai,11,9.0
2,2,2017,Toyota,Toyota Fortuner 2.8 4x2 MT [2016-2020],Fortuner,2650000.0,64593.0,Diesel,Thane,1,4.0
3,3,2017,Mercedes-Benz,Mercedes-Benz E-Class E 220d Expression [2019-...,E-Class,4195000.0,25000.0,Diesel,Mumbai,2,4.0
4,4,2012,Hyundai,Hyundai Verna Fluidic 1.6 CRDi SX,Verna,475000.0,23800.0,Diesel,Mumbai,14,9.0
5,5,2012,Hyundai,Hyundai i20 Sportz 1.2 BS-IV,i20,335000.0,45000.0,Petrol,Mumbai,14,9.0
6,6,2019,Toyota,Toyota Glanza V,Glanza,750000.0,19500.0,Petrol,Mumbai,1,2.0
7,7,2018,Mercedes-Benz,Mercedes-Benz GLE 250 d,GLE,5500000.0,32000.0,Diesel,Mumbai,2,3.0
8,8,2017,Hyundai,Hyundai Grand i10 Sportz (O) AT 1.2 Kappa VTVT...,Grand,565000.0,44329.0,Petrol,Mumbai,14,4.0
9,9,2015,Maruti Suzuki,Maruti Suzuki Swift Dzire ZXI,Swift,510000.0,48286.0,Petrol,Mumbai,32,6.0


In [4]:
car_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1725 entries, 0 to 1724
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Id                       1725 non-null   int64  
 1   year                     1725 non-null   int64  
 2   brand                    1725 non-null   object 
 3   full_model_name          1725 non-null   object 
 4   model_name               1725 non-null   object 
 5   price                    1725 non-null   float64
 6   distance_travelled(kms)  1725 non-null   float64
 7   fuel_type                1725 non-null   object 
 8   city                     1725 non-null   object 
 9   brand_rank               1725 non-null   int64  
 10  car_age                  1725 non-null   float64
dtypes: float64(3), int64(3), object(5)
memory usage: 148.4+ KB


In [5]:
car_dataset.shape

(1725, 11)

# Data Selection:

y = car_dataset["price"]: Creates a target variable y containing car prices.

car_dataset.columns: Lists the column names in the dataset.

x = car_dataset[["year", "distance_travelled(kms)", "brand_rank", "car_age"]]: Selects specific features (year, distance traveled, brand rank, and car age) and stores them in a DataFrame called x.

In [6]:
y = car_dataset["price"]

In [7]:
y

0        425000.0
1        325000.0
2       2650000.0
3       4195000.0
4        475000.0
          ...    
1720     290000.0
1721    7500000.0
1722     185000.0
1723     325000.0
1724    1395000.0
Name: price, Length: 1725, dtype: float64

In [8]:
car_dataset.columns  

Index(['Id', 'year', 'brand', 'full_model_name', 'model_name', 'price',
       'distance_travelled(kms)', 'fuel_type', 'city', 'brand_rank',
       'car_age'],
      dtype='object')

In [9]:
x = car_dataset[["year","distance_travelled(kms)","brand_rank","car_age"]]

In [10]:
x

Unnamed: 0,year,distance_travelled(kms),brand_rank,car_age
0,2016,9680.0,7,5.0
1,2012,119120.0,11,9.0
2,2017,64593.0,1,4.0
3,2017,25000.0,2,4.0
4,2012,23800.0,14,9.0
...,...,...,...,...
1720,2015,38000.0,14,6.0
1721,2011,36000.0,44,10.0
1722,2008,142522.0,24,13.0
1723,1990,18581.0,24,31.0


In [None]:
# Another method for categorical variable
# car_dataset.replace({'fuel_type':{'Petrol':0,'Diesel':1,'CNG + 1':2,'Petrol + 1':3,'Hybrid':4}},inplace=True)
# car_dataset.replace({'city':{'Chennai':0,'Bangalore':1,'Pune':2,'Hyderabad':3,'Delhi':4,'Mumbai':5,'Dehradun':6,'Noida':7,'Ghaziabad':8,'Faridabad':9,'Panchkula':10,'Agra':11,'Thane':12,'Navi Mumbai':13,'Lucknow':14}},inplace=True)

# One-Hot Encoding:

Brand_dummy = pd.get_dummies(Brand): Converts the "brand" column into one-hot encoded dummy variables.

final_Brand_dummy = pd.get_dummies(car_dataset["brand"], drop_first=True): Creates one-hot encoded dummies for "brand" while dropping the first category to avoid multicollinearity.

Similar one-hot encoding is done for "fuel_type" and "city" columns.

In [11]:
Brand = car_dataset["brand"]

In [12]:
Brand_dummy = pd.get_dummies(Brand)

In [13]:
Brand_dummy

Unnamed: 0,Audi,BMW,Bentley,Chevrolet,Datsun,Fiat,Ford,Honda,Hyundai,Isuzu,...,Mercedes-Benz,Mitsubishi,Nissan,Porsche,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1721,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
final_Brand_dummy=pd.get_dummies(car_dataset["brand"],drop_first=True)

In [15]:
final_Brand_dummy

Unnamed: 0,BMW,Bentley,Chevrolet,Datsun,Fiat,Ford,Honda,Hyundai,Isuzu,Jaguar,...,Mercedes-Benz,Mitsubishi,Nissan,Porsche,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1721,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1722,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
final_fuel_type_dummy=pd.get_dummies(car_dataset["fuel_type"],drop_first=True)

In [17]:
final_fuel_type_dummy

Unnamed: 0,Diesel,Hybrid,Petrol,Petrol + 1
0,0,0,1,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
1720,0,0,1,0
1721,0,0,1,0
1722,1,0,0,0
1723,1,0,0,0


In [18]:
final_city_dummy=pd.get_dummies(car_dataset["city"],drop_first=True)

In [19]:
final_city_dummy

Unnamed: 0,Bangalore,Chennai,Dehradun,Delhi,Faridabad,Ghaziabad,Hyderabad,Lucknow,Mumbai,Navi Mumbai,Noida,Panchkula,Pune,Thane
0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1721,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1722,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1723,0,0,0,0,0,0,0,0,0,0,0,0,1,0


# Feature Concatenation:

X = pd.concat([final_fuel_type_dummy, final_city_dummy, final_Brand_dummy, x], axis=1): Combines the one-hot encoded variables and selected features (x) into a single DataFrame X for modeling.

In [20]:
X = pd.concat([final_fuel_type_dummy,final_city_dummy,final_Brand_dummy,x],axis=1)

In [21]:
X

Unnamed: 0,Diesel,Hybrid,Petrol,Petrol + 1,Bangalore,Chennai,Dehradun,Delhi,Faridabad,Ghaziabad,...,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo,year,distance_travelled(kms),brand_rank,car_age
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2016,9680.0,7,5.0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2012,119120.0,11,9.0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,2017,64593.0,1,4.0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2017,25000.0,2,4.0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2012,23800.0,14,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2015,38000.0,14,6.0
1721,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2011,36000.0,44,10.0
1722,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2008,142522.0,24,13.0
1723,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1990,18581.0,24,31.0


# Train-Test Split:

from sklearn.model_selection import train_test_split: Import the train_test_split function from scikit-learn.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1): Splits the dataset into training and testing sets with an 80-20 ratio

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
X_train, X_test, y_train , y_test= train_test_split(X,y ,test_size=0.20 , random_state=1)

# Linear Regression Model:

model = LinearRegression(): Creates a Linear Regression model.
    
model.fit(X_train, y_train): Fits (trains) the model using the training data.

In [25]:
from sklearn.linear_model import LinearRegression

In [26]:
model = LinearRegression()

In [27]:
model.fit(X_train,y_train)

LinearRegression()

# Predictions and Model Evaluation:

y_predict = model.predict(X_test): Makes predictions on the test data.

model.score(X_train, y_train): Computes the R-squared score on the training data.

model.score(X_test, y_test): Computes the R-squared score on the test data.

metrics.mean_absolute_error(y_test, y_predict): Calculates the mean absolute error between the actual and predicted car prices.

model.coef_ and model.intercept_: Retrieves the coefficients and intercept of the linear regression model.

In [28]:
y_preadict = model.predict(X_test)

In [29]:
y_preadict

array([1053637.55767769, 2551187.03015131, 1133444.93959734,
        -90360.97123811,  736462.91689804,  227358.78641376,
        623956.20782882,  125047.22546777,  875535.36064407,
       3288707.85656476,  779545.42195666,  578690.75556064,
        981063.76111716,  613139.87804157, 4947105.90525684,
       2222806.47699791, 1208779.55871916, 2620918.89444557,
       4024238.60013875,  626439.729478  , 4608632.82284576,
       -221427.37382826, 4626913.1386587 , 3756620.52236456,
       3261721.26638818,  506186.50647077, 2561892.87360412,
       3162049.90504336, 3304577.50274795, 1323625.39688212,
        616743.94932199, 1029788.52813119, 3355060.81338826,
        953496.60694438, 1767432.61519235, 3666848.6275481 ,
       1907105.84688002,  791577.90268898, 2747735.51061472,
       1464463.68800569,  396203.63957086, 2220263.7049301 ,
       1322760.32266328, 1436393.09641203, 2973276.52539837,
       3889173.84704566,  876873.66200033,   38400.22303995,
        159473.03980747,

In [30]:
model.score(X_train,y_train)

0.634342391459094

In [31]:
X_test

Unnamed: 0,Diesel,Hybrid,Petrol,Petrol + 1,Bangalore,Chennai,Dehradun,Delhi,Faridabad,Ghaziabad,...,Renault,Skoda,Tata,Toyota,Volkswagen,Volvo,year,distance_travelled(kms),brand_rank,car_age
969,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2016,66000.0,7,5.0
608,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,2012,103000.0,4,9.0
767,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,1,0,2017,30000.0,3,4.0
764,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2010,116594.0,7,11.0
592,0,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,2017,47265.0,40,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1430,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,2019,17000.0,14,2.0
1491,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2008,77000.0,14,13.0
1206,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,2012,48000.0,2,9.0
49,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2015,71000.0,4,6.0


In [32]:
y_test

969      775000.0
608     1850000.0
767      575000.0
764      650000.0
592      545000.0
          ...    
1430     995000.0
1491     190000.0
1206    1450000.0
49      2125000.0
94       425000.0
Name: price, Length: 345, dtype: float64

In [33]:
model.fit(X,y)

LinearRegression()

In [34]:
LinearRegression()

LinearRegression()

# Predictions and Model Evaluation:

y_predict = model.predict(X_test): Makes predictions on the test data.

model.score(X_train, y_train): Computes the R-squared score on the training data.

model.score(X_test, y_test): Computes the R-squared score on the test data.

metrics.mean_absolute_error(y_test, y_predict): Calculates the mean absolute error between the actual and predicted car prices.

model.coef_ and model.intercept_: Retrieves the coefficients and intercept of the linear regression model.

In [35]:
from sklearn import metrics

In [36]:
metrics.mean_absolute_error(y_test,y_preadict)

568726.8455944421

In [37]:
model.coef_

array([ 1.85558618e+05,  3.11075170e+05,  1.35256885e+05, -1.62789574e+05,
        5.08621125e+05,  5.06998764e+05,  6.95114114e+05,  9.67293877e+05,
        1.86031165e+06,  5.40451502e+05,  7.16260442e+05,  4.80747525e+05,
        3.10869370e+05,  8.92198032e+05,  3.48548272e+05, -3.70112375e+05,
        5.93232975e+05,  4.58651200e+05,  9.45005491e+05,  6.24137118e+06,
       -1.61182663e+06, -2.72554870e+06, -1.44131545e+06, -8.06363503e+05,
       -1.55204486e+06, -1.73083250e+06, -4.17524970e+05,  7.46383954e+05,
       -1.15015040e+06, -1.74665203e+06,  1.01012602e+07,  1.69338454e+06,
        1.15745230e+06, -2.29312693e+06,  1.51360385e+05, -1.64223639e+06,
       -1.02495655e+06, -2.11052747e+06,  1.06783759e+06, -1.44693935e+06,
       -1.56951974e+06,  2.87511193e+06, -2.03240367e+06, -1.53349735e+06,
       -2.10747774e+06, -5.26996826e+05, -1.60681921e+06,  1.15972923e+06,
        8.60628730e+04, -1.50635655e+00,  1.21835281e+04, -8.60628730e+04])

In [38]:
model.intercept_

-171341982.31527802

In [39]:
model.score(X_test,y_test)

0.6472355129120547

# Machine Learning Model Setup:
from sklearn.ensemble import RandomForestRegressor imports a machine learning model called RandomForestRegressor.

from sklearn.model_selection import train_test_split is used to split the data into training and testing sets.

reg = RandomForestRegressor() creates an instance of the RandomForestRegressor model.

X and y are defined as the input features and target variable, respectively.

X_train, X_test, y_train, and y_test are created by splitting the data into training and testing sets.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [None]:
reg = RandomForestRegressor()

# Model Training and Evaluation:
reg.fit(X_train, y_train) trains the RandomForestRegressor model. reg.score(X_train, y_train) calculates the model's accuracy on the training data. evaluate() is defined to evaluate the model's performance, calculating metrics like average error and accuracy.

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [None]:
reg.fit(X_train,y_train)

In [None]:
reg.score(X_train,y_train)

# Evaluating Model Performance:
evaluate(reg, X_train, y_train) and evaluate(reg, X_test, y_test) are called to evaluate the model's performance on the training and testing datasets, respectively.

In [None]:
def evaluate(model, X, y):
    predictions = model.predict(X)
    errors = abs(predictions - y)
    mape = 100 * np.mean(errors / y)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [None]:
# Evaluating the model performance

evaluate(reg,X_train,y_train)

In [None]:
evaluate(reg,X_test,y_test)

# Feature Importance:
feature_importance = reg.feature_importances_ calculates the importance of each feature in the model. The code then creates a DataFrame called paris_df to store the feature names and their importance scores.

In [None]:
feature_importance = reg.feature_importances_

In [None]:
for i in range(0,len(feature_importance),1):
    new_array = zip(X.columns,feature_importance)

In [None]:
car_dataset = pd.DataFrame(new_array)

# Sorting Feature Importance:
paris_df.sort_values(by=1, ascending=False) sorts the features by their importance score in descending order, helping identify which features are most important in predicting the target variable.

#Sorting feature importance

paris_df.sort_values(by=1,ascending=False)
#Sorting feature importance

In [None]:
#Sorting feature importance

car_dataset.sort_values(by=1,ascending=False)