In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import SelectKBest, f_regression

In [5]:
# Load dataset
print("Loading dataset...")
df = pd.read_csv('C:/Users/bhard/OneDrive/Desktop/CAR PRICE PRICTION WITH MACHINE LEARNING/cardata.csv')

Loading dataset...


In [6]:
# Display the first few rows and column names of the dataset
print("\nDataset loaded successfully. Here are the first few rows:")
print(df.head())
print("\nColumns in the dataset:")
print(df.columns)


Dataset loaded successfully. Here are the first few rows:
  Car_Name  Year  Selling_Price  Present_Price  Driven_kms Fuel_Type  \
0     ritz  2014           3.35           5.59       27000    Petrol   
1      sx4  2013           4.75           9.54       43000    Diesel   
2     ciaz  2017           7.25           9.85        6900    Petrol   
3  wagon r  2011           2.85           4.15        5200    Petrol   
4    swift  2014           4.60           6.87       42450    Diesel   

  Selling_type Transmission  Owner  
0       Dealer       Manual      0  
1       Dealer       Manual      0  
2       Dealer       Manual      0  
3       Dealer       Manual      0  
4       Dealer       Manual      0  

Columns in the dataset:
Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Driven_kms',
       'Fuel_Type', 'Selling_type', 'Transmission', 'Owner'],
      dtype='object')


In [7]:
# Handle missing values
print("\nHandling missing values...")
df = df.dropna()
print(f"Number of rows after removing missing values: {df.shape[0]}")


Handling missing values...
Number of rows after removing missing values: 301


In [8]:
# Convert categorical features to numerical
print("\nConverting categorical features...")
df = pd.get_dummies(df, columns=['Car_Name', 'Fuel_Type', 'Selling_type', 'Transmission'], drop_first=True)
print(f"Columns after encoding categorical variables: {df.columns}")



Converting categorical features...
Columns after encoding categorical variables: Index(['Year', 'Selling_Price', 'Present_Price', 'Driven_kms', 'Owner',
       'Car_Name_Activa 3g', 'Car_Name_Activa 4g', 'Car_Name_Bajaj  ct 100',
       'Car_Name_Bajaj Avenger 150', 'Car_Name_Bajaj Avenger 150 street',
       ...
       'Car_Name_swift', 'Car_Name_sx4', 'Car_Name_verna',
       'Car_Name_vitara brezza', 'Car_Name_wagon r', 'Car_Name_xcent',
       'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Selling_type_Individual',
       'Transmission_Manual'],
      dtype='object', length=106)


In [9]:
# Log transformation to handle extreme values
print("\nApplying log transformation to skewed features...")
df['Driven_kms'] = np.log1p(df['Driven_kms'])
df['Present_Price'] = np.log1p(df['Present_Price'])


Applying log transformation to skewed features...


In [10]:
# Display summary statistics
print("\nDisplaying summary statistics:")
print(df.describe())


Displaying summary statistics:
              Year  Selling_Price  Present_Price  Driven_kms       Owner
count   301.000000     301.000000     301.000000  301.000000  301.000000
mean   2013.627907       4.661296       1.771675   10.118771    0.043189
std       2.891554       5.082812       0.900768    1.013392    0.247915
min    2003.000000       0.100000       0.277632    6.216606    0.000000
25%    2012.000000       0.900000       0.788457    9.615872    0.000000
50%    2014.000000       3.600000       2.001480   10.373522    0.000000
75%    2016.000000       6.000000       2.388763   10.794830    0.000000
max    2018.000000      35.000000       4.539030   13.122365    3.000000


In [11]:
# Define features and target variable
target_column = 'Selling_Price'
X = df.drop(target_column, axis=1)
y = df[target_column]

In [12]:
# Check for extreme values
print("\nChecking for extreme values...")
print(f"Max and Min values in features: {X.max().max()} and {X.min().min()} respectively")


Checking for extreme values...
Max and Min values in features: 2018 and 0 respectively


In [13]:
# Feature Selection
print("\nSelecting top features...")
selector = SelectKBest(score_func=f_regression, k=10)
X_new = selector.fit_transform(X, y)
print(f"Selected features indices: {selector.get_support(indices=True)}")


Selecting top features...
Selected features indices: [  0   1  72  84  89  91 101 102 103 104]


In [14]:
# Scale features
print("\nScaling features...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_new)


Scaling features...


In [15]:
# Split data into training and testing sets
print("\nSplitting data into training and testing sets...")
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


Splitting data into training and testing sets...


In [16]:
# Initialize and train the model
print("\nTraining the model...")
model = LinearRegression()
model.fit(X_train, y_train)


Training the model...


In [17]:
# Cross-validation
print("\nPerforming cross-validation...")
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='r2')
print(f"Cross-validation R² scores: {cv_scores}")
print(f"Average cross-validation R² score: {cv_scores.mean():.2f}")


Performing cross-validation...
Cross-validation R² scores: [-2.70208601e+26  7.01478942e-01 -5.35505955e+01  5.43131318e-01
  7.61229143e-01]
Average cross-validation R² score: -54041720119526279536967680.00


In [18]:
# Make predictions
print("\nMaking predictions...")
y_pred = model.predict(X_test)


Making predictions...


In [19]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'\nModel Evaluation:')
print(f'  Mean Squared Error (MSE): {mse:.2e}')
print(f'  R-squared (R²): {r2:.2f}')



Model Evaluation:
  Mean Squared Error (MSE): 3.11e+00
  R-squared (R²): 0.86
