In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


In [3]:
# Load the dataset (assuming the file is named 'toyota_corolla.csv')
data = pd.read_csv('ToyotaCorolla - MLR.csv')  # Adjust the file path as needed
print(data.head())


   Price  Age_08_04     KM Fuel_Type  HP  Automatic    cc  Doors  Cylinders  \
0  13500         23  46986    Diesel  90          0  2000      3          4   
1  13750         23  72937    Diesel  90          0  2000      3          4   
2  13950         24  41711    Diesel  90          0  2000      3          4   
3  14950         26  48000    Diesel  90          0  2000      3          4   
4  13750         30  38500    Diesel  90          0  2000      3          4   

   Gears  Weight  
0      5    1165  
1      5    1165  
2      5    1165  
3      5    1165  
4      5    1170  


In [15]:
# Check for missing values
print(data.isnull().sum())


Price                  0
Age_08_04              0
KM                     0
HP                     0
Automatic           1436
cc                     0
Doors                  0
Cylinders              0
Gears                  0
Weight                 0
Fuel_Type_Diesel       0
Fuel_Type_Petrol       0
dtype: int64


In [22]:
# Drop rows where 'Automatic' column has missing values
data = data.dropna(subset=['Automatic'])

# Verify that missing values have been removed
print(data.isnull().sum())


Price               0
Age_08_04           0
KM                  0
HP                  0
Automatic           0
cc                  0
Doors               0
Cylinders           0
Gears               0
Weight              0
Fuel_Type_Diesel    0
Fuel_Type_Petrol    0
dtype: int64


In [23]:
# Define features (X) and target (y)
X = data.drop('Price', axis=1)  # All columns except 'Price'
y = data['Price']  # Target variable 'Price'


In [24]:
#Encode Categorical Variables
# One-hot encode the categorical columns
X = pd.get_dummies(X, drop_first=True)  # Drop the first category to avoid multicollinearity


In [26]:
# Check how many rows remain after dropping rows with missing values in the 'Automatic' column
print(f"Rows before dropping missing values: {len(data)}")

# Drop rows where 'Automatic' column has missing values
data = data.dropna(subset=['Automatic'])

# Check how many rows remain
print(f"Rows after dropping missing values: {len(data)}")

# If there are no rows left, this will cause issues in training the model.
if len(data) == 0:
    print("The dataset is empty after removing rows with missing 'Automatic' values.")


Rows before dropping missing values: 0
Rows after dropping missing values: 0
The dataset is empty after removing rows with missing 'Automatic' values.


In [28]:
# Define your features (X) and target variable (y)
X = data.drop('Price', axis=1)  # Features excluding 'Price'
y = data['Price']  # Target variable 'Price'


In [30]:
# Check the dataset before dropping missing values
print("Rows before dropping missing values: ", data.shape[0])

# Drop rows where 'Automatic' is missing
data_cleaned = data.dropna(subset=['Automatic'])

# Check the dataset after dropping missing values
print("Rows after dropping missing values: ", data_cleaned.shape[0])

# If there are still rows left, proceed with the next steps
if data_cleaned.shape[0] > 0:
    # Define your features (X) and target variable (y)
    X = data_cleaned.drop('Price', axis=1)  # Features excluding 'Price'
    y = data_cleaned['Price']  # Target variable 'Price'

    # Split the data into training and testing sets (80% training, 20% testing)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Check the shapes of the resulting sets
    print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
else:
    print("The dataset is empty after dropping missing 'Automatic' values. Please check the data or consider filling missing values.")


Rows before dropping missing values:  0
Rows after dropping missing values:  0
The dataset is empty after dropping missing 'Automatic' values. Please check the data or consider filling missing values.


In [33]:
# Check the number of missing values in the 'Automatic' column before imputation
print(f"Missing values in 'Automatic' column: {data['Automatic'].isnull().sum()}")

# Check the shape of the data to see how many rows are there
print(f"Shape of the data before imputation: {data.shape}")


Missing values in 'Automatic' column: 0
Shape of the data before imputation: (0, 12)


In [34]:
# Check missing values in all columns
missing_values = data.isnull().sum()
print(missing_values)


Price               0
Age_08_04           0
KM                  0
HP                  0
Automatic           0
cc                  0
Doors               0
Cylinders           0
Gears               0
Weight              0
Fuel_Type_Diesel    0
Fuel_Type_Petrol    0
dtype: int64


In [37]:
# Separate features (X) and target (y)
X = data.drop(columns=['Price'])  # Drop the target column 'Price'
y = data['Price']  # The target column 'Price'


In [39]:
# Check if the data is loaded correctly
print(data.head())  # Print first few rows of the dataset
print(data.shape)   # Check the shape of the data (rows, columns)


Empty DataFrame
Columns: [Price, Age_08_04, KM, HP, Automatic, cc, Doors, Cylinders, Gears, Weight, Fuel_Type_Diesel, Fuel_Type_Petrol]
Index: []
(0, 12)


In [40]:
# Check if the data is loaded correctly
import pandas as pd

# Load the dataset
data = pd.read_csv('ToyotaCorolla - MLR.csv')  # Make sure the file path is correct

# Print the first few rows and the shape of the data
print(data.head())
print(data.shape)


   Price  Age_08_04     KM Fuel_Type  HP  Automatic    cc  Doors  Cylinders  \
0  13500         23  46986    Diesel  90          0  2000      3          4   
1  13750         23  72937    Diesel  90          0  2000      3          4   
2  13950         24  41711    Diesel  90          0  2000      3          4   
3  14950         26  48000    Diesel  90          0  2000      3          4   
4  13750         30  38500    Diesel  90          0  2000      3          4   

   Gears  Weight  
0      5    1165  
1      5    1165  
2      5    1165  
3      5    1165  
4      5    1170  
(1436, 11)


In [41]:
# One-hot encode 'Fuel_Type' and 'Automatic' columns
data_encoded = pd.get_dummies(data, columns=['Fuel_Type', 'Automatic'], drop_first=True)

# Check the encoded data
print(data_encoded.head())


   Price  Age_08_04     KM  HP    cc  Doors  Cylinders  Gears  Weight  \
0  13500         23  46986  90  2000      3          4      5    1165   
1  13750         23  72937  90  2000      3          4      5    1165   
2  13950         24  41711  90  2000      3          4      5    1165   
3  14950         26  48000  90  2000      3          4      5    1165   
4  13750         30  38500  90  2000      3          4      5    1170   

   Fuel_Type_Diesel  Fuel_Type_Petrol  Automatic_1  
0              True             False        False  
1              True             False        False  
2              True             False        False  
3              True             False        False  
4              True             False        False  


In [42]:
# Features (X) and target variable (y)
X = data_encoded.drop(columns=['Price'])
y = data_encoded['Price']

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape of the resulting sets
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")


X_train shape: (1148, 11), X_test shape: (288, 11)
y_train shape: (1148,), y_test shape: (288,)


In [43]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 1165628.8132437498
R^2 Score: 0.912639791658401


In [44]:
# Feature importance
feature_importance = model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Sort by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)
print(importance_df)


             Feature  Importance
0          Age_08_04    0.840480
7             Weight    0.065424
1                 KM    0.065313
2                 HP    0.015454
4              Doors    0.005330
3                 cc    0.003515
9   Fuel_Type_Petrol    0.001736
8   Fuel_Type_Diesel    0.001056
10       Automatic_1    0.000863
6              Gears    0.000829
5          Cylinders    0.000000


In [None]:
#From the feature importance analysis of the multiple linear regression model,conclusions:
#Age of the Car is the most significant predictor of the car's price, with a feature importance score of 0.840480. 
#This suggests that as a car gets older, its price tends to decrease, which aligns with common market trends where newer cars are typically priced higher than older ones.
#Weight, KM (kilometers driven), and HP (horsepower) are also notable factors influencing the price, with moderate importance scores of 0.065424, 0.065313, and 0.015454, respectively. Heavier cars, with more kilometers driven or higher horsepower, tend to have a significant impact on pricing, though less so than the age.
#Doors, cc (engine size), Gears, and Cylinders contribute relatively little to the price prediction. These features have low feature importance scores (0.005330, 0.003515, 0.000829, and 0.000000). This suggests that the number of doors or engine specifications like cc and Cylinders are not as influential in predicting car prices in this dataset.
#Fuel Type (Diesel vs. Petrol) has very low importance, with scores of 0.001736 for Petrol and 0.001056 for Diesel. This indicates that the type of fuel the car uses does not play a major role in determining the price in this dataset.
#The Automatic_1 feature, which represents whether the car is automatic, also has a very low feature importance (0.000863), indicating that this attribute has minimal influence on the price in this analysis.