In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import the data
df = pd.read_csv(r"B:\EDNA\ML in PBI\car data.csv")
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB


In [4]:
from ydata_profiling import ProfileReport

In [5]:
profile = ProfileReport(df, title="Car Data Profiling Report", explorative=True)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 9/9 [00:00<00:00, 363.50it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
# Data Cleaning
print(df.columns)

Index(['Car_Name', 'Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
       'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner'],
      dtype='object')


In [7]:
#drop the car name column
df.drop(columns=['Car_Name'], inplace=True)

In [8]:
df.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [9]:
# Create a dummy variable from our categorical features
df = pd.get_dummies(df, columns=['Fuel_Type'])


In [10]:
df.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Seller_Type,Transmission,Owner,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol
0,2014,3.35,5.59,27000,Dealer,Manual,0,False,False,True
1,2013,4.75,9.54,43000,Dealer,Manual,0,False,True,False
2,2017,7.25,9.85,6900,Dealer,Manual,0,False,False,True
3,2011,2.85,4.15,5200,Dealer,Manual,0,False,False,True
4,2014,4.6,6.87,42450,Dealer,Manual,0,False,True,False


In [11]:
# Convert the dummy variables to integer type
dummy_cols = [col for col in df.columns if col.startswith('Fuel_Type_')]
df[dummy_cols] = df[dummy_cols].astype(int)
df.head()


Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Seller_Type,Transmission,Owner,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol
0,2014,3.35,5.59,27000,Dealer,Manual,0,0,0,1
1,2013,4.75,9.54,43000,Dealer,Manual,0,0,1,0
2,2017,7.25,9.85,6900,Dealer,Manual,0,0,0,1
3,2011,2.85,4.15,5200,Dealer,Manual,0,0,0,1
4,2014,4.6,6.87,42450,Dealer,Manual,0,0,1,0


In [13]:
categorical_cols = ['Seller_Type', 'Transmission']

df = pd.get_dummies(df, columns=categorical_cols, drop_first=False)

# Convert only the new dummy columns to 1/0
dummy_cols = [col for col in df.columns if any(col.startswith(c + '_') for c in categorical_cols)]
df[dummy_cols] = df[dummy_cols].astype(int)
df.head()

Unnamed: 0,Year,Selling_Price,Present_Price,Kms_Driven,Owner,Fuel_Type_CNG,Fuel_Type_Diesel,Fuel_Type_Petrol,Seller_Type_Dealer,Seller_Type_Individual,Transmission_Automatic,Transmission_Manual
0,2014,3.35,5.59,27000,0,0,0,1,1,0,0,1
1,2013,4.75,9.54,43000,0,0,1,0,1,0,0,1
2,2017,7.25,9.85,6900,0,0,0,1,1,0,0,1
3,2011,2.85,4.15,5200,0,0,0,1,1,0,0,1
4,2014,4.6,6.87,42450,0,0,1,0,1,0,0,1


In [14]:
# Model Building
x = df.drop(columns=['Selling_Price'])
y = df['Selling_Price']

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [22]:
# linear regression model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
# Evaluate the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"R2 Score: {r2}")



MAE: 1.2162256821303006
MSE: 3.4788039706475047
R2 Score: 0.8489813024894898


In [23]:
prediction_percentage = r2 * 100
print(f"Prediction Percentage: {prediction_percentage:.2f}%")


Prediction Percentage: 84.90%


In [None]:
# Cross Validation
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model, x, y, cv=5)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Average CV Score: {np.mean(cv_scores)}")


Cross-Validation Scores: [  0.87081422   0.78495686 -93.94826392   0.59365768   0.82692649]
Average CV Score: -18.17438173226645


In [25]:
# Save the model in a pickle file
import pickle
with open('car_price_model.pkl', 'wb') as f:
    pickle.dump(model, f)


In [29]:
# comparison of actual vs predicted
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison_df.head(10)
plt.figure(figsize=(10,6))
sns.scatterplot(x='Actual', y='Predicted', data=comparison_df)
plt.plot([comparison_df['Actual'].min(), comparison_df['Actual'].max()], [comparison_df['Actual'].min(), comparison_df['Actual'].max()], 'r--')
plt.xlabel('Actual Selling Price')
plt.ylabel('Predicted Selling Price')
plt.title('Actual vs Predicted Selling Price')
plt.savefig("kde_plot.png", dpi=300, bbox_inches='tight')

In [36]:
# auto ml approcah using h2o
!pip install h2o

Collecting h2o
  Downloading h2o-3.46.0.8-py2.py3-none-any.whl.metadata (2.1 kB)
Downloading h2o-3.46.0.8-py2.py3-none-any.whl (266.0 MB)
   ---------------------------------------- 0.0/266.0 MB ? eta -:--:--
   ---------------------------------------- 1.3/266.0 MB 7.3 MB/s eta 0:00:37
   ---------------------------------------- 2.9/266.0 MB 7.6 MB/s eta 0:00:35
    --------------------------------------- 4.7/266.0 MB 7.9 MB/s eta 0:00:33
    --------------------------------------- 6.3/266.0 MB 8.2 MB/s eta 0:00:32
   - -------------------------------------- 8.1/266.0 MB 8.0 MB/s eta 0:00:33
   - -------------------------------------- 9.7/266.0 MB 7.9 MB/s eta 0:00:33
   - -------------------------------------- 11.8/266.0 MB 8.5 MB/s eta 0:00:30
   -- ------------------------------------- 14.4/266.0 MB 8.9 MB/s eta 0:00:29
   -- ------------------------------------- 17.6/266.0 MB 9.7 MB/s eta 0:00:26
   --- ------------------------------------ 21.0/266.0 MB 10.4 MB/s eta 0:00:24
   ---

In [40]:
import h2o

h2o.init()
hf = h2o.H2OFrame(df)
hf['Selling_Price'] = hf['Selling_Price'].asfactor()
aml = h2o.automl.H2OAutoML(max_models=20, seed=1, max_runtime_secs=300)
aml.train(y='Selling_Price', training_frame=hf)
# View the AutoML Leaderboard
lb = aml.leaderboard
lb.head(rows=lb.nrows)  # Print all rows instead of default 10 rows

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...


H2OStartupError: Cannot find Java. Please install the latest JRE from
http://docs.h2o.ai/h2o/latest-stable/h2o-docs/welcome.html#java-requirements