In [54]:
5. Prepare the Data for Machine Learning modelling

In [29]:
import pandas as pd
# pandas is aliased as pd
import numpy as np
# numpy is aliased as np
import matplotlib.pyplot as pt
# pyplot is aliased as plt
from sklearn.preprocessing import StandardScaler

In [64]:
# Check for missing values
car_data.isnull().sum()

name             0
year             0
selling_price    0
km_driven        0
fuel             0
seller_type      0
transmission     0
owner            0
dtype: int64

In [65]:
# Option 1: Drop rows with missing values
car_data.dropna(inplace=True)

In [39]:
# Load the dataset
import pandas as pd
car_data = pd.read_csv('CAR DETAILS (2).csv') 

In [38]:
car_data.head()


Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [40]:
# Option 1: Drop rows with missing values
car_data.dropna(inplace=True)

In [41]:
# Fill missing values only for numeric columns using the mean
car_data.fillna(car_data.select_dtypes(include=['number']).mean(), inplace=True)

In [42]:
# Step 1: Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [43]:
# Step 2: Load the dataset
file_path = 'CAR DETAILS (2).csv' 
df = pd.read_csv('CAR DETAILS (2).CSV')

In [44]:
# Step 3: One-hot encoding for categorical variables
df_encoded = pd.get_dummies(df, columns=['fuel', 'seller_type', 'transmission', 'owner'], drop_first=True)

In [45]:
# Step 4: Define features (X) and target (y)
X = df_encoded.drop(columns=['selling_price', 'name'])  # 'name' is dropped as it is not useful for ML modeling
y = df_encoded['selling_price']

In [46]:
# Step 5: Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# Step 6: Scale numerical features using StandardScaler
scaler = StandardScaler()

In [48]:
# Apply scaling to 'year' and 'km_driven' (keeping only numerical columns)
X_train[['year', 'km_driven']] = scaler.fit_transform(X_train[['year', 'km_driven']])
X_test[['year', 'km_driven']] = scaler.transform(X_test[['year', 'km_driven']])

In [49]:
# Step 7: Verify the preprocessed data
X_train.head(), X_test.head()

(          year  km_driven  fuel_Diesel  fuel_Electric  fuel_LPG  fuel_Petrol  \
 227   0.923421  -0.981140         True          False     False        False   
 964   1.161228  -0.341463         True          False     False        False   
 2045 -0.027808  -0.874527        False          False     False         True   
 1025 -0.503423   0.084989         True          False     False        False   
 4242  0.923421   0.127634         True          False     False        False   
 
       seller_type_Individual  seller_type_Trustmark Dealer  \
 227                     True                         False   
 964                     True                         False   
 2045                    True                         False   
 1025                    True                         False   
 4242                   False                         False   
 
       transmission_Manual  owner_Fourth & Above Owner  owner_Second Owner  \
 227                  True                       False

In [50]:
# Assuming 'df_encoded' is your dataset with encoded features and 'selling_price' is the target variable
X = df_encoded.drop(columns=['selling_price'])  # Features
y = df_encoded['selling_price']  # Target

In [51]:
# Step 1: Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [52]:
# Step 2: Scale numerical features using StandardScaler
scaler = StandardScaler()
X_train[['year', 'km_driven']] = scaler.fit_transform(X_train[['year', 'km_driven']])
X_test[['year', 'km_driven']] = scaler.transform(X_test[['year', 'km_driven']])

In [53]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor

# Define the models
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "Lasso Regression": Lasso(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
}

In [63]:

# Step 1: Drop the 'name' column if it's not needed for training
X_train = X_train.drop(columns=['name'])
X_test = X_test.drop(columns=['name'])

# Step 2: One-hot encode categorical features before training
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

# Step 3: Ensure that train and test have the same columns
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Initialize an empty dictionary for storing results
results = {}

# Step 4: Train and evaluate the models
for model_name, model in models.items():
    print(f"Training model: {model_name}")
    
    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Evaluate the model
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)

    # Store the results
    results[model_name] = {
        "Train MSE": mse_train,
        "Test MSE": mse_test,
        "Train R^2": r2_train,
        "Test R^2": r2_test
    }

# Results comparison
results_df = pd.DataFrame(results).T
print(results_df)


Training model: Linear Regression
Training model: Random Forest
Training model: Gradient Boosting
                      Train MSE      Test MSE  Train R^2  Test R^2
Linear Regression  1.843466e+11  1.849635e+11   0.460885  0.393901
Random Forest      2.592294e+10  1.547570e+11   0.924189  0.492883
Gradient Boosting  8.395802e+10  1.577834e+11   0.754468  0.482966


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Calculate MSE
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)

# Calculate R²
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

# Save results
results[model_name] = {
    "Train MSE": mse_train,
    "Test MSE": mse_test,
    "Train R^2": r2_train,
    "Test R^2": r2_test
}



In [None]:
# Step 5: Compare the results
results_df = pd.DataFrame(results).T
print(results_df)

In [None]:
# Step 6: Find the best model based on Test R^2 score
best_model = results_df['Test R^2'].idxmax()
print(f"Best model: {best_model}")
print(results_df.columns)

In [None]:
6. Apply coding on various Machine Learning techniques such as Regression or 
classification ,Bagging, Ensemble techniques and find out the
best model using various Machine Learning model evaluation  for jupyter notebook

In [None]:
1. Load and Clean the Data

In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
df = pd.read_csv('CAR DETAILS (2).csv')

# View the first few rows
df.head()

# Check for null values
df.isnull().sum()

# Drop or fill missing values
df.dropna(inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df['name'] = label_encoder.fit_transform(df['name'])
df['fuel'] = label_encoder.fit_transform(df['fuel'])
df['seller_type'] = label_encoder.fit_transform(df['seller_type'])
df['transmission'] = label_encoder.fit_transform(df['transmission'])
df['owner'] = label_encoder.fit_transform(df['owner'])

# Feature scaling
scaler = StandardScaler()
df[['km_driven']] = scaler.fit_transform(df[['km_driven']])

# Separate features and target
X = df.drop(columns=['selling_price'])
y = df['selling_price']

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load the dataset
df = pd.read_csv('CAR DETAILS (2).csv')

# View the first few rows
df.head()

# Check for null values
df.isnull().sum()

# Drop or fill missing values
df.dropna(inplace=True)

# Encode categorical variables
label_encoder = LabelEncoder()
df['name'] = label_encoder.fit_transform(df['name'])
df['fuel'] = label_encoder.fit_transform(df['fuel'])
df['seller_type'] = label_encoder.fit_transform(df['seller_type'])
df['transmission'] = label_encoder.fit_transform(df['transmission'])
df['owner'] = label_encoder.fit_transform(df['owner'])

# Feature scaling
scaler = StandardScaler()
df[['km_driven']] = scaler.fit_transform(df[['km_driven']])

# Separate features and target
X = df.drop(columns=['selling_price'])
y = df['selling_price']

In [58]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor()
}

# Dictionary to store model results
results = {}

# Train and evaluate models
for model_name, model in models.items():
    print(f"Training model: {model_name}")
    model.fit(X_train, y_train)
    
    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Evaluate the model
    mse_train = mean_squared_error(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_train = r2_score(y_train, y_pred_train)
    r2_test = r2_score(y_test, y_pred_test)
    
    # Save results
    results[model_name] = {'MSE Train': mse_train, 'MSE Test': mse_test, 'R2 Train': r2_train, 'R2 Test': r2_test}

# View the results
pd.DataFrame(results).T


Training model: Linear Regression
Training model: Random Forest
Training model: Gradient Boosting


Unnamed: 0,MSE Train,MSE Test,R2 Train,R2 Test
Linear Regression,184152600000.0,184332100000.0,0.461453,0.39597
Random Forest,10866770000.0,95937210000.0,0.968221,0.685627
Gradient Boosting,47498620000.0,107945500000.0,0.861092,0.646278


In [60]:
from sklearn.ensemble import BaggingRegressor

from sklearn.ensemble import BaggingRegressor, RandomForestRegressor

# Bagging with Random Forest
bagging_model = BaggingRegressor(estimator=RandomForestRegressor(), n_estimators=10, random_state=42)
bagging_model.fit(X_train, y_train)

# Predictions and evaluation


# Predictions and evaluation
y_pred_bagging = bagging_model.predict(X_test)
mse_bagging = mean_squared_error(y_test, y_pred_bagging)
r2_bagging = r2_score(y_test, y_pred_bagging)

print(f"Bagging MSE: {mse_bagging}, R2 Score: {r2_bagging}")

Bagging MSE: 99004528733.27812, R2 Score: 0.6755761184893123


In [61]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boosting
gbr_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr_model.fit(X_train, y_train)

# Predictions and evaluation
y_pred_gbr = gbr_model.predict(X_test)
mse_gbr = mean_squared_error(y_test, y_pred_gbr)
r2_gbr = r2_score(y_test, y_pred_gbr)

print(f"Gradient Boosting MSE: {mse_gbr}, R2 Score: {r2_gbr}")


Gradient Boosting MSE: 107950372397.86275, R2 Score: 0.6462618501201238


In [None]:
7. Save the best model and Load the model

In [None]:
# Check the columns of the dataset
print(data.columns)


In [None]:
For Scikit-learn:
Saving the Model:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor  # Example model
import joblib

# Load the dataset
data = pd.read_csv('CAR DETAILS (2).csv')

# Separate features and target (assuming 'price' is the target variable)
X = data.drop(columns=['Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')']
y = data['1'] 

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a model (Random Forest as an example)
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Save the trained model
joblib.dump(model, 'best_model.pkl')

# Optionally, load the saved model to confirm it works
loaded_model = joblib.load('best_model.pkl')


In [None]:
import joblib

# Assuming 'model' is your trained model
joblib.dump(model, 'best_model.pkl')


In [None]:
Loading the Model:

In [None]:
# Load the saved model
model = joblib.load('best_model.pkl')


In [None]:
For TensorFlow/Keras:
Saving the Model:

In [None]:
# Assuming 'model' is your trained model
model.save('best_model.h5')


In [None]:
Loading the Model:

In [None]:
from tensorflow.keras.models import load_model

# Load the saved model
model = load_model('best_model.h5')


In [None]:
8. Randomly pick 20 data points from the dataset and test the model

In [None]:
# Check the columns of the dataset
print(data.columns)


In [None]:
# Importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

# Load the dataset
data = pd.read_csv('CAR DETAILS (2).csv')

# Separate features and target (assuming 'selling_price' is the target variable)
X = data.drop(columns=['selling_price'])  # Features
y = data['selling_price']  # Target (selling_price)

# Preprocessing for categorical data
categorical_features = ['name', 'fuel', 'seller_type', 'transmission', 'owner']
numeric_features = ['year', 'km_driven']

# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ], remainder='passthrough')  # Remainder will pass through numeric columns 'year' and 'km_driven'

# Create a pipeline that first applies the preprocessor, then fits the model
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model_pipeline.fit(X_train, y_train)

# Save the trained model
joblib.dump(model_pipeline, 'best_model.pkl')

# Load the saved model to test
loaded_model = joblib.load('best_model.pkl')

# Test the model on the test set
predictions = loaded_model.predict(X_test)

# Print out the first 5 predictions for comparison
print("Sample predictions:", predictions[:5])
print("Actual prices:", y_test[:5].values)
