# Data Preprocessing

In [None]:
import numpy as np
import pandas as pd
import statistics as st

from google.colab import drive

# Mount Google Drive so the dataset stored there is reachable from the Colab VM.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Path to the raw car listing CSV inside the mounted Drive.
file_path = f'{DRIVE_MOUNT_POINT}/My Drive/Car Dataset ML Project/car_data1.csv'

# pandas is already imported as `pd` at the top of this cell, so it is not re-imported here.
df = pd.read_csv(file_path)
df.head()

In [None]:
# Summary statistics of the numeric columns. `describe` must be called:
# the bare attribute only displays the bound method, not the statistics.
df.describe()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Discard every row that has at least one missing value, then show the remaining (rows, columns).
df = df.dropna(axis=0, how='any')
df.shape

In [None]:
# Keep only the rows whose transmission is one of the two accepted labels.
valueskeep = ['MANUAL', 'AUTOMATIC']
has_accepted_transmission = df['Transmission'].isin(valueskeep)
df = df.loc[has_accepted_transmission]
df['Transmission'].unique()


In [None]:
# Distinct values of the 'Car Condition' column, in ascending order.
np.sort(df['Car Condition'].unique())

In [None]:
# Distinct values of the 'Year' column, in ascending order.
np.sort(df['Year'].unique())

In [None]:
# Distinct values of the 'Kilometers Driven' column, in ascending order.
np.sort(df['Kilometers Driven'].unique())

In [None]:
# Distinct values of the 'Selling Price' column (the prediction target used later), in ascending order.
np.sort(df['Selling Price'].unique())

In [None]:
# Preview the first rows after the missing-value and transmission filtering.
df.head()

In [None]:
# Remove the 'sno' column, then preview the result.
df = df.drop(columns=['sno'])
df.head()

In [None]:
# (rows, columns) remaining after dropping 'sno'.
df.shape

In [None]:
# Translate the ownership labels into integer codes in a single pass.
# The labels are first swapped for digit strings and the column is then cast to int,
# so 'Owner' ends up as a numeric column instead of an object column holding '1'/'2'/'3'.
# The cast raises ValueError if a label outside this mapping is present.
owner_codes = {'First Owner': '1', 'Second Owner': '2', 'Third Owner': '3'}
df['Owner'] = df['Owner'].replace(owner_codes).astype(int)
df.head()

In [None]:
# Distinct car model names before they are label-encoded.
df['Model'].unique()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each model name with the integer code assigned by a freshly fitted encoder.
le = LabelEncoder()
model_codes = le.fit_transform(df['Model'])
df['Model'] = model_codes
df.head()

In [None]:
from collections import Counter

# How many rows carry each encoded model value.
model_code_counts = Counter(df['Model'])
model_code_counts

In [None]:
# Column names, non-null counts and dtypes of the frame at this stage.
df.info()

In [None]:
# Distinct fuel type labels before they are label-encoded.
df['Fuel Type'].unique()

In [None]:
# Give each categorical column its own encoder. Re-fitting the shared `le` here would
# overwrite the Model classes it learned earlier, so the Model codes could no longer be
# mapped back to names. The integer codes written to the columns are unchanged.
fuel_type_encoder = LabelEncoder()
transmission_encoder = LabelEncoder()
df['Fuel Type'] = fuel_type_encoder.fit_transform(df['Fuel Type'])
df['Transmission'] = transmission_encoder.fit_transform(df['Transmission'])
df.head()


In [None]:
# Remove the 'Insurance expired' column from the frame.
df = df.drop(columns=['Insurance expired'])

In [None]:
# Write the preprocessed frame to disk without the index column.
output_csv = 'modified_data.csv'
df.to_csv(output_csv, index=False)

In [None]:
# Summary statistics of the preprocessed frame. `describe` must be called:
# the bare attribute only displays the bound method, not the statistics.
df.describe()



---



# Let's start applying ML models

## Linear Regression



---



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Baseline: ordinary least squares fitted on the preprocessed frame `df`.

# The selling price is the target; every other column is used as a predictor.
y = df['Selling Price']
X = df.drop(columns=['Selling Price'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit on the training rows (fit returns the estimator itself), then predict the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Score the held-out predictions.
lrmse = mean_squared_error(y_test, y_pred)
lrr2 = r2_score(y_test, y_pred)

print("Mean Squared Error (MSE):", lrmse)
print("R-squared score:", lrr2)


Mean Squared Error (MSE): 23438645459.60009
R-squared score: 0.5059594559404886


# Now let's use Support Vector Machines

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Uses the X_train / X_test / y_train / y_test split already present in the kernel.
# NOTE(review): the features are passed to SVR unscaled; presumably this contributes to
# the negative R-squared recorded below. Confirm by standardising the inputs.

# Support vector regression with a linear kernel, fitted on the training rows.
svr = SVR(kernel='linear').fit(X_train, y_train)

# Predictions for the held-out rows.
svr_predictions = svr.predict(X_test)

# Error metrics; the RMSE is the square root of the MSE.
svrmse = mean_squared_error(y_test, svr_predictions)
svrrmse = np.sqrt(svrmse)
svrr2 = r2_score(y_test, svr_predictions)

for metric_line in (
    f"Mean Squared Error: {svrmse}",
    f"Root Mean Squared Error: {svrrmse}",
    f"R-squared Score: {svrr2}",
):
    print(metric_line)


Mean Squared Error: 75731391659.64008
Root Mean Squared Error: 275193.37139480683
R-squared Score: -0.5962687776647178


# Using Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# The train/test split already present in the kernel is reused; no new split is made here.

# Decision tree regressor with a fixed seed, fitted on the training rows.
model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predictions for the held-out rows.
predictions = model.predict(X_test)

# Score the held-out predictions.
dtmse, dtr2 = mean_squared_error(y_test, predictions), r2_score(y_test, predictions)

print(f"Mean Squared Error (MSE): {dtmse}")
print(f"R-squared score: {dtr2}")


Mean Squared Error (MSE): 20961773414.846153
R-squared score: 0.55816704680427


# Conclusion after using 3 ML models for the Car Dataset

So far we have fitted three regression models to the same car dataset. Let's compare their mean squared error (MSE) and R-squared scores.

In [None]:
# Side-by-side recap of the metrics computed in the three model cells.
# The SVR cell stores its scores as `svrmse` / `svrr2`; the previous `svmmse` / `svmr2`
# names were never defined and raised NameError on a fresh kernel.
model_scores = [
    ("Linear Regression", lrmse, lrr2),
    ("Support Vector Machine", svrmse, svrr2),
    ("Decision Tree", dtmse, dtr2),
]

for model_label, model_mse, model_r2 in model_scores:
    print(f"\n\n{model_label}:")
    print(f"Mean Squared Error (MSE): {model_mse}")
    print(f"R-squared score: {model_r2}")



Linear Regression:
Mean Squared Error (MSE): 23438645459.60009
R-squared score: 0.5059594559404886


Support Vector Machine:
Mean Squared Error (MSE): 51378151529.21221
R-squared score: -0.08295037688995799


Decision Tree:
Mean Squared Error (MSE): 20961773414.846153
R-squared score: 0.55816704680427


# Observations:

The R-squared score measures the goodness of fit of a model, where higher values (closer to 1) indicate a better fit. The MSE represents the average squared difference between the predicted values and the actual values. Lower MSE values indicate better accuracy.

Based on these scores:

The **Decision Tree model** has the lowest MSE and the highest R-squared score of the three models, indicating better performance than Linear Regression and the Support Vector Machine on this dataset and train/test split.

In [None]:
# Rebinds `model` to a new, unfitted DecisionTreeRegressor with default settings.
# NOTE(review): this drops the reference to the tree fitted earlier; presumably a
# placeholder ahead of the deployment cell. Confirm it is still needed.
model=DecisionTreeRegressor()

# Deployment

In [None]:
# Preview the frame whose columns feed the deployed model.
df.head()

Unnamed: 0,Model,Selling Price,Kilometers Driven,Year,Owner,Fuel Type,Transmission,Car Condition,current price
0,250,312165.0,82238.0,2014.0,1,2,1,4.2,1374415.0
1,320,313799.0,30558.0,2013.0,1,1,1,4.4,1072043.0
2,172,295999.0,22164.0,2018.0,1,1,1,4.8,1351931.0
3,226,435199.0,30535.0,2013.0,1,0,1,4.3,840305.0
4,249,289099.0,15738.0,2013.0,1,1,1,4.3,861879.0


In [None]:

import pickle
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Retrain the decision tree from the saved preprocessed data and persist it for the app.
df = pd.read_csv('modified_data.csv')

# The selling price is the target; every other column is used as a predictor.
X = df.drop('Selling Price', axis=1)
y = df['Selling Price']

# The columns keep the dtypes read from the CSV. Casting the features to int would
# truncate fractional values (for example a Car Condition of 4.2 would become 4),
# so the pickled model would be trained on coarser inputs than the app sends it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

model = DecisionTreeRegressor(random_state=42)

# Training the model
model.fit(X_train, y_train)

# Write the fitted model to disk; the context manager closes the file even if dumping fails.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


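Here is the app code. Save it as `app.py` (in a notebook, for example, by putting `%%writefile app.py` on the first line of the code cell) so that the `streamlit run app.py` command further down can find it.

# Streamlit code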

In [None]:
import streamlit as st
import pickle
import numpy as np

# Load the trained model.
# NOTE: unpickling can execute arbitrary code, so only load a model.pkl you produced yourself.
# The context manager closes the file handle once the model has been read.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# Function to predict selling price
def predict_selling_price(features):
    """Return the loaded model's prediction for one car.

    ``features`` is a flat sequence of feature values. It is reshaped into a
    single-row 2-D array, passed to the module-level ``model``, and the first
    (only) element of the result is returned.
    """
    single_row = np.array(features).reshape(1, -1)
    return model.predict(single_row)[0]

def main():
    """Render the input form and, when Predict is pressed, show the predicted price.

    The regressor is trained on numeric codes, so every categorical choice is
    translated to its code before the feature list is handed to
    ``predict_selling_price``. Passing the raw option strings makes the
    prediction call fail.
    """
    # Owner labels are replaced by these numbers during preprocessing.
    owner_codes = {'First Owner': 1, 'Second Owner': 2, 'Third Owner': 3}
    # Transmission is label-encoded from 'AUTOMATIC' / 'MANUAL'; LabelEncoder numbers the
    # labels in sorted order.
    transmission_codes = {'Automatic': 0, 'Manual': 1}
    # NOTE(review): assumes the training data held exactly these three fuel labels, so that
    # sorted order yields CNG=0, Diesel=1, Petrol=2. Confirm against the fitted encoder.
    fuel_type_codes = {'CNG': 0, 'Diesel': 1, 'Petrol': 2}

    st.title('Car Selling Price Prediction')

    # Sidebar with input fields
    st.sidebar.header('Enter Car Details')
    # The Model column is label-encoded to integers during preprocessing and the placeholder
    # names had no code to map to, so the encoded ID is entered directly.
    # TODO: offer the real model names once the encoder's classes are saved with the model.
    # Named `car_model` so it does not shadow the loaded regressor `model`.
    car_model = st.sidebar.number_input(
        'Model',
        min_value=0,
        step=1,
        help='Integer code assigned to the car model during preprocessing',
    )
    kilometers_driven = st.sidebar.number_input('Kilometers Driven')
    year = st.sidebar.number_input('Year')
    owner = st.sidebar.selectbox('Owner', options=list(owner_codes))
    fuel_type = st.sidebar.selectbox('Fuel Type', options=['Petrol', 'Diesel', 'CNG'])
    transmission = st.sidebar.selectbox('Transmission', options=['Manual', 'Automatic'])
    car_condition = st.sidebar.number_input('Car Condition')  # Adjust the range as needed
    current_price = st.sidebar.number_input('Current Price')

    # Same order as the training columns once 'Selling Price' is removed.
    features = [
        car_model,
        kilometers_driven,
        year,
        owner_codes[owner],
        fuel_type_codes[fuel_type],
        transmission_codes[transmission],
        car_condition,
        current_price,
    ]

    # Make prediction
    if st.sidebar.button('Predict'):
        prediction = predict_selling_price(features)
        st.success(f'Predicted Selling Price: ${prediction:.2f}')

# Build the page only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


In [None]:
!streamlit run app.py --server.port 8501



Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.

  You can now view your Streamlit app in your browser.

  Network URL: http://172.28.0.12:8501
  External URL: http://34.30.9.131:8501


In [None]:
# Install pyngrok at a pinned version (5.0.0) for the tunnel cells that follow.
!pip install pyngrok==5.0.0



In [None]:
from pyngrok import ngrok

# Set up a tunnel to the Streamlit port 8501.
# pyngrok 5.x takes the local target as `addr`. With the older `port=` keyword the tunnel
# was opened to localhost:80 instead of the app's port.
public_url = ngrok.connect(addr='8501')
public_url


<NgrokTunnel: "http://cb1d-34-30-9-131.ngrok.io" -> "http://localhost:80">

In [None]:
from pyngrok import ngrok

# Shut down every tunnel that is currently open, addressing each by its public URL.
for active_tunnel in ngrok.get_tunnels():
    ngrok.disconnect(active_tunnel.public_url)
