<a href="https://colab.research.google.com/github/ujwalta/project/blob/main/MLproject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import joblib

# Load the dataset
car_data = pd.read_csv("car_price_prediction_.csv")
df = pd.DataFrame(car_data)
# Inspect the dataset
print(car_data.head())
print("Original Data Info:\n", df.info())
print(df.describe())
print("Original Data Sample:\n", df.head())

   Car ID  Brand  Year  Engine Size Fuel Type Transmission  Mileage Condition  \
0       1  Tesla  2016          2.3    Petrol       Manual   114832       New   
1       2    BMW  2018          4.4  Electric       Manual   143190      Used   
2       3   Audi  2013          4.5  Electric       Manual   181601       New   
3       4  Tesla  2011          4.1    Diesel    Automatic    68682       New   
4       5   Ford  2009          2.6    Diesel       Manual   223009  Like New   

      Price     Model  
0  26613.92   Model X  
1  14679.61  5 Series  
2  44402.61        A4  
3  86374.33   Model Y  
4  73577.10   Mustang  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car ID        2500 non-null   int64  
 1   Brand         2500 non-null   object 
 2   Year          2500 non-null   int64  
 3   Engine Size   2500 non-null   float64
 4  

In [12]:
# Rename columns for consistency
df.rename(columns=lambda x: x.strip().replace(" ", "_").lower(), inplace=True)

# Check for missing values and fill them
print("Missing Values:\n", df.isnull().sum())
df.ffill(inplace=True)

# Drop the 'Car ID' column
df = df.drop(columns=['car_id'])

# Verify the column is removed
print(df.head())

Missing Values:
 car_id          0
brand           0
year            0
engine_size     0
fuel_type       0
transmission    0
mileage         0
condition       0
price           0
model           0
dtype: int64
   brand  year  engine_size fuel_type transmission  mileage condition  \
0  Tesla  2016          2.3    Petrol       Manual   114832       New   
1    BMW  2018          4.4  Electric       Manual   143190      Used   
2   Audi  2013          4.5  Electric       Manual   181601       New   
3  Tesla  2011          4.1    Diesel    Automatic    68682       New   
4   Ford  2009          2.6    Diesel       Manual   223009  Like New   

      price     model  
0  26613.92   Model X  
1  14679.61  5 Series  
2  44402.61        A4  
3  86374.33   Model Y  
4  73577.10   Mustang  


In [13]:
# Remove duplicates
print("Duplicates: ", df.duplicated().sum())
df.drop_duplicates(inplace=True)

# Standardize categorical data
df['fuel_type'] = df['fuel_type'].str.strip().str.title()
df['transmission'] = df['transmission'].str.strip().str.capitalize()

# Remove outliers in the 'price' column
df = df[(df['price'] > 1000) & (df['price'] < 1000000)]

# Verify the cleaned data
print("Cleaned Data Info:\n", df.info())
print("Cleaned Data Sample:\n", df.head())

# Save the cleaned dataset
df.to_csv("cleaned_dataset.csv", index=False)

Duplicates:  0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         2500 non-null   object 
 1   year          2500 non-null   int64  
 2   engine_size   2500 non-null   float64
 3   fuel_type     2500 non-null   object 
 4   transmission  2500 non-null   object 
 5   mileage       2500 non-null   int64  
 6   condition     2500 non-null   object 
 7   price         2500 non-null   float64
 8   model         2500 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 175.9+ KB
Cleaned Data Info:
 None
Cleaned Data Sample:
    brand  year  engine_size fuel_type transmission  mileage condition  \
0  Tesla  2016          2.3    Petrol       Manual   114832       New   
1    BMW  2018          4.4  Electric       Manual   143190      Used   
2   Audi  2013          4.5  Electric       Manual   181601       New   
3  

In [15]:
current_year = 2025
df['car_age'] = current_year - df['year']

# Define features and target variable
X = df[['car_age']]
y = df['price']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Save the trained model
joblib.dump(model, 'car_price_predictor.pkl')

print("Model trained and saved successfully!")

Model trained and saved successfully!
