In [None]:
import ast
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Load data from multiple Excel files
# Directory holding the per-city "<city>_cars.xlsx" workbooks. It defaults to the
# location originally hard-coded here; set the CARDEKHO_DATA_DIR environment
# variable to run the notebook elsewhere without editing the code.
DATA_DIR = os.environ.get(
    "CARDEKHO_DATA_DIR",
    "C:\\Users\\surej\\OneDrive\\Desktop\\project\\cardekhoproject",
)
cities = ['bangalore', 'chennai', 'delhi', 'hyderabad', 'jaipur', 'kolkata']
dfs = []

for city in cities:
    df = pd.read_excel(os.path.join(DATA_DIR, f"{city}_cars.xlsx"))
    # Tag every listing with its source city (capitalised, e.g. "Delhi").
    df['City'] = city.capitalize()
    dfs.append(df)

# One frame with a fresh 0..n-1 index across all cities.
all_cars_df = pd.concat(dfs, ignore_index=True)

# Improved data preprocessing for kilometers
def clean_kilometers(km_str):
    """Convert an odometer reading to a number.

    String inputs have the ' kms' suffix, thousands separators and surrounding
    whitespace removed before conversion. Anything that still cannot be parsed
    as a number is returned as NaN (``errors='coerce'``). Non-string inputs are
    passed to ``pd.to_numeric`` untouched.
    """
    if not isinstance(km_str, str):
        return pd.to_numeric(km_str, errors='coerce')

    without_unit = km_str.replace(' kms', '')
    without_commas = without_unit.replace(',', '')
    return pd.to_numeric(without_commas.strip(), errors='coerce')

# Feature extraction functions
def extract_engine_capacity(specs):
    """Return the 'Displacement' value from the 'Engine' section of a specs dict.

    ``specs['data']`` is scanned in order; within each section whose
    ``subHeading`` is 'Engine', the first ``list`` entry whose ``key`` is
    'Displacement' supplies the result (its ``value``, or '' if it has none).
    Returns '' when no such entry exists.
    """
    displacement_values = (
        entry.get('value', '')
        for block in specs.get('data', [])
        if block.get('subHeading') == 'Engine'
        for entry in block.get('list', [])
        if entry.get('key') == 'Displacement'
    )
    # The generator is lazy, so only the first match is ever evaluated.
    return next(displacement_values, '')

def extract_year(make_year):
    """Pull the first four-digit run (the year) out of ``make_year``.

    Args:
        make_year: Usually a string such as 'Jan 2015' or '2015'. Non-string
            values (e.g. an int year or None) are accepted as well: they are
            converted to text for the search instead of raising ``TypeError``.

    Returns:
        The four digits as a string when found; otherwise ``make_year`` itself,
        unchanged, so the caller can coerce/impute it later.
    """
    match = re.search(r'\d{4}', str(make_year))
    return match.group(0) if match else make_year

def convert_price(price_str):
    """Convert a price label such as '₹ 4.5 Lakh' to a float amount in rupees.

    The rupee sign, thousands separators and surrounding whitespace are removed
    and the text is lower-cased. The first number found is multiplied by 1e5
    when the text mentions 'lakh', by 1e7 when it mentions 'crore', and used
    as-is otherwise.

    Args:
        price_str: Price text. Non-string values are converted with ``str``.

    Returns:
        The price as a float, or NaN when the input is None or contains no
        number (previously this raised ``IndexError``, e.g. for the ''
        default used when a listing has no 'price' key).
    """
    if price_str is None:
        return float('nan')

    text = str(price_str).replace('₹', '').replace(',', '').strip().lower()
    amounts = re.findall(r'\d+\.?\d*', text)
    if not amounts:
        return float('nan')

    amount = float(amounts[0])
    if 'lakh' in text:
        return amount * 1e5
    if 'crore' in text:
        return amount * 1e7
    return amount

def extract_features(row):
    """Flatten one raw listing row into a dict of model features.

    The row's 'new_car_detail', 'new_car_overview' and 'new_car_specs' cells
    hold Python-literal dict strings; each is parsed with ``ast.literal_eval``.

    Returns:
        dict with keys fuel_type, body_type, kilometers, transmission,
        engine_capacity, make_year, owner_type and price.
    """
    detail, overview, specs = (
        ast.literal_eval(row[column])
        for column in ('new_car_detail', 'new_car_overview', 'new_car_specs')
    )

    # The year text is read from the first entry of the overview's 'top' list,
    # when that list is present and non-empty.
    if overview.get('top'):
        make_year = overview['top'][0].get('value', '')
    else:
        make_year = ''

    features = {}
    features['fuel_type'] = detail.get('ft', '')
    features['body_type'] = detail.get('bt', '')
    # Strip the unit and thousands separators; numeric conversion happens later.
    features['kilometers'] = detail.get('km', '').replace(' kms', '').replace(',', '')
    features['transmission'] = detail.get('transmission', '')
    features['engine_capacity'] = extract_engine_capacity(specs)
    features['make_year'] = extract_year(make_year)
    features['owner_type'] = detail.get('ownerNo', 0)
    features['price'] = convert_price(detail.get('price', ''))
    return features


# Expand every raw listing into one row of flat feature columns.
structured_data = all_cars_df.apply(extract_features, axis=1, result_type='expand')
structured_data['City'] = all_cars_df['City']

# Handling missing values in make_year
# Anything that is not a clean number becomes NaN and is replaced by the median year.
structured_data['make_year'] = pd.to_numeric(structured_data['make_year'], errors='coerce')
default_year = structured_data['make_year'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: that chained in-place update may act on a temporary copy and
# leave the DataFrame unchanged (pandas warns about it in recent versions).
structured_data['make_year'] = structured_data['make_year'].fillna(default_year).astype(int)

#print(structured_data.info()) 
#print(structured_data.dtypes)

# Check for missing values
#print("Missing values before handling:")
#print(structured_data.isnull().sum())

def cap_outliers_iqr(values, whisker=1.5):
    """Clip a numeric Series to ``[Q1 - whisker*IQR, Q3 + whisker*IQR]``.

    Args:
        values: pandas Series of numbers.
        whisker: Multiple of the inter-quartile range allowed beyond the
            quartiles (1.5 reproduces the usual 1.5*IQR rule).

    Returns:
        A new Series in which values outside the bounds are replaced by the
        nearest bound; values inside the bounds (and NaN) are left untouched.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    return values.clip(lower=q1 - whisker * iqr, upper=q3 + whisker * iqr)


structured_data['kilometers'] = structured_data['kilometers'].apply(clean_kilometers)

# Handle missing values with KNN imputation
# NOTE(review): only the kilometers column itself is given to the imputer, so
# there are no other features to measure neighbour distance on; this presumably
# degenerates to filling with the column mean - confirm this is intended.
imputer = KNNImputer(n_neighbors=5)
# fit_transform returns an (n, 1) array; flatten it to a plain 1-D column.
structured_data['kilometers'] = imputer.fit_transform(structured_data[['kilometers']]).ravel()

# Identify and cap outliers for kilometers and price using the 1.5*IQR rule
structured_data['kilometers'] = cap_outliers_iqr(structured_data['kilometers'])
structured_data['price'] = cap_outliers_iqr(structured_data['price'])

# One-hot encoding
#structured_data = pd.get_dummies(structured_data, columns=['City'], drop_first=True)

# Parse engine displacement into a real number BEFORE the categorical encoding
# below. The extracted value is text (presumably something like "1197 cc" -
# confirm against the data), so leaving it as-is gets it label-encoded, and the
# scaler is then fitted on arbitrary category codes while the prediction app
# feeds the same scaler a raw cc number.
engine_cc = pd.to_numeric(
    structured_data['engine_capacity']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.extract(r'(\d+(?:\.\d+)?)', expand=False),
    errors='coerce',
)
# Listings without a usable displacement get the median displacement.
structured_data['engine_capacity'] = engine_cc.fillna(engine_cc.median())

# Encode categorical variables
# One fitted encoder per text column, kept so the app can apply the same mapping.
categorical_columns = structured_data.select_dtypes(include=['object']).columns
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    structured_data[column] = encoder.fit_transform(structured_data[column])
    label_encoders[column] = encoder

# Normalize numerical features
scaler = MinMaxScaler()
structured_data[['kilometers', 'engine_capacity']] = scaler.fit_transform(structured_data[['kilometers', 'engine_capacity']])

# Create additional features
#structured_data['log_kilometers'] = np.log1p(structured_data['kilometers'])

# Exploratory Data Analysis (EDA)

# Histogram of the price column with a KDE overlay
price_hist_fig, price_hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(structured_data['price'], bins=30, kde=True, ax=price_hist_ax)
price_hist_ax.set_title('Price Distribution')
price_hist_ax.set_xlabel('Price')
price_hist_ax.set_ylabel('Frequency')
plt.show()

# Boxplot of the price column
price_box_fig, price_box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='price', data=structured_data, ax=price_box_ax)
price_box_ax.set_title('Price Boxplot')
price_box_ax.set_xlabel('Price')
plt.show()

# Correlation heatmap over all numeric columns; remaining gaps are filled with
# the column mean so every column takes part in the correlation.
numeric_data = structured_data.select_dtypes(include=[np.number])
numeric_data = numeric_data.fillna(numeric_data.mean())
correlation_matrix = numeric_data.corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=heatmap_ax)
heatmap_ax.set_title('Correlation Heatmap')
plt.show()

# Features / target
# A row without a target cannot be learned from; dropping is a no-op when every
# listing has a price.
model_data = structured_data.dropna(subset=['price'])
X = model_data.drop(columns=['price'])
y = model_data['price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Model training and optimization
# Seed the forest as well as the split, so re-running the notebook selects the
# same hyper-parameters and produces the same saved model.
rf_model = RandomForestRegressor(random_state=42)
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]}
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=4)  # Reduced n_jobs to 4
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_
y_pred_best_rf = best_rf_model.predict(X_test)

# Report hold-out performance; the predictions were previously computed but never used.
print("Best parameters:", grid_search.best_params_)
print("Optimized Random Forest MAE:", mean_absolute_error(y_test, y_pred_best_rf))
print("Optimized Random Forest MSE:", mean_squared_error(y_test, y_pred_best_rf))
print("Random Forest R-squared:", r2_score(y_test, y_pred_best_rf))

# Save the model and the encoders/scalers
# These three files are what the Streamlit app loads at start-up.
with open('best_rf_model.pkl', 'wb') as f:
    pickle.dump(best_rf_model, f)

with open('label_encoders.pkl', 'wb') as f:
    pickle.dump(label_encoders, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)


In [None]:
%%writefile cardekho.py
import streamlit as st
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Load the trained model and encoders/scalers
def _load_artifact(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # pickle executes code while loading: only point this at files you produced yourself.
    with open(path, 'rb') as f:
        return pickle.load(f)


model = _load_artifact('best_rf_model.pkl')
label_encoders = _load_artifact('label_encoders.pkl')
scaler = _load_artifact('scaler.pkl')

# Function to safely encode categorical inputs
def safe_transform(encoder, value):
    """Encode ``value`` with a fitted label encoder, tolerating unseen categories.

    Args:
        encoder: Fitted encoder exposing ``classes_`` and ``transform``.
        value: Category label to encode.

    Returns:
        The encoder's code for a known label. For a label the encoder has never
        seen, ``len(encoder.classes_)`` - one past the largest real code - is
        returned as an "unknown" bucket.

    The encoder is never modified. Earlier this function appended the unseen
    label to ``encoder.classes_``, permanently changing the fitted encoder as a
    side effect of a lookup; the code returned for the first unseen label is the
    same as before.
    """
    known_labels = list(encoder.classes_)
    if value in known_labels:
        return encoder.transform([value])[0]
    return len(known_labels)

# Map owner types to numeric values
owner_type_mapping = {'First': 1, 'Second': 2, 'Third': 3, 'Fourth & Above': 4}


def category_options(column):
    """Return the non-blank categories known to the fitted encoder for ``column``.

    Offering exactly the labels the encoder was fitted on keeps the choices in
    step with the training data. The previously hard-coded City list, for
    example, offered 'Mumbai' (not one of the training cities) and left out
    'Jaipur' and 'Kolkata'.
    """
    return [label for label in label_encoders[column].classes_ if str(label).strip()]


# Streamlit app for prediction
st.title("Used Car Price Predictor")
st.header("Enter the details of the car:")


# Input fields
fuel_type = st.selectbox('Fuel Type', category_options('fuel_type'))
body_type = st.selectbox('Body Type', category_options('body_type'))
kilometers = st.number_input('Kilometers Driven', min_value=0)
transmission = st.selectbox('Transmission', category_options('transmission'))
engine_capacity = st.number_input('Engine Capacity (cc)', min_value=0)
make_year = st.number_input('Make Year', min_value=1950, max_value=2024)
owner_type = st.selectbox('Owner Type', ['First', 'Second', 'Third', 'Fourth & Above'])
city = st.selectbox('City', category_options('City'))

# Encode and scale input data using the encoders and scalers from the training phase
# Column order mirrors the training feature frame.
input_data = pd.DataFrame({
    'fuel_type': [safe_transform(label_encoders['fuel_type'], fuel_type)],
    'body_type': [safe_transform(label_encoders['body_type'], body_type)],
    'kilometers': [kilometers],
    'transmission': [safe_transform(label_encoders['transmission'], transmission)],
    'engine_capacity': [engine_capacity],
    'make_year': [make_year],
    'owner_type': [owner_type_mapping[owner_type]],
    'City': [safe_transform(label_encoders['City'], city)]
})

# Normalize the numerical features
input_data[['kilometers', 'engine_capacity']] = scaler.transform(input_data[['kilometers', 'engine_capacity']])

# Make predictions
if st.button("Predict Price"):
    prediction = model.predict(input_data)
    st.success(f"The estimated price of the car is: ₹{round(prediction[0], 2)}")


In [None]:
# Launch the Streamlit app that the %%writefile cell above saved as cardekho.py.
!streamlit run cardekho.py