In [1]:
import torch

# Report the CUDA environment and pick the compute device once.
cuda_available = torch.cuda.is_available()
print("Is CUDA available: ", cuda_available)
print("CUDA device count: ", torch.cuda.device_count())
if cuda_available:
    # get_device_name(0) raises when no CUDA device is present, so it is only
    # queried when CUDA is actually available.
    print("CUDA device name: ", torch.cuda.get_device_name(0))
    # Release cached allocator memory left over from earlier runs in this kernel.
    torch.cuda.empty_cache()
# Keep the selected device in a name; the bare torch.device(...) expression
# previously built the object and threw it away.
device = torch.device("cuda" if cuda_available else "cpu")

Is CUDA available:  True
CUDA device count:  1
CUDA device name:  NVIDIA GeForce GTX 1050


In [None]:
import pandas as pd
import background
import os

def combine_excel_files(folder_path):
    """Load every ``.xlsx`` workbook in ``folder_path`` into one DataFrame.

    Each workbook is read with ``pd.read_excel``, passed through
    ``background.structured_data`` and given a ``city`` column holding the
    file name without its extension. The columns of the first workbook (in
    sorted file-name order) are the reference schema that every workbook
    must provide.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.xlsx`` files.

    Returns:
        The row-wise concatenation of all workbooks, restricted to the
        reference columns plus ``city`` and carrying a fresh integer index.
        An empty DataFrame is returned when the folder contains no ``.xlsx``
        file or when the first workbook yields no columns.

    Raises:
        ValueError: If a workbook lacks one of the reference columns.
    """
    # os.listdir returns names in arbitrary order; sorting makes both the
    # reference workbook and the resulting row order the same on every run.
    excel_files = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.xlsx')
    )

    dataframes = []
    first_df_columns = None

    for filename in excel_files:
        file_path = os.path.join(folder_path, filename)
        # Each workbook is read and structured exactly once.
        df = background.structured_data(pd.read_excel(file_path))

        if first_df_columns is None:
            # Capture the schema before 'city' is added to the frame.
            first_df_columns = df.columns.tolist()
            if not first_df_columns:
                return pd.DataFrame()

        df['city'] = os.path.splitext(filename)[0]

        for col in first_df_columns + ['city']:
            if col not in df.columns:
                raise ValueError(f"Column {col} missing in file {filename}")

        dataframes.append(
            df[first_df_columns + ['city']].reset_index(drop=True)
        )

    if not dataframes:
        return pd.DataFrame()
    return pd.concat(dataframes, ignore_index=True)

# Build the combined table from every workbook under the dataset folder.
folder_path = 'dataset'
result = combine_excel_files(folder_path)

# Where a column label occurs more than once, keep only its first occurrence.
repeated_label_mask = result.columns.duplicated()
duplicate_columns = result.columns[repeated_label_mask].tolist()
if len(duplicate_columns) > 0:
    result = result.loc[:, ~repeated_label_mask]

In [None]:
# Per-column summary: number of distinct non-null values and the distinct
# values themselves.
unique_info = {
    col: {
        'unique_count': result[col].nunique(),
        'unique_values': result[col].unique().tolist(),
    }
    for col in result.columns
}

print("columns droped are")
# A column with exactly one distinct value is removed from `result` in place.
single_valued = [
    col for col, info in unique_info.items() if info['unique_count'] == 1
]
for col in single_valued:
    print(f"Column: {col}")
    result.drop(columns=[col], inplace=True)
    print()

In [None]:
def check_missing_values(df, threshold_percentage=0.1):
    """Drop columns in which a missing-value marker is too frequent.

    A column is dropped when the fraction of its rows matching any ONE of the
    following checks exceeds ``threshold_percentage`` (each check is compared
    against the threshold on its own; the fractions are not summed):

    * null / NaN cells,
    * empty strings,
    * strings made only of whitespace,
    * each of the literals ``'N/A'``, ``'NA'``, ``'null'``, ``'NULL'``.

    Args:
        df: Frame to clean. It is modified in place.
        threshold_percentage: Fraction in the range 0-1 (despite the name);
            a column is dropped when a check's fraction is strictly greater.

    Returns:
        The same ``df`` object, with the flagged columns removed.
    """
    columns_to_drop = set()

    def _flag(fractions):
        # Record every column whose fraction is strictly above the threshold.
        columns_to_drop.update(
            fractions[fractions > threshold_percentage].index
        )

    _flag(df.isnull().mean())
    _flag((df == '').mean())

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; use whichever this pandas version provides. The check is
    # made on the class so a column that happens to be named 'map' cannot
    # shadow the method lookup.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    _flag(elementwise(lambda x: isinstance(x, str) and x.isspace()).mean())

    for value in ('N/A', 'NA', 'null', 'NULL'):
        _flag((df == value).mean())

    df.drop(columns=columns_to_drop, inplace=True)
    print("\nColumns dropped:", columns_to_drop)
    return df
# Remove columns dominated by missing-value markers, using the default
# threshold of check_missing_values.
result = check_missing_values(result)

In [None]:
# Columns removed by name from `result` (in place).
manual_drop_columns = [
    'oem',
    'misc_Seating Capacity',
    'ft',
    'kms_driven',
    'year_of_manufacture',
    'ownership',
    'owner',
    'engine',
    'engine_displacement',
    'registration_year',
    'engine_Max Power',
    'engine_Max Torque',
]
result.drop(columns=manual_drop_columns, inplace=True)

In [None]:
def convert_price(price):
    """Convert a price cell to a float amount.

    The rupee sign, thousands separators and surrounding whitespace are
    stripped from text input. A value containing ``'Lakh'`` is multiplied by
    100000 and one containing ``'Crore'`` by 10000000; anything else is
    parsed as a plain number.

    Args:
        price: A price string such as ``'₹ 4.5 Lakh'``, or an already numeric
            value, or a missing value (``None`` / NaN).

    Returns:
        The amount as a float. ``None`` and NaN yield NaN; numeric input is
        returned as a float unchanged.

    Raises:
        ValueError: If the remaining text cannot be parsed as a number.
    """
    # Missing and already-numeric cells would otherwise fail on `.replace`,
    # and this conversion runs before the notebook drops incomplete rows.
    if price is None:
        return float('nan')
    if isinstance(price, (int, float)):
        return float(price)

    text = str(price).replace('₹', '').replace(',', '').strip()

    # Checked in the same order as before: 'Lakh' first, then 'Crore'.
    for unit, multiplier in (('Lakh', 100000), ('Crore', 10000000)):
        if unit in text:
            return float(text.replace(unit, '').strip()) * multiplier
    return float(text)

# Convert the price column to numbers, then swap it in for the original
# column; the numeric column ends up as the last column of `result`.
df = result['price'].map(convert_price)
result.drop(columns=['price'], inplace=True)
result = pd.concat([result, df], axis='columns')


In [None]:
# Drop rows with any missing value. The explicit copy makes `result` an
# independent frame, so the column assignments below cannot trigger pandas'
# chained-assignment warning.
result = result.dropna().copy()

# Raw strings keep the regex escapes literal; in a plain string '\d' is an
# invalid escape sequence that newer Python versions warn about.
INTEGER_PATTERN = r'(\d+)'
DECIMAL_PATTERN = r'(\d+\.\d+|\d+)'

# Columns whose first run of digits is kept as an integer.
INTEGER_TEXT_COLUMNS = [
    'seats',
    'dimension_Length',
    'dimension_Width',
    'dimension_Height',
    'dimension_Wheel Base',
    'dimension_Kerb Weight',
]
# Columns whose first (optionally decimal) number is kept as a float, after
# commas are removed.
DECIMAL_TEXT_COLUMNS = ['mileage', 'max_power', 'torque']

for column in INTEGER_TEXT_COLUMNS:
    # A cell with no digits extracts to NaN, which makes astype(int) raise,
    # as it did before.
    result[column] = (
        result[column].str.extract(INTEGER_PATTERN, expand=False).astype(int)
    )

for column in DECIMAL_TEXT_COLUMNS:
    result[column] = (
        result[column]
        .str.replace(',', '')
        .str.extract(DECIMAL_PATTERN, expand=False)
        .astype(float)
    )

In [None]:
# Odometer text: remove commas and surrounding whitespace, then cast to int.
km_text = result['km'].str.replace(',', '').str.strip()
result['km'] = km_text.astype(int)

# These two columns are cast to int directly.
for column in ('engine_Displacement', 'misc_No Door Numbers'):
    result[column] = result[column].astype(int)

In [None]:
# Persist the cleaned table to fulldata.csv (without the index column); the
# next cell reads this file back.
result.to_csv('fulldata.csv',index=False)

In [None]:
# Reload the cleaned table from fulldata.csv into df1.
df1=pd.read_csv('fulldata.csv')

In [None]:
# Keep ten feature columns followed by the 'price' target, in this order.
selected_columns = [
    'transmission',
    'modelYear',
    'misc_Gear Box',
    'city',
    'insurance_validity',
    'ownerNo',
    'km',
    'bt',
    'mileage',
    'fuel_type',
    'price',
]
df1 = df1[selected_columns]

In [None]:
# Display names for the model's input features; written to my_list.txt in the
# next cell. NOTE(review): the order looks like it mirrors the ten feature
# columns selected for df1 (transmission ... fuel_type) — confirm against the app.
top_features=['Transmission','Model_Year','Gear_Box','city','Insurance_Validity','No_Of_Owners','km_driven','Body_Type','Mileage','Fuel_Type']

In [None]:
my_list = top_features

# One feature name per line, each terminated by a newline.
with open('car_dheko_app/model/my_list.txt', 'w') as f:
    f.writelines(f"{item}\n" for item in my_list)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

target_column = 'price'

# Every column except the target gets its own box plot, stacked vertically.
columns_to_plot = [col for col in df1.columns if col != target_column]

fig, axes = plt.subplots(
    len(columns_to_plot),
    1,
    figsize=(15, len(columns_to_plot) * 4),
    squeeze=False,  # always a 2-D array of axes, even with a single row
)

for ax, col in zip(axes.ravel(), columns_to_plot):
    sns.boxplot(data=df1, y=col, ax=ax)
    ax.set_title(f"Box Plot of {col}")

fig.tight_layout()
plt.show()


In [None]:
# Keep rows whose mileage is more than 1 below the current maximum.
# Not idempotent: re-running this cell trims against the new maximum.
mileage_cutoff = df1['mileage'].max() - 1
df1 = df1.loc[df1['mileage'] < mileage_cutoff]

In [None]:
# Keep rows whose km is more than 1 below the current maximum.
# Not idempotent: re-running this cell trims against the new maximum.
km_cutoff = df1['km'].max() - 1
df1 = df1.loc[df1['km'] < km_cutoff]

In [None]:
# Tukey fences on modelYear: keep rows within 1.5 * IQR of the quartiles
# (both bounds inclusive).
model_year = df1['modelYear']
Q1, Q3 = model_year.quantile(0.25), model_year.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

df1 = df1[model_year.between(lower_bound, upper_bound)]

In [None]:
# Summary statistics of df1 after the outlier filters above.
df1.describe()

In [None]:
# For each categorical column, write its distinct values (in order of first
# appearance) one per line to a text file under car_dheko_app/model/.
# A single table-driven loop replaces six copy-pasted write blocks; the files
# and their contents are unchanged.
category_files = {
    'transmission': 'transmission.txt',
    'misc_Gear Box': 'Gear_Box.txt',
    'city': 'city.txt',
    'bt': 'bodytype.txt',
    'insurance_validity': 'insurance.txt',
    'fuel_type': 'fuel.txt',
}

for column, file_name in category_files.items():
    list1 = df1[column].unique()
    with open(f'car_dheko_app/model/{file_name}', 'w') as f:
        for item in list1:
            f.write(f"{item}\n")

In [None]:
# Split df1 by dtype: numeric columns on one side, everything else on the other.
numerical = df1.select_dtypes(include='number')
categorical = df1.select_dtypes(exclude='number')

In [None]:
from sklearn.preprocessing import LabelEncoder
import joblib

# One fitted LabelEncoder per categorical column, keyed by column name.
label_encoders = {}

# Encode into a copy. A plain `df2 = categorical` binds both names to the
# same frame, so encoding would overwrite `categorical` with integer codes
# and a re-run of this cell would fit the encoders on those codes instead of
# the original labels.
df2 = categorical.copy()

for column in df2.columns:
    label_encoder = LabelEncoder()
    df2[column] = label_encoder.fit_transform(categorical[column])

    label_encoders[column] = label_encoder

# Persist the fitted encoders next to the other model artefacts.
joblib.dump(label_encoders, 'car_dheko_app/model/label_encoders.pkl')

In [None]:
# Recombine the encoded categorical columns with the numeric ones (aligned on
# the index) and restore the feature order, with 'price' last.
# The former `df=pd.DataFrame()` placeholder was overwritten immediately and
# has been removed.
model_columns = [
    'transmission',
    'modelYear',
    'misc_Gear Box',
    'city',
    'insurance_validity',
    'ownerNo',
    'km',
    'bt',
    'mileage',
    'fuel_type',
    'price',
]
df = pd.concat([df2, numerical], axis=1)[model_columns]
df

In [None]:
# Divide price by 100000 (one lakh) and give the frame a fresh 0..n-1 index.
# Not idempotent: re-running this cell divides the price again.
df['price'] = df['price'].div(100000)
df.index = pd.RangeIndex(len(df))

In [None]:
from pycaret.regression import * 

target_column = 'price'

# session_id fixes PyCaret's random state so the internal train/test split
# and the resulting model ranking are reproducible between runs.
reg_setup = setup(
    data=df,
    target=target_column,
    preprocess=False,
    verbose=False,
    session_id=42,
)

# Regression models are ranked by R2 (PyCaret's default sort metric, stated
# explicitly here); the message below previously claimed RMSE.
best_model = compare_models(sort='R2')

print("Best model based on R2:")
print(best_model)

In [None]:
# Residual plot for the model selected by compare_models.
plot_model(best_model, plot='residuals') 

In [None]:
# Show the hyper-parameters of the selected estimator.
best_model_params = best_model.get_params()
print("Parameters used for the best model:", best_model_params, sep="\n")

In [None]:
from sklearn.model_selection import train_test_split
# Separate target and features. pop() removes 'price' from df in place and
# returns it, so X is df itself without the target column.
# Not idempotent: a second run fails because 'price' is already gone.
y = df.pop('price')
X = df
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

# Search space for the selected estimator.
# NOTE(review): these are tree-ensemble parameter names (random forest /
# extra trees); this assumes compare_models picked such a model — confirm.
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 6, 7],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' was removed from scikit-learn's forest estimators; for regressors
    # it meant "use all features", which is what the fraction 1.0 expresses.
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False],
}

grid_search = GridSearchCV(
    estimator=best_model,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=0,
    scoring='r2',
)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best R² Score:", grid_search.best_score_)

# Evaluate the refitted best estimator on the held-out test split and report
# the score (it was previously computed but never shown).
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
r2_score_final = r2_score(y_test, y_pred)
print("Test R² Score:", r2_score_final)

In [None]:
import pickle

# Serialise the tuned estimator to model.pkl.
with open('car_dheko_app/model/model.pkl', mode='wb') as model_file:
    pickle.dump(best_model, model_file)