# Importing necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np

# Load the datasets

In [2]:
# Read the train and test splits from the Kaggle input directory, in that order.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/kagglex-dataset/train.csv',
        '/kaggle/input/kagglex-dataset/test.csv',
    )
)

# Display the first few rows of the training data

In [3]:
# Plain-text preview of the first five training rows (five is also head()'s default).
print(train_df.head(n=5))

   id    brand          model  model_year  milage fuel_type  \
0   0     Ford   F-150 Lariat        2018   74349  Gasoline   
1   1      BMW          335 i        2007   80000  Gasoline   
2   2   Jaguar      XF Luxury        2009   91491  Gasoline   
3   3      BMW   X7 xDrive40i        2022    2437    Hybrid   
4   4  Pontiac  Firebird Base        2001  111000  Gasoline   

                                              engine  \
0      375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel   
1  300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
2       300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel   
3  335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
4      200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col       accident clean_title  \
0                    10-Speed A/T    Blue    Gray  None reported         Yes   
1                     6-Speed M/T   Black   Black  None reported         Yes   
2                     6-Speed A/T  Purple   

# Check for missing values in the training data


In [4]:
# Per-column count of missing (NaN/None) entries in the training frame.
missing_values = train_df.isna().sum()
print(f'Missing values:\n{missing_values}')


Missing values:
id              0
brand           0
model           0
model_year      0
milage          0
fuel_type       0
engine          0
transmission    0
ext_col         0
int_col         0
accident        0
clean_title     0
price           0
dtype: int64


# Define the features and target

In [5]:
# 'price' is the prediction target; it and the 'id' column are kept out of the features.
y = train_df['price']
X = train_df.drop(columns=['id', 'price'])

# Preprocessing for numerical data

In [6]:
# Numeric columns are standardised before modelling.
numerical_transformer = StandardScaler()
# 'milage' is the dataset's own spelling of the column name; it must match exactly.
numerical_features = ['model_year', 'milage']

# Preprocessing for categorical data

In [7]:
# handle_unknown='ignore': categories not seen during fit are encoded as all
# zeros at transform time instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
# NOTE(review): 'clean_title' is present in the data but is not listed here;
# confirm that leaving it out of the model is intentional.
categorical_features = [
    'brand',
    'model',
    'fuel_type',
    'engine',
    'transmission',
    'ext_col',
    'int_col',
    'accident',
]

# Bundle preprocessing for numerical and categorical data

In [8]:
# Route each column group to its transformer. Each entry is
# (step name, transformer, columns it is applied to).
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Define a function to evaluate a model

In [9]:
def evaluate_model(model, X, y):
    """Cross-validate ``model`` behind the shared preprocessor and report its RMSE.

    The estimator is chained after the module-level ``preprocessor`` in a
    Pipeline, so the preprocessing is fitted together with the model on each
    training fold during cross-validation.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn regressor used as the final pipeline step.
    X : feature table containing the columns the preprocessor selects.
    y : target values aligned with ``X``.

    Returns
    -------
    The mean of the five per-fold RMSE values. The per-fold scores and their
    mean are also printed.
    """
    estimator = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
    neg_mse_per_fold = cross_val_score(
        estimator, X, y, cv=5, scoring='neg_mean_squared_error'
    )
    # The scorer returns negated MSE (higher is better), so flip the sign
    # before taking the square root.
    rmse_scores = np.sqrt(-neg_mse_per_fold)
    mean_rmse = np.mean(rmse_scores)
    print(f'RMSE scores: {rmse_scores}')
    print(f'Mean RMSE: {mean_rmse}')
    return mean_rmse

# Evaluate Random Forest Regressor

In [10]:
# Random forest baseline: 100 trees, with a fixed random_state so the forest
# is reproducible from run to run.
rf_rmse = evaluate_model(
    RandomForestRegressor(n_estimators=100, random_state=0),
    X,
    y,
)


RMSE scores: [77922.48638102 79020.32304205 55058.66716704 85122.12026808
 69563.13081331]
Mean RMSE: 73337.3455343


# Evaluate Gradient Boosting Regressor

In [11]:
# Gradient boosting candidate: 100 boosting stages, fixed random_state.
print("Gradient Boosting Regressor:")
gb_rmse = evaluate_model(
    GradientBoostingRegressor(n_estimators=100, random_state=0),
    X,
    y,
)


Gradient Boosting Regressor:
RMSE scores: [74225.34177786 75412.58181266 51901.50063245 82354.74282158
 68259.7543952 ]
Mean RMSE: 70430.78428794902


# Evaluate Linear Regression

In [12]:
# Ordinary least squares on the same preprocessed features, as a linear reference point.
print("Linear Regression:")
lr_rmse = evaluate_model(
    LinearRegression(),
    X,
    y,
)

Linear Regression:
RMSE scores: [81289.64799216 76749.62550847 74183.37805587 83149.63186402
 76370.30979162]
Mean RMSE: 78348.51864242798


# Compare RMSE scores

In [13]:
# Side-by-side summary of the mean cross-validated RMSE per model (lower is better).
print(
    f'Random Forest RMSE: {rf_rmse}',
    f'Gradient Boosting RMSE: {gb_rmse}',
    f'Linear Regression RMSE: {lr_rmse}',
    sep='\n',
)

Random Forest RMSE: 73337.3455343
Gradient Boosting RMSE: 70430.78428794902
Linear Regression RMSE: 78348.51864242798


# Perform hyperparameter tuning for the Random Forest model (note: Gradient Boosting, not Random Forest, had the lowest mean RMSE in the comparison above)

In [14]:
# Search space for the pipeline's 'model' step. The 'model__' prefix routes
# each setting to that step. 3 x 2 x 2 = 12 combinations in total.
param_grid = {
    'model__max_depth': [None, 10, 20],   # None leaves tree depth unlimited
    'model__min_samples_split': [2, 5],
    'model__n_estimators': [100, 200],
}

In [15]:
# Preprocessing followed by the forest. The step name 'model' is what the
# 'model__' prefixes in the parameter grid refer to.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestRegressor(random_state=0)),
    ]
)


In [16]:
# Exhaustive search over param_grid, scored by negated MSE with 5-fold CV.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X, y)

# Print the best parameters and best RMSE score

In [17]:
# best_score_ is the mean negated MSE across folds, so this prints
# sqrt(mean MSE). evaluate_model reports the mean of per-fold RMSEs instead,
# so the two figures are computed slightly differently.
print(
    f'Best parameters: {grid_search.best_params_}',
    f'Best RMSE: {np.sqrt(-grid_search.best_score_)}',
    sep='\n',
)

Best parameters: {'model__max_depth': 10, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best RMSE: 72336.02296823014


# Fit the best model on the entire training data

In [18]:
# GridSearchCV defaults to refit=True, so best_estimator_ has already been
# refit on the full (X, y) with the winning hyperparameters. Calling .fit()
# again would only repeat that training run for an identical model (the
# forest's random_state is fixed), so the extra fit is omitted.
best_model = grid_search.best_estimator_
# Bare expression so the cell still displays the fitted pipeline.
best_model

# Predict on the test data

In [19]:
# Drop 'id' to mirror how the training features were built. The fitted
# pipeline runs its own preprocessing step on the test features before predicting.
test_X = test_df.drop(columns='id')
test_preds = best_model.predict(test_X)

# Create a submission file

In [20]:
# Two-column frame pairing each test id with its predicted price, written
# to CSV without the pandas index.
submission = pd.DataFrame(
    {
        'id': test_df['id'],
        'price': test_preds,
    }
)
submission.to_csv('submission.csv', index=False)


In [21]:
# Fixed confirmation message; it is printed unconditionally and does not itself
# check that the CSV was written.
print('Submission file created successfully.')

Submission file created successfully.
