In [1]:
# Load and inspect the data

import pandas as pd

# All three competition files sit in the same directory; naming it once keeps
# the reads below from drifting apart.
DATA_DIR = '/kaggle/input/kagglexfellowship/kagglex-cohort4'

# Load the data
train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Inspect the data
print(train.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
train.info()
print(train.describe())

print(test.head())
test.info()


   id    brand          model  model_year  milage fuel_type  \
0   0     Ford   F-150 Lariat        2018   74349  Gasoline   
1   1      BMW          335 i        2007   80000  Gasoline   
2   2   Jaguar      XF Luxury        2009   91491  Gasoline   
3   3      BMW   X7 xDrive40i        2022    2437    Hybrid   
4   4  Pontiac  Firebird Base        2001  111000  Gasoline   

                                              engine  \
0      375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel   
1  300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
2       300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel   
3  335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
4      200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel   

                     transmission ext_col int_col       accident clean_title  \
0                    10-Speed A/T    Blue    Gray  None reported         Yes   
1                     6-Speed M/T   Black   Black  None reported         Yes   
2                     6-Speed A/T  Purple   

In [2]:
# Preprocess the data
#
# Imputes missing values, label-encodes the categorical columns and
# standardises the numerical columns of `train` and `test` in place.
# Every statistic (median, mode, category codes, scaling parameters) is
# learned from `train` only and then applied to `test`, so both frames go
# through exactly the same transformation.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

numerical_columns = ['milage', 'model_year']
categorical_columns = ['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col', 'int_col', 'accident', 'clean_title']

# Handle missing values
# Fill numerical columns with the training median
train_medians = train[numerical_columns].median()
train[numerical_columns] = train[numerical_columns].fillna(train_medians)
test[numerical_columns] = test[numerical_columns].fillna(train_medians)

# Fill categorical columns with the training mode
for col in categorical_columns:
    train_mode = train[col].mode()[0]
    train[col] = train[col].fillna(train_mode)
    test[col] = test[col].fillna(train_mode)

# Encode categorical variables
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    # LabelEncoder assigns each class its position in `classes_`, so a plain
    # dict lookup reproduces `le.transform` without one call per row.
    code_by_category = {category: code for code, category in enumerate(le.classes_)}
    # Categories that never occurred in train have no code; mark them with -1.
    test[col] = test[col].map(code_by_category).fillna(-1).astype(int)
    label_encoders[col] = le

# Scale numerical features (parameters fitted on train, reused for test)
scaler = StandardScaler()
train[numerical_columns] = scaler.fit_transform(train[numerical_columns])
test[numerical_columns] = scaler.transform(test[numerical_columns])

# Inspect the processed data
print(train.head())
print(test.head())


   id  brand  model  model_year    milage  fuel_type  engine  transmission  \
0   0     14    644    0.520325  0.031759          2     719             2   
1   1      4     49   -1.447877  0.143728          2     534            18   
2   2     21   1771   -1.090022  0.371412          2     541            14   
3   3      4   1748    1.236035 -1.393115          3     646            43   
4   4     40    693   -2.521442  0.757966          2     219            34   

   ext_col  int_col  accident  clean_title  price  
0       26       57         1            0  11000  
1       17        9         1            0   8250  
2      181        6         1            0  15000  
3      100       24         1            0  63500  
4      249        9         1            0   7850  
      id  brand  model  model_year    milage  fuel_type  engine  transmission  \
0  54273     35    538   -0.195385  0.005029          2     554            34   
1  54274     26   1267   -0.016458  1.095441          2  

In [3]:
# Build the feature matrix and the target, then hold out a validation set.
# No feature engineering is done here: every column except the row
# identifier and the target itself is used as a feature.

y = train['price']
X = train.drop(columns=['id', 'price'])

# 80/20 train/validation split with a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each of the four pieces.
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

(43418, 11) (10855, 11) (43418,) (10855,)


In [4]:
# Train machine learning models
#
# Fits an ordinary least-squares baseline on the training split and reports
# its RMSE on both the training and the validation split.

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Train a Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Evaluate the model
y_pred_train = lr.predict(X_train)
y_pred_val = lr.predict(X_val)

# RMSE is taken as the square root of the MSE rather than via the
# `squared=False` flag, which newer scikit-learn releases no longer accept.
train_rmse = mean_squared_error(y_train, y_pred_train) ** 0.5
val_rmse = mean_squared_error(y_val, y_pred_val) ** 0.5

print(f"Train RMSE: {train_rmse}")
print(f"Validation RMSE: {val_rmse}")


Train RMSE: 73694.28138358072
Validation RMSE: 49542.71139412612


In [5]:
from sklearn.ensemble import RandomForestRegressor

# Train a Random Forest model (100 trees, fixed random_state)
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model on both splits
y_pred_train_rf = rf.predict(X_train)
y_pred_val_rf = rf.predict(X_val)

# RMSE is taken as the square root of the MSE rather than via the
# `squared=False` flag, which newer scikit-learn releases no longer accept.
train_rmse_rf = mean_squared_error(y_train, y_pred_train_rf) ** 0.5
val_rmse_rf = mean_squared_error(y_val, y_pred_val_rf) ** 0.5

print(f"Train RMSE (Random Forest): {train_rmse_rf}")
print(f"Validation RMSE (Random Forest): {val_rmse_rf}")


Train RMSE (Random Forest): 29297.84918150249
Validation RMSE (Random Forest): 55226.03931104438


In [6]:
# Score the test set with the random forest and write the submission file.
# NOTE(review): the validation RMSE printed for the forest was higher than
# the linear model's; confirm the forest is the model intended for submission.

# The model was fitted without the row identifier, so drop it here as well.
X_test = test.drop(columns='id')
test_predictions = rf.predict(X_test)

# Put the predictions into the price column of the sample-submission frame.
submission = submission.assign(price=test_predictions)

# /kaggle/working is the writable output directory.
output_path = '/kaggle/working/submission.csv'
submission.to_csv(output_path, index=False)

print("Submission file created.")

Submission file created.


In [7]:
# Re-read the submission CSV from the working directory (it is assumed to
# have been written already) and show its leading rows as a sanity check.
submission = pd.read_csv(filepath_or_buffer='/kaggle/working/submission.csv')

preview = submission.head(n=5)
print(preview)

      id     price
0  54273  20881.79
1  54274  16300.40
2  54275  37119.78
3  54276  73221.59
4  54277  36781.59


In [8]:

# Self-contained second pipeline: reload the raw data, preprocess it, tune a
# random forest and a gradient-boosting regressor with a small grid search,
# and write test-set predictions from whichever has the lower validation RMSE.

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Load data
train = pd.read_csv('/kaggle/input/kagglexfellowship/kagglex-cohort4/train.csv')
test = pd.read_csv('/kaggle/input/kagglexfellowship/kagglex-cohort4/test.csv')
submission = pd.read_csv('/kaggle/input/kagglexfellowship/kagglex-cohort4/sample_submission.csv')

# Decide column roles on the raw frames. Doing this after label encoding
# would make the encoded categoricals (and `id`) look numeric, and they would
# then be standardised together with the genuine numeric features.
categorical_columns = train.select_dtypes(include=['object']).columns
numerical_columns = train.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop(['id', 'price'])  # Exclude identifier and target

# Handle missing values: medians are learned from train and reused for test
train_medians = train.median(numeric_only=True)
train = train.fillna(train_medians)
test = test.fillna(train_medians[numerical_columns])

# Encode categorical variables
label_encoders = {}

for col in categorical_columns:
    le = LabelEncoder()
    # astype(str) turns missing values into the literal category 'nan'.
    train_values = train[col].astype(str)
    test_values = test[col].astype(str)
    # Fit on the union of both frames so every test category has a code,
    # then transform each frame on its own instead of relying on index
    # alignment of a concatenated frame.
    le.fit(pd.concat([train_values, test_values], ignore_index=True))
    train[col] = le.transform(train_values)
    test[col] = le.transform(test_values)
    label_encoders[col] = le

# Feature scaling (parameters fitted on train, reused for test)
scaler = StandardScaler()
train[numerical_columns] = scaler.fit_transform(train[numerical_columns])
test[numerical_columns] = scaler.transform(test[numerical_columns])

# Define features and target
X = train.drop(columns=['id', 'price'])
y = train['price']
X_test = test.drop(columns=['id'])

# Split the data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training with RandomForest and GradientBoosting
models = {
    "RandomForest": RandomForestRegressor(random_state=42),
    "GradientBoosting": GradientBoostingRegressor(random_state=42)
}

param_grid = {
    "RandomForest": {
        'n_estimators': [100, 200],
        'max_depth': [10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    },
    "GradientBoosting": {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
}

best_models = {}
validation_rmse = {}
for name, model in models.items():
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid[name], cv=3, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    y_pred = grid_search.predict(X_valid)
    # Keep the score so model selection below does not have to predict again.
    validation_rmse[name] = np.sqrt(mean_squared_error(y_valid, y_pred))
    print(f"RMSE for {name}: {validation_rmse[name]}")

# Select the best model based on validation performance
best_model_name = min(validation_rmse, key=validation_rmse.get)
best_model = best_models[best_model_name]

# Predictions on the test set
test_predictions = best_model.predict(X_test)

# Prepare the submission file
submission['price'] = test_predictions
submission.to_csv('/kaggle/working/submission.csv', index=False)

print("Submission file created.")


Best parameters for GradientBoosting: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
RMSE for GradientBoosting: 50652.85623101362
Submission file created.


In [9]:
# Re-read the submission CSV from the working directory (it is assumed to
# have been written already) and show its leading rows as a sanity check.
submission = pd.read_csv(filepath_or_buffer='/kaggle/working/submission.csv')

preview = submission.head(n=5)
print(preview)

      id         price
0  54273  24159.786448
1  54274  16913.192502
2  54275  26027.316504
3  54276  63415.809827
4  54277  37134.737733
