In [1]:
import pandas as pd

# Read both CSV splits (train first, then test) from the working directory
train_df, test_df = [pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')]

# Echo a five-row preview of each frame; the cell output is the resulting tuple
tuple(frame.head() for frame in (train_df, test_df))


(   id    brand          model  model_year  milage fuel_type  \
 0   0     Ford   F-150 Lariat        2018   74349  Gasoline   
 1   1      BMW          335 i        2007   80000  Gasoline   
 2   2   Jaguar      XF Luxury        2009   91491  Gasoline   
 3   3      BMW   X7 xDrive40i        2022    2437    Hybrid   
 4   4  Pontiac  Firebird Base        2001  111000  Gasoline   
 
                                               engine  \
 0      375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel   
 1  300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
 2       300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel   
 3  335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...   
 4      200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel   
 
                      transmission ext_col int_col       accident clean_title  \
 0                    10-Speed A/T    Blue    Gray  None reported         Yes   
 1                     6-Speed M/T   Black   Black  None reported         Yes   
 2                     6-Sp

In [2]:
def _prepare_features(df):
    """Return a cleaned copy of ``df`` ready for imputation.

    Steps applied:
      * ``milage`` is cast to string, stripped of ',' characters, and parsed
        as float.
      * ``horsepower`` is the number directly preceding 'HP' in ``engine``.
      * ``engine_size`` is the number directly preceding 'L' in ``engine``.
      * ``engine``, ``id`` and ``model`` are dropped.

    Rows whose ``engine`` text does not match a pattern get NaN in the
    corresponding new column. The input frame itself is not modified.
    """
    engine_text = df['engine']
    return (
        df.assign(
            # astype(str) first so .str works even when the column was read as integers
            milage=df['milage'].astype(str).str.replace(',', '', regex=False).astype(float),
            horsepower=engine_text.str.extract(r'(\d+\.?\d*)HP', expand=False).astype(float),
            engine_size=engine_text.str.extract(r'(\d+\.?\d*)L', expand=False).astype(float),
        )
        .drop(columns=['engine', 'id', 'model'])
    )


# Keep the test ids for the submission file before 'id' is dropped
test_ids = test_df['id']

# Apply the identical cleaning steps to both splits
train_df = _prepare_features(train_df)
test_df = _prepare_features(test_df)

# Impute missing engine-derived values with medians computed on the TRAINING
# split only, and reuse them for the test split. Using the test split's own
# medians would preprocess the two splits inconsistently.
train_medians = train_df[['horsepower', 'engine_size']].median()
train_df = train_df.fillna(train_medians)
test_df = test_df.fillna(train_medians)

# Fill every remaining missing value with 'Unknown'.
# NOTE: this is frame-wide, so a NaN left in any numeric column would also
# become the string 'Unknown'.
train_df = train_df.fillna('Unknown')
test_df = test_df.fillna('Unknown')


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Column groups: numerical columns are standardised, categorical ones one-hot encoded
numerical_features = ['model_year', 'milage', 'horsepower', 'engine_size']
categorical_features = [
    'brand', 'fuel_type', 'transmission', 'ext_col',
    'int_col', 'accident', 'clean_title',
]

# 'price' is the target; every other training column is a predictor
y_train = train_df['price']
X_train = train_df.drop(columns=['price'])
X_test = test_df

# One (name, transformer, columns) triple per column group
numeric_step = ('num', StandardScaler(), numerical_features)
# handle_unknown='ignore' lets transform() accept categories unseen during fit
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Learn scaling/encoding from the training features, then reuse them on the test features
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)


In [4]:
from sklearn.linear_model import LinearRegression

# Linear regression with default settings
model = LinearRegression()

# Fit on the preprocessed training matrix against the price target;
# left as the final expression so the cell still echoes the fitted estimator
model.fit(X=X_train_preprocessed, y=y_train)


In [5]:
# Predict a price for every row of the preprocessed test matrix
y_pred = model.predict(X=X_test_preprocessed)


In [6]:
# Pair each saved test id with its predicted price
submission_columns = {'id': test_ids, 'price': y_pred}
submission_df = pd.DataFrame(submission_columns)

# Write the submission to CSV, omitting the DataFrame index column
submission_df.to_csv(path_or_buf='submission1.csv', index=False)


In [8]:
# Preview the first five submission rows as a sanity check
submission_df.head(n=5)

Unnamed: 0,id,price
0,54273,35315.922147
1,54274,24005.733085
2,54275,40026.133022
3,54276,65436.809726
4,54277,33276.262277
