In [2]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_regression
from sklearn.metrics import mean_squared_error

from kagglex_cohort4 import *

from xgboost import XGBRegressor

In [3]:
# Read the training CSV with the Arrow parser and Arrow-backed dtypes, then
# hand the raw frame to `clean_housing` (not defined in this notebook;
# presumably supplied by the `kagglex_cohort4` star import).
url = "train.csv"
raw = pd.read_csv(url, dtype_backend="pyarrow", engine="pyarrow")
cars = raw.pipe(clean_housing)

In [4]:
# Split the cleaned frame into the feature matrix X and the target vector y.
target = "price"

# Compare column names by equality. The earlier `col not in target` was a
# substring test against the string "price", so any column whose name is a
# fragment of it (e.g. "ice", "p") would have been silently dropped too.
# NOTE(review): every non-target column becomes a feature, including any row
# identifier column `cars` may carry -- confirm that is intended.
features = [col for col in cars.columns if col != target]

X = cars[features]
y = cars[target]

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (54273, 12)
y shape: (54273,)


In [5]:
# Hold out a validation split with a fixed seed; test_size=0.25 only spells
# out scikit-learn's default hold-out fraction.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=43
)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"y_val shape: {y_val.shape}")

X_train shape: (40704, 12)
y_train shape: (40704,)
X_val shape: (13569, 12)
y_val shape: (13569,)


In [6]:
# Print the dtype and non-null count of every training feature column.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40704 entries, 11257 to 14148
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   id            40704 non-null  uint16[pyarrow]
 1   brand         40704 non-null  category       
 2   model         40704 non-null  category       
 3   model_year    40704 non-null  uint16[pyarrow]
 4   milage        40704 non-null  uint32[pyarrow]
 5   fuel_type     40704 non-null  category       
 6   engine        40704 non-null  category       
 7   transmission  40704 non-null  category       
 8   ext_col       40704 non-null  category       
 9   int_col       40704 non-null  category       
 10  accident      40704 non-null  category       
 11  clean_title   40704 non-null  category       
dtypes: category(9), uint16[pyarrow](2), uint32[pyarrow](1)
memory usage: 1.3 MB


In [7]:
# Names of the category-typed feature columns (these get one-hot encoded).
cat = X.select_dtypes(include="category").columns.tolist()
cat

['brand',
 'model',
 'fuel_type',
 'engine',
 'transmission',
 'ext_col',
 'int_col',
 'accident',
 'clean_title']

In [8]:
# Names of the numeric feature columns (these get standardised).
num = X.select_dtypes(include="number").columns.tolist()
num

['id', 'model_year', 'milage']

In [9]:
# Column-wise preprocessing: one-hot encode the categorical columns (keeping
# the category labels in the generated column names) and standardise the
# numeric columns. Output columns follow the order the transformers are listed.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(use_cat_names=True), cat),
        ("num", StandardScaler(), num),
    ]
)

In [10]:
# Baseline model: the column preprocessing followed by ordinary least squares.
lr = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

In [11]:
# Fit the preprocessing + linear-regression pipeline on the training split.
lr.fit(X_train, y_train)

In [12]:
# Pull the fitted LinearRegression step out of the pipeline by its step name
# and show its coefficient vector.
linear_model = lr["regressor"]
print("Coefficients:", linear_model.coef_)

Coefficients: [-6.88607247e+13 -6.88607247e+13 -6.88607247e+13 ... -2.83093750e+02
  5.03433203e+03 -7.31867188e+03]


In [13]:
# Rebuild the transformed column names in the order the preprocessor emits
# them: the one-hot columns first, then the scaled numeric columns.
ohe = lr["preprocessor"].named_transformers_["cat"]
ohe_feature_names = ohe.get_feature_names_out(cat)
num_feature_names = num
feature_names = [*ohe_feature_names, *num_feature_names]

In [14]:
# Label each fitted coefficient with its transformed feature name.
pd.Series(data=linear_model.coef_, index=feature_names)

brand_Ford                                        -6.886072e+13
brand_BMW                                         -6.886072e+13
brand_Volvo                                       -6.886072e+13
brand_Nissan                                      -6.886072e+13
brand_Cadillac                                    -6.886072e+13
                                                       ...     
accident_At least 1 accident or damage reported    1.767492e+15
clean_title_Yes                                    0.000000e+00
id                                                -2.830938e+02
model_year                                         5.034332e+03
milage                                            -7.318672e+03
Length: 3323, dtype: float64

In [15]:
# Predict on the validation split with the fitted linear pipeline
y_pred = lr.predict(X_val)

In [16]:
# Calculate RMSE of the linear pipeline on the validation split.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) works on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 90447676932314.36


In [17]:
# Gradient-boosted alternative: same preprocessing, XGBRegressor on top.
# NOTE(review): this reuses the very `preprocessor` instance already placed in
# `lr`; Pipeline does not copy its steps, so fitting this pipeline refits the
# object `lr` holds as well -- confirm the sharing is intended.
xgb_pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", XGBRegressor(random_state=43, objective="reg:squarederror")),
    ]
)


In [18]:
# Fit the XGBoost pipeline on the training split, then predict the validation
# split. Pipeline.fit returns the pipeline itself, so the calls are chained.
y_pred = xgb_pipe.fit(X_train, y_train).predict(X_val)

In [19]:
# Calculate RMSE of the XGBoost pipeline on the validation split.
# The square root is taken explicitly: the `squared=False` argument of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas sqrt(MSE) works on every version.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("Root Mean Squared Error (RMSE):", rmse)

Root Mean Squared Error (RMSE): 67956.16780896361


In [20]:
# Pull the fitted XGBRegressor step out of the pipeline by its step name and
# show its per-feature importance scores.
xgb_model = xgb_pipe["regressor"]
print("Feature Importances:", xgb_model.feature_importances_)

Feature Importances: [2.2999393e-03 1.3525285e-04 5.5686585e-05 ... 4.1475371e-03 7.8849252e-03
 7.7175982e-03]


In [21]:
# Rebuild the transformed column names from the XGBoost pipeline's fitted
# preprocessor: the one-hot columns first, then the scaled numeric columns.
ohe = xgb_pipe["preprocessor"].named_transformers_["cat"]
ohe_feature_names = ohe.get_feature_names_out(cat)
num_feature_names = num
feature_names = [*ohe_feature_names, *num_feature_names]

In [23]:
 # Compare the value ranges of the validation targets and the predictions.
# The labels now say y_val: the values come from the validation split, and the
# former "y_test" label was easy to confuse with the separately loaded test set.
print("Range of y_val:", (np.min(y_val), np.max(y_val)))
print("Range of y_pred:", (np.min(y_pred), np.max(y_pred)))

Range of y_test: (2000, 2954083)
Range of y_pred: (-106238.08, 2383545.5)


In [24]:
# Load the competition test features.
# NOTE(review): unlike the training data, this frame is read with the default
# parser/dtypes and is not passed through `clean_housing` -- confirm the fitted
# pipeline is meant to receive it in this raw form.
X_test = pd.read_csv("test.csv")  # REMOVERHS

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
X_test.info()
X_test.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36183 entries, 0 to 36182
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            36183 non-null  int64 
 1   brand         36183 non-null  object
 2   model         36183 non-null  object
 3   model_year    36183 non-null  int64 
 4   milage        36183 non-null  int64 
 5   fuel_type     36183 non-null  object
 6   engine        36183 non-null  object
 7   transmission  36183 non-null  object
 8   ext_col       36183 non-null  object
 9   int_col       36183 non-null  object
 10  accident      36183 non-null  object
 11  clean_title   36183 non-null  object
dtypes: int64(3), object(9)
memory usage: 3.3+ MB
None


Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title
0,54273,Mercedes-Benz,E-Class E 350,2014,73000,Gasoline,302.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,White,Beige,None reported,Yes
1,54274,Lexus,RX 350 Base,2015,128032,Gasoline,275.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,8-Speed A/T,Silver,Black,None reported,Yes
2,54275,Mercedes-Benz,C-Class C 300,2015,51983,Gasoline,241.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,7-Speed A/T,Blue,White,None reported,Yes
3,54276,Land,Rover Range Rover 5.0L Supercharged Autobiogra...,2018,29500,Gasoline,518.0HP 5.0L 8 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,White,White,At least 1 accident or damage reported,Yes
4,54277,BMW,X6 xDrive40i,2020,90000,Gasoline,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,8-Speed A/T,White,Black,At least 1 accident or damage reported,Yes


In [25]:
# Score the test features with the fitted XGBoost pipeline and wrap the
# resulting array in a Series (default integer index) for easy inspection.
y_test_pred = pd.Series(data=xgb_pipe.predict(X_test))  # REMOVERHS
y_test_pred.head()

0    23215.972656
1    21023.255859
2    34825.054688
3    50828.570312
4    42716.855469
dtype: float32

In [26]:
# Pair each XGBoost importance score with its transformed feature name, then
# order the features from largest to smallest absolute importance.
feat_imp = pd.Series(xgb_model.feature_importances_, index=feature_names)
feat_imp = feat_imp.sort_values(key=abs, ascending=False)
feat_imp

model_SL-Class SL500 Roadster                       0.060066
model_Wrangler Sport                                0.055899
model_RDX PMC Edition                               0.027333
model_Suburban 1500 LTZ                             0.027291
model_Silverado 1500 LTZ                            0.024634
                                                      ...   
model_S60 Recharge Plug-In Hybrid T8 Inscription    0.000000
model_Highlander Hybrid XLE                         0.000000
model_911 GT2 RS                                    0.000000
model_Camry SE                                      0.000000
model_Mustang SVT Cobra                             0.000000
Length: 3323, dtype: float32

In [27]:
# Prepare the submission file
submission_example = pd.read_csv('sample_submission.csv')

# The DataFrame below is assembled by index alignment, so a length mismatch
# would silently produce NaN rows instead of failing; stop early instead.
if len(submission_example) != len(y_test_pred):
    raise ValueError(
        f"sample submission has {len(submission_example)} rows "
        f"but there are {len(y_test_pred)} predictions"
    )

# Take the prediction column's name from the sample file rather than
# hard-coding it: the notebook's target is "price", while this cell previously
# always wrote a column called "target". Falls back to "target" when the
# sample file has no column besides `id`.
pred_col = next((col for col in submission_example.columns if col != 'id'), 'target')

submission = pd.DataFrame({'id': submission_example['id'], pred_col: y_test_pred})
submission.to_csv('submission.csv', index=False)