In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split

# Load the cleaned car listings; the MSRP column is the regression target.
X_full = pd.read_csv('New_cars_cleaned.csv')

# Rows without an MSRP have no label, so they cannot be used for fitting or scoring.
X_full = X_full.dropna(axis=0, subset=['MSRP'])

# Split the target off from the predictors.
# DataFrame.drop returns a NEW frame, so its result has to be assigned back;
# otherwise MSRP stays among the features (it is numeric, so it would be picked
# up by num_cols below) and the model would be trained on the value it predicts.
y = X_full['MSRP']
X_full = X_full.drop(columns=['MSRP'])

# 80/20 train/validation split; the fixed random_state keeps it reproducible.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y, train_size=0.8, test_size=0.2, random_state=0
)

# Group the feature columns by dtype, as seen in the training split:
# object columns are treated as categorical, int64/float64 columns as numerical.
cat_cols = [cname for cname in X_train_full.columns
            if X_train_full[cname].dtype == 'object']
num_cols = [cname for cname in X_train_full.columns
            if X_train_full[cname].dtype in ['int64', 'float64']]

# Keep only the columns that fall into one of the two groups; copy so later
# work on X_train / X_valid never writes through to the *_full frames.
my_cols = cat_cols + num_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Numerical columns: fill missing entries with a constant. No fill_value is
# passed, so SimpleImputer falls back to its own default constant.
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: fill missing entries with the most frequent category,
# then one-hot encode. handle_unknown='ignore' is set on the encoder so that
# categories absent at fit time do not raise when transforming other data.
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ]
)

In [4]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor with 100 trees; random_state is fixed so repeated
# runs build the same forest.
model = RandomForestRegressor(
    random_state=0,
    n_estimators=100,
)

In [None]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and the regressor into one estimator, so the
# preprocessing steps are fitted on the training data passed to fit() and then
# re-applied unchanged when predicting on the validation data.
my_pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('model', model),
    ]
)

# Fit on the training split.
my_pipeline.fit(X_train, y_train)

# Predict on the held-out split and report the mean absolute error.
preds = my_pipeline.predict(X_valid)
print(mean_absolute_error(y_true=y_valid, y_pred=preds))