In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from preprocess import PreprocessedDataFrame
from category_encoders import TargetEncoder
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence the three noisiest warning families emitted by the ML libraries used below.
for _ignored_category in (DeprecationWarning, UserWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=_ignored_category)

# Lift pandas' display limits so frames are shown with every row and every column.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [2]:
# Directory prefix for the CSV inputs; it is concatenated directly with the file name.
FOLDER = ''

# Load the labelled data and split the target column ('price') off the features.
X = pd.read_csv(f'{FOLDER}train.csv')
y = X.pop('price')
print('Total data size:', X.shape)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print('Train:', X_train.shape)
print('Test:', X_test.shape)

# Peek at a single training row.
X_train.head(1)

Total data size: (16784, 29)
Train: (13427, 29)
Test: (3357, 29)


Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,transmission,curb_weight,power,fuel_type,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,opc_scheme,lifespan,eco_category,features,accessories,indicative_price
15946,1000462,Porsche Cayenne Hybrid S 3.0A,porsche,cayenne,10,2012.0,,22-oct-2012,suv,"parf car, direct owner sale, hybrid cars",auto,2315.0,245.0,petrol-electric,2995.0,4.0,67590.0,82289.0,2380.0,43327.0,125000.0,102465.0,61479.0,,,uncategorized,3.0l v6 supercharged/hybrid engine with 328bhp...,low mileage! 2 keys! keyless entry/start. bi-x...,


## Preprocessing

### Train data

In [3]:
preprocessed_df = PreprocessedDataFrame(X_train, y_train, target_encoding=True) # set target_encoding False to get one-hot encoding for make and model

Date: ['reg_date', 'manufactured', 'lifespan', 'original_reg_date']
Num: ['curb_weight', 'engine_cap', 'power', 'road_tax', 'omv', 'mileage', 'arf', 'dereg_value', 'no_of_owners', 'indicative_price', 'depreciation', 'coe']
OneHot Cat: ['title', 'make', 'model']
Label Cat: ['description', 'accessories', 'transmission', 'category', 'fuel_type', 'opc_scheme', 'eco_category', 'listing_id', 'features']



In [4]:
# Produce the transformed training frame from the preprocessor built above.
# The recorded output shows the column count going from 29 to 43.
X_train_prepared = preprocessed_df.build_dataframe()

Input shape: (13427, 29)
Transformed shape: (13427, 43)


In [5]:
# Target-encode 'model' and 'make' against the training prices.
# pop() removes the raw column and the .loc assignment adds the encoded one back,
# so each encoded column ends up appended at the end of the frame.
model_target_encoder = TargetEncoder()
train_model_raw = X_train_prepared.pop('model')
X_train_prepared.loc[:, 'model'] = model_target_encoder.fit_transform(train_model_raw, y=y_train)

make_target_encoder = TargetEncoder()
train_make_raw = X_train_prepared.pop('make')
X_train_prepared.loc[:, 'make'] = make_target_encoder.fit_transform(train_make_raw, y=y_train)

# Fill the remaining missing values with SimpleImputer's default strategy.
# NOTE: from here on X_train_prepared holds the imputer's output, not the
# DataFrame built above, so this cell cannot simply be re-run on its own.
imputer = SimpleImputer()
X_train_prepared = imputer.fit_transform(X_train_prepared)

### Test data

In [6]:
# Push the hold-out split through the same steps as the training data.
# The encoders and the imputer are only transform()-ed here, never re-fitted.
# NOTE(review): transform_dataframe presumably reuses what was learned on the
# training split - confirm in preprocess.py.
X_test_prepared = preprocessed_df.transform_dataframe(X_test)

test_model_raw = X_test_prepared.pop('model')
X_test_prepared.loc[:, 'model'] = model_target_encoder.transform(test_model_raw)

test_make_raw = X_test_prepared.pop('make')
X_test_prepared.loc[:, 'make'] = make_target_encoder.transform(test_make_raw)

X_test_prepared = imputer.transform(X_test_prepared)

Input shape: (3357, 29)
Transformed shape: (3357, 43)


## Model

In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

In [8]:
def rmse(model, data, labels):
    """Print and return the root-mean-squared error of ``model`` on ``data``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    data : feature matrix passed to ``model.predict``.
    labels : true target values compared against the predictions.

    Returns
    -------
    The RMSE value. It is still printed, as before, so existing calls keep
    their output; returning it lets callers store or compare the score.
    """
    predictions = model.predict(data)
    mse = mean_squared_error(labels, predictions)
    # Named root_mse so the local does not shadow the function's own name.
    root_mse = np.sqrt(mse)
    print(root_mse)
    return root_mse
    
def display_scores(scores):
    """Print a short report for an array of scores.

    Emits the raw scores, their mean (wrapped in ANSI bold escape codes),
    their standard deviation, and a trailing blank line. Returns nothing.
    ``scores`` must provide ``mean()`` and ``std()``.
    """
    print("Scores:", scores)
    mean_line = f"Mean: \033[1m{scores.mean()}\033[0m"
    print(mean_line)
    spread = scores.std()
    print("Standard deviation:", spread)
    print()
    
def cvs(model, data, labels):
    """Return the per-fold RMSE of ``model`` from 10-fold cross-validation.

    The folds are scored with negative mean squared error, so each score is
    negated before the square root is taken.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        data,
        labels,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    return np.sqrt(-neg_mse_per_fold)

In [None]:
# Candidate regressors to compare; each is scored with cvs() below.
models = [
    ('lin_reg', LinearRegression()),
    ('tree_reg', DecisionTreeRegressor(random_state=42)),
    ('forest_reg', RandomForestRegressor(n_estimators=30, random_state=42)),
    ('svm_reg', SVR()),
    # Seeded so the comparison is reproducible between runs.
    ('mlp_reg', MLPRegressor(random_state=42)),
    ('xgb_reg', XGBRegressor()),
    ('light_reg', LGBMRegressor()),
    ('cat_reg', CatBoostRegressor(silent=True)),
]

results = []
names = []
for name, model in models:
    # No model.fit() here: cross_val_score clones the estimator and fits the
    # clone on every fold, so a fit on the full training set was wasted work.
    scores = cvs(model, X_train_prepared, y_train)
    results.append(scores)
    names.append(name)
    print(f'{name} \033[1m{scores.mean()}\033[0m {scores.std()}')

# One box per model, showing the spread of its per-fold RMSE.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
ax.set_ylabel('RMSE')
plt.show()

lin_reg [1m42898.83710232035[0m 7217.7572571817645
tree_reg [1m31982.876035284018[0m 5881.758115417958
forest_reg [1m24752.477852952816[0m 6124.890665579672
svm_reg [1m132709.4297504642[0m 15662.163297978508


### Grid search

In [None]:
# Search space for LightGBM: every n_estimators / learning_rate combination is tried.
n_estimators = [200, 400, 600, 800, 1000]
learning_rate = [0.01, 0.1, 0.2]

param_grid = [{'n_estimators': n_estimators, 'learning_rate': learning_rate}]

# Exhaustive 5-fold search scored by negative MSE; train scores are kept as well.
regressor = LGBMRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    return_train_score=True,
)
grid_search.fit(X_train_prepared, y_train)

### Grid search results

In [None]:
# Best parameter set, with its cross-validated score converted from negative MSE to RMSE.
best_neg_mse = grid_search.cv_results_['mean_test_score'].max()
print(grid_search.best_params_, np.sqrt(-best_neg_mse))
print(grid_search.best_estimator_)

In [None]:
cvres = grid_search.cv_results_
result_df = pd.DataFrame(cvres)

# Per-split test scores (negative MSE) converted to RMSE, one column per CV split.
split_score_cols = [
    col for col in result_df
    if col.startswith('split') and col.endswith('_test_score')
]
rmse_per_split = np.sqrt(-result_df[split_score_cols])

# Mean column: square root of the mean MSE across splits.
result_df['mean_test_score'] = np.sqrt(-result_df['mean_test_score']).round(2)
# Spread column: standard deviation of the per-split RMSE values. Taking the
# square root of std_test_score (the std of the MSE values) does not give the
# spread of the RMSE. ddof=0 matches numpy's default std used for the scores.
result_df['std_test_score'] = rmse_per_split.std(axis=1, ddof=0).round(2)

param_cols = [col for col in result_df if col.startswith('param')]
summary_cols = param_cols + ['mean_test_score', 'std_test_score', 'rank_test_score']
result_df.loc[:, summary_cols].sort_values('rank_test_score')

### Feature importance

In [None]:
# Per-feature importance scores of the best estimator selected by the grid search.
feature_importances = grid_search.best_estimator_.feature_importances_
# Sanity check: show how many importance values were returned.
print('feature_importances', len(feature_importances))

In [None]:
feature_names = list(preprocessed_df.transformed_attribs)

# zip() stops at the shorter input, so a length mismatch would silently drop
# entries and pair the rest with the wrong names. Fail loudly instead.
assert len(feature_names) == len(feature_importances), (
    f"{len(feature_names)} feature names vs {len(feature_importances)} importances"
)

# NOTE(review): this pairing assumes transformed_attribs is in the same column
# order as the matrix the model was trained on. The encoding cells pop 'model'
# and 'make' and append them again, which changes their position - confirm.
important_features = sorted(zip(feature_importances, feature_names), reverse=True)
important_features[:10]

In [None]:
# One row per (importance, feature name) pair, ordered by magnitude.
df = pd.DataFrame(important_features, columns=["value", "feature"])
df["abs_value"] = df["value"].abs()
# Bars with a positive value are green; zero or negative ones are red.
df["colors"] = np.where(df["value"] > 0, "green", "red")
df = df.sort_values("abs_value", ascending=False)

top_features = df.head(20)

fig, ax = plt.subplots(1, 1, figsize=(12, 7))
sns.barplot(
    x="feature",
    y="value",
    data=top_features,
    palette=top_features["colors"].tolist(),
    ax=ax,  # draw on the axes created above rather than the implicit current axes
)
ax.set_xticklabels(ax.get_xticklabels(), rotation=90, fontsize=20)
ax.set_title("Top 20 Features", fontsize=25)
# The plotted values are feature importances, not regression coefficients.
ax.set_ylabel("Importance", fontsize=22)
ax.set_xlabel("Feature Name", fontsize=22)
plt.show()

## Test

In [None]:
# Score the tuned model on the hold-out split prepared earlier.
final_model = grid_search.best_estimator_
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_true=y_test, y_pred=final_predictions)
final_rmse = np.sqrt(final_mse)

# Cell output: RMSE on the hold-out split.
final_rmse

### Ensemble

In [None]:
# Base learners whose predictions are combined by a linear meta-model.
estimators = [
    ('xgb_reg', XGBRegressor(n_estimators=30, n_jobs=-1)),
    ('cat_reg', CatBoostRegressor(silent=True)),
    ('rd_reg', RandomForestRegressor(n_estimators=30, random_state=42)),
]
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
)
# From here on final_model refers to the stacking ensemble.
final_model = reg

# fit() is all that is needed: fit_transform() additionally computed the
# transformed training data, which was thrown away.
reg.fit(X_train_prepared, y_train)

final_predictions = reg.predict(X_test_prepared)
final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

## Submission

In [None]:
# Load the unlabelled competition data.
test_df = pd.read_csv(f'{FOLDER}test.csv')

# Same preparation steps as for the other splits; the encoders and the imputer
# are only transform()-ed here, never re-fitted.
test_prepared = preprocessed_df.transform_dataframe(test_df)

submission_model_raw = test_prepared.pop('model')
test_prepared.loc[:, 'model'] = model_target_encoder.transform(submission_model_raw)

submission_make_raw = test_prepared.pop('make')
test_prepared.loc[:, 'make'] = make_target_encoder.transform(submission_make_raw)

test_prepared = imputer.transform(test_prepared)

# Predict with whichever estimator final_model currently refers to.
test_pred = final_model.predict(test_prepared)

print('Test shape:', test_prepared.shape)
print("Predictions:", test_pred)

In [None]:
# Two-column submission: 'Id' is the 0-based row position, 'Predicted' the model output.
submission = (
    pd.DataFrame({'Predicted': test_pred})
    .reset_index()
    .rename(columns={'index': 'Id'})
)
submission.to_csv('submission.csv', index=False)

# Show the first rows as a quick visual check of the written file's content.
display(submission.head(10))