In [42]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn import metrics

import warnings
warnings.simplefilter('ignore')

In [2]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that keeps only a chosen set of columns.

    Intended as the first step of a pipeline so that each branch
    receives just the columns it needs.
    """

    def __init__(self, feature_names):
        """
        Remember which columns should be picked from the dataset.

        feature_names: column label(s) used to index X in `transform`.
        """
        super().__init__()
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """
        Nothing is learned from the data; present only to satisfy
        the transformer protocol. Returns the instance itself.
        """
        return self

    def transform(self, X, y=None):
        """
        Return the part of X addressed by `self.feature_names`.
        """
        selected = X[self.feature_names]
        return selected

In [3]:
class FeatureGenerator(BaseEstimator, TransformerMixin):
    """
    Turn the free-text equipment list into one binary column per option.

    Required columns: table. X is DataFrame.

    For every label in `features` a 0/1 column is produced that is 1 when
    the label occurs in the row's `table` value.
    """

    # An ordered tuple rather than a set: the order of this sequence is the
    # order of the output columns, so it has to be identical in every
    # interpreter process (iteration order of a set of strings is not), and
    # a DataFrame cannot be indexed with a set in recent pandas versions.
    features = (
        'ABS (антиблокировочная система)',
        'AUX/iPod',
        'Bluetooth',
        'CD/MP3 проигрыватель',
        'ESP (система поддержания динамической стабильности)',
        'USB',
        'Автозапуск двигателя',
        'Антипробуксовочная система',
        'Датчик дождя',
        'Иммобилайзер',
        'Камера заднего вида',
        'Климат-контроль',
        'Кондиционер',
        'Контроль мертвых зон на зеркалах',
        'Круиз-контроль',
        'Ксеноновые фары',
        'Легкосплавные диски',
        'Люк',
        'Материал салона - натуральная кожа',
        'Мультимедийный экран',
        'Обогрев зеркал',
        'Обогрев лобового стекла',
        'Обогрев руля',
        'Обогрев сидений',
        'Панорамная крыша',
        'Парктроники',
        'Подушки безопасности боковые',
        'Подушки безопасности задние',
        'Подушки безопасности передние',
        'Противотуманные фары',
        'Рейлинги на крыше',
        'Светодиодные фары',
        'Сигнализация',
        'Системы помощи',
        'Управление мультимедиа с руля',
        'Фаркоп',
        'Штатная навигация',
        'Электрорегулировка сидений',
        'Электростеклоподъемники задние',
        'Электростеклоподъемники передние',
    )

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """
        Return a 2-D int8 array with one column per entry of `features`,
        in the order of `features`. The input frame is not modified.
        """
        X = X.copy()
        self.add_features(X)
        return X[list(self.features)].values

    def add_features(self, X):
        """Add one int8 indicator column per feature to X, in place."""
        for feature in self.features:
            X[feature] = X['table'].apply(
                lambda cell, feature=feature: self._has_feature(cell, feature)
            ).astype('int8')

    @staticmethod
    def _has_feature(cell, feature):
        """Return 1 if `feature` is contained in `cell`, otherwise 0."""
        try:
            return int(feature in cell)
        except TypeError:
            # `cell` does not support containment, e.g. a missing value
            # read as NaN: treat the listing as having no such option.
            return 0

In [4]:
class ToIntTransformer(BaseEstimator, TransformerMixin):
    """
    Convert raw text columns of a listing into integer features.

    Required columns: volume, show, run, pages, update, year.

    With the required columns supplied in the order above, `transform`
    returns the columns: volume, all_views, run, today_views, days_ago,
    is_restyle, modified, up, age.
    """

    # Conversion factor applied to mileages given in miles.
    KM_PER_MILE = 1.60934
    # Brand names made of two words; everything after them is the model name.
    TWO_WORD_BRANDS = ('Alfa Romeo', 'Great Wall', 'Lada (ВАЗ)')

    def __init__(self, reference_date='2019-11-23'):
        """
        reference_date: anything accepted by `pd.Timestamp`; the day the
            data is considered current. Used for both `days_ago` and `age`.
            The default reproduces the previously hard-coded values.
        """
        self.reference_date = reference_date

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """
        Return a 2-D array of the derived integer features.
        The input frame is not modified.
        """
        X = X.copy()
        self.to_int(X, 'volume', ' см3')
        self.fix_show(X)
        self.fix_run(X)
        self.fix_dates(X)
        self.add_restyle(X)
        self.add_upd_flags(X)
        self.year_to_old(X)
        return X.drop(['model', 'update', 'pages', 'year'], axis=1).values

    def to_int(self, X, column_name, phrase):
        """Strip the literal `phrase` from a text column and cast it to int32 (in place)."""
        # regex=False: `phrase` is a plain substring, never a pattern.
        X[column_name] = (X[column_name].str.replace(phrase, '', regex=False)
                          .astype('int32'))

    def year_to_old(self, X):
        """Add `age`: reference year minus the `year` column, as int8 (in place)."""
        reference_year = pd.Timestamp(self.reference_date).year
        X['age'] = (reference_year - X['year']).astype('int8')

    def fix_show(self, X):
        """
        Split `show` into `all_views` (replaces `show`) and `today_views` (in place).

        `today_views` is the text between '+' and the last following space;
        `all_views` is the text preceding a run of 25 spaces. Rows without a
        '+N' part get `today_views` equal to `all_views`.
        """
        # expand=False yields a Series for the single capture group.
        X['today_views'] = X['show'].str.extract(r'\+(.+) ', expand=False)
        X['show'] = X['show'].str.extract('(.*)' + ' ' * 25, expand=False)
        X.rename(columns={'show': 'all_views'}, inplace=True)
        no_today_part = X['today_views'].isna()
        X.loc[no_today_part, 'today_views'] = X.loc[no_today_part, 'all_views']
        X['today_views'] = X['today_views'].astype('int')
        X['all_views'] = X['all_views'].astype('int')

    def fix_run(self, X):
        """Convert `run` to integer kilometres; values in miles are converted (in place)."""
        run_text = X['run'].str.replace(' км', '', regex=False)
        in_miles = run_text.str.endswith(' миль')
        distance = run_text.str.replace(' миль', '', regex=False).astype('int')
        # Scale only the rows given in miles; the final cast truncates the
        # fractional kilometres.
        X['run'] = distance.where(~in_miles, distance * self.KM_PER_MILE).astype('int')

    def add_restyle(self, X):
        """Add `model` and the int8 flag `is_restyle` (model ends with '(рестайлинг)')."""
        self.create_model(X)
        X['is_restyle'] = X['model'].str.endswith('(рестайлинг)').astype('int8')

    def add_upd_flags(self, X):
        """
        Add two int8 flags derived from `update` (in place):
        `modified` - 1 unless the first word is 'Опубликовано';
        `up`       - 1 when the text consists of exactly four words.
        """
        X['modified'] = X['update'].apply(
            lambda text: int(text.split()[0] != 'Опубликовано')).astype('int8')
        X['up'] = X['update'].apply(
            lambda text: int(len(text.split()) == 4)).astype('int8')

    def create_model(self, X):
        """Add `model`: the `pages` text with the leading brand name removed (in place)."""
        def model_of(page):
            brand_words = 2 if page.startswith(self.TWO_WORD_BRANDS) else 1
            return ' '.join(page.split()[brand_words:])

        X['model'] = X['pages'].apply(model_of)

    def fix_dates(self, df):
        """Add `days_ago`: whole days between the reference date and the date in `update`."""
        today = pd.Timestamp(self.reference_date)
        # The date is the second whitespace-separated token of `update`.
        # NOTE(review): pd.Timestamp guesses the day/month order of that
        # token - confirm it matches the format used in the data.
        df['days_ago'] = df['update'].apply(
            lambda text: (today - pd.Timestamp(text.split()[1])).days)

In [50]:
class CatTransformer(BaseEstimator, TransformerMixin):
    """
    Clean up the categorical columns of a listing.

    Required columns: cuzov, fuel, pages, region, update.

    `fit` learns which brands are frequent enough to keep; `transform`
    shortens `cuzov` and `fuel` to their first word, replaces `pages` by a
    `brand` column (rare brands become 'other'), simplifies `region` and
    drops `update`.
    """

    # Brand names made of two words; all other brands are the first word.
    TWO_WORD_BRANDS = ('Alfa Romeo', 'Great Wall', 'Lada (ВАЗ)')
    # A brand is kept only if its share of the fitted rows exceeds this value.
    MIN_BRAND_SHARE = 0.01

    def __init__(self):
        # Replaced in `fit` by a Series indexed by the brand names to keep.
        self.preserve = []

    def fit(self, X, y=None):
        """Learn the brands whose share of rows exceeds MIN_BRAND_SHARE. Returns self."""
        brands = self._extract_brand(X['pages'])
        share = brands.value_counts() / len(X)
        self.preserve = share[share > self.MIN_BRAND_SHARE]
        return self

    def transform(self, X, y=None):
        """Return the cleaned categorical columns as a 2-D array; X is not modified."""
        X = X.copy()
        self.cut(X, ['cuzov', 'fuel'])
        self.fix_names(X)
        self.fix_region(X)
        return X.drop(['update'], axis=1).values

    def cut(self, X, column_names):
        """Keep only the first whitespace-separated word of each listed column (in place)."""
        for col in column_names:
            X[col] = X[col].apply(lambda value: value.split()[0])

    def fix_names(self, df):
        """Replace `pages` by `brand`, mapping brands not kept by `fit` to 'other' (in place)."""
        df['pages'] = self._extract_brand(df['pages'])
        df.rename(columns={'pages': 'brand'}, inplace=True)
        # After `fit` the kept brand names are the index of `preserve`;
        # before `fit` it is an empty list, so every brand becomes 'other'.
        if isinstance(self.preserve, pd.Series):
            kept_brands = self.preserve.index
        else:
            kept_brands = self.preserve
        df['brand'] = df['brand'].where(df['brand'].isin(kept_brands), 'other')

    def fix_region(self, X):
        """
        Reduce `region` to a single part (in place): the text is split on
        ', ' and the second part is used, or the only part if there is one.
        """
        parts = X['region'].apply(lambda text: text.split(', '))
        X['region'] = [p[0] if len(p) == 1 else p[1] for p in parts]

    def _extract_brand(self, pages):
        """Return the brand (first word, or first two for TWO_WORD_BRANDS) of each `pages` value."""
        def brand_of(page):
            brand_words = 2 if page.startswith(self.TWO_WORD_BRANDS) else 1
            return ' '.join(page.split()[:brand_words])

        return pages.apply(brand_of)


In [51]:
class ColumnTranslation:
    """
    Plain record describing how one column should be simplified.

    column_name: name of the column the rule applies to.
    to_save:     collection of values that are kept as they are.
    default:     replacement for every value not found in `to_save`.
    """

    def __init__(self, column_name, to_save, default='Other'):
        self.column_name, self.to_save, self.default = (
            column_name, to_save, default)


class Translator(BaseEstimator, TransformerMixin):
    """
    Collapse rarely used values of selected columns into a default value.

    translations: sequence of rule objects exposing `column_name`,
        `to_save` and `default`. For every rule, values of `column_name`
        that are not in `to_save` are replaced by `default`.
    """

    def __init__(self, translations):
        # Stored as given, without copying: scikit-learn's `clone` verifies
        # that the constructor keeps the very object it was passed and
        # raises RuntimeError otherwise.
        self.translations = translations

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def transform(self, X, y=None):
        """
        Return a 2-D array holding the translated columns, one per rule and
        in rule order. The input frame is not modified.
        """
        X = X.copy()
        translated_columns = []
        for rule in self.translations:
            def translate(value, rule=rule):
                return value if value in rule.to_save else rule.default

            X[rule.column_name] = X[rule.column_name].apply(translate)
            translated_columns.append(rule.column_name)
        return X[translated_columns].values

In [52]:
# Load the listings and separate the target (price, stored as text with
# spaces as thousands separators) from the features.
avto = pd.read_csv('dataset/final.csv')

X = avto.drop(['cost'], axis=1)
y = avto.cost.apply(lambda x: int(x.replace(' ', '')))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Binary equipment flags parsed from the `table` column.
feat_pipeline = Pipeline(
    steps=[
        ('feat_selector', FeatureSelector(['table'])),
        ('feat_generator', FeatureGenerator())
    ]
)

# Integer features parsed from text columns.
int_pipeline = Pipeline(
    steps=[
        ('int_selector', FeatureSelector(['volume', 'show', 'run', 'pages', 'update', 'year'])),
        ('int_transformer', ToIntTransformer())
    ]
)

# The encoders below use handle_unknown='ignore': a category that occurs
# only in data passed to `predict` is encoded as all zeros instead of
# raising an error.
cat_pipeline = Pipeline(
    steps=[
        ('cat_selector', FeatureSelector(['cuzov', 'fuel', 'pages', 'region', 'update'])),
        ('cat_transformer', CatTransformer()),
        ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Colour: the listed colours are kept, all others become 'другой'.
color_pipeline = Pipeline(
    steps=[
        ('color_selector', FeatureSelector(['color'])),
        ('color_translator', Translator(
            [ColumnTranslation(
                column_name='color',
                to_save=['черный', 'серебристый', 'синий', 'серый', 'белый'],
                default='другой'
            )])),
        ('color_encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# Categorical columns used as they are, after filling gaps with the mode.
no_proc_pipeline = Pipeline(
    steps=[
        ('no_proc_selector', FeatureSelector(['drive-unit', 'state', 'transmission'])),
        ('no_proc_imputer', SimpleImputer(strategy='most_frequent')),
        ('no_proc_encoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

# All feature groups side by side form the preprocessing step of the models.
pipeline = FeatureUnion(transformer_list=[
    ('feat', feat_pipeline),
    ('int', int_pipeline),
    ('cat', cat_pipeline),
    ('color', color_pipeline),
    ('no_proc', no_proc_pipeline)
])

In [61]:
import xgboost as xg
import lightgbm as lgb

In [63]:
# Hyper-parameter grid searched for the XGBoost regressor.
xg_params = [
    dict(
        max_depth=[3, 5, 7, 9, 11],
        learning_rate=[0.01, 0.03, 0.05, 0.1, 0.2],
        n_estimators=[100, 200],
        gamma=[0.1, 0.25],
    )
]

# 5-fold grid search scored by R^2; the best model is refitted on all
# training rows and used for prediction.
xgb_search = GridSearchCV(
    estimator=xg.XGBRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,
    ),
    param_grid=xg_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    refit=True,
)

# Preprocessing followed by the search, so fit/predict accept raw frames.
pip = Pipeline(steps=[('preprocessing', pipeline), ('gc', xgb_search)])
pip.fit(X_train, y_train)

# R^2 on the held-out split.
y_true = y_test
y_pred = pip.predict(X_test)
print(metrics.r2_score(y_true, y_pred))

0.8520849040443894


In [60]:
# Hyper-parameter grid searched for the LightGBM regressor.
lgb_params = [
    dict(
        boosting_type=['gbdt', 'goss'],
        num_leaves=[50, 73, 100, 120, 150, 160, 180, 230, 255],
        max_depth=[5, 7, 9, 11],
        learning_rate=[0.01, 0.03, 0.05, 0.1, 0.15, 0.2],
        n_estimators=[100, 150, 200, 250],
    )
]

# 5-fold grid search scored by R^2 with progress output; the best model is
# refitted on all training rows and used for prediction.
lgb_search = GridSearchCV(
    estimator=lgb.LGBMRegressor(random_state=42),
    param_grid=lgb_params,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    refit=True,
    verbose=5,
)

# Preprocessing followed by the search, so fit/predict accept raw frames.
pip = Pipeline(steps=[('preprocessing', pipeline), ('gc', lgb_search)])
pip.fit(X_train, y_train)

# R^2 on the held-out split.
y_true = y_test
y_pred = pip.predict(X_test)
print(metrics.r2_score(y_true, y_pred))

Fitting 5 folds for each of 1728 candidates, totalling 8640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   11.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-1)]: Done 1144 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done 1450 tasks      | elapsed:  8.4min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 2170 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 2584 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 3034 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 3520 tasks      | elapsed: 15.4min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | ela

0.8628926435911706


In [62]:
# Best hyper-parameter combination found by the grid-search step of `pip`.
pip.named_steps['gc'].best_params_

{'boosting_type': 'gbdt',
 'learning_rate': 0.1,
 'max_depth': 11,
 'n_estimators': 250,
 'num_leaves': 50}