In [1]:
import pandas as pd
import numpy as np

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
train_df = train_df.drop('id', axis = 1)
test_df = test_df.drop('id', axis = 1)

In [4]:
test_df.loc[142, 'engine'] = '8-cylinder(V8)'

In [5]:
train_df['engine'] = train_df.engine.replace('4-cylinder(H4)', '4-cylinder(I4)')

In [6]:
train_df.engine.unique(), test_df.engine.unique()

(array(['4-cylinder(I4)', '6-cylinder(V6)', '8-cylinder(V8)',
        '6-cylinder(I6)', '5-cylinder(I5)', '3-cylinder(I3)',
        '2-cylinder(I2)'], dtype=object),
 array(['4-cylinder(I4)', '6-cylinder(V6)', '6-cylinder(I6)',
        '8-cylinder(V8)', '5-cylinder(I5)', '2-cylinder(I2)',
        '3-cylinder(I3)'], dtype=object))

In [7]:
train_df.loc[train_df.year == 1218, 'year'] = 2001
train_df.loc[train_df.year == 1217, 'year'] = 2001

In [8]:
test_df.loc[test_df.year == 1324, 'year'] = 2001
test_df.loc[test_df.year == 2626, 'year'] = 1994
test_df.loc[test_df.year == 1726, 'year'] = 1994

In [9]:
import re

def apply_brand(x):
    x = re.sub('[^a-zA-Z]', '', x)
    
    return x

def preprocessing_title(data):
    '''title 전처리 함수'''
    
    # 'brand' 파생변수 생성
    data['brand'] = data['title'].apply(lambda x : x.split()[0])
    data['brand'] = data['brand'].apply(apply_brand)
    # title 삭제
    data = data.drop('title', axis=1)
    
    return data

In [10]:
train_df = preprocessing_title(train_df)
test_df = preprocessing_title(test_df)

In [11]:
test_df.loc[test_df.brand == 'Fiat', 'brand'] = 'Peugeot'

train_df.loc[train_df.brand == 'Jaguar', 'brand'] = 'Toyota'
train_df.loc[train_df.brand == 'Lincoln', 'brand'] = 'Land'
train_df.loc[train_df.brand == 'Isuzu', 'brand'] = 'DAF'
train_df.loc[train_df.brand == 'IVM', 'brand'] = 'Nissan'
train_df.loc[train_df.brand == 'Dodge', 'brand'] = 'Toyota'
train_df.loc[train_df.brand == 'GMC', 'brand'] = 'Toyota'
train_df.loc[train_df.brand == 'Scania', 'brand'] = 'Volvo'
train_df.loc[train_df.brand == 'GAC', 'brand'] = 'Toyota'
train_df.loc[train_df.brand == 'IVECO', 'brand'] = 'Toyota'
train_df.loc[train_df.brand == 'MANVOLKSWAGEN', 'brand'] = 'MercedesBenz'
train_df.loc[train_df.brand == 'ALPINA', 'brand'] = 'Mack'
train_df.loc[train_df.brand == 'Bentley', 'brand'] = 'Land'
train_df.loc[train_df.brand == 'JMC', 'brand'] = 'Toyota'
train_df.loc[train_df.brand == 'Audi', 'brand'] = 'MercedesBenz'

In [12]:
# location의 공백과 State를 제거하는 함수
def clean_location(x):
    '''location 정리 함수'''
    x = x.replace(' ', '')
    x = x.replace('State', '')
    return x

In [13]:
train_df.location = train_df.location.apply(clean_location)
test_df.location = test_df.location.apply(clean_location)

In [14]:
test_df.loc[test_df.location == 'Mushine', 'location'] = 'Lagos'
test_df.loc[test_df.location == 'Arepoogunstate', 'location'] = 'Lagos'

train_df.loc[train_df.location == 'FCT', 'location'] = 'Lagos'
train_df.loc[train_df.location == 'Accra', 'location'] = 'Lagos'
train_df.loc[train_df.location == 'Adamawa', 'location'] = 'Lagos'

In [15]:
def apply_paint(x):
    x = re.sub('[^a-zA-Z]', '', x)
    x = x.lower()
    
    if x.find('black') >= 0:
        x = 'black'
    elif x.find('white') >= 0:
        x = 'white'
    elif (x.find('grey') >= 0) | (x.find('gray') >= 0):
        x = 'gray'
    elif (x.find('silver') >= 0) | (x.find('sliver') >= 0):
        x = 'silver'
    elif x.find('blue') >= 0:
        x = 'blue'
    elif x.find('red') >= 0:
        x = 'red'
    elif x.find('gold') >= 0:
        x = 'gold'
    elif x.find('green') >= 0:
        x = 'green'
    else:
        x = 'others'
        
    return x

def preprocessing_paint(data):
    '''paint 전처리 함수'''
    
    data.paint = data.paint.apply(apply_paint)
    
    return data

In [16]:
train_df = preprocessing_paint(train_df)
test_df = preprocessing_paint(test_df)

In [17]:
train_y = train_df.loc[:, ['target']]
#train_X = train_df.drop('target', axis=1)
train_X = train_df.copy()

In [18]:
encoded_train_X = pd.get_dummies(train_X, ['location', 'isimported', 'engine', 'transmission', 'fuel', 'paint', 'brand'])
encoded_test_X = pd.get_dummies(test_df, ['location', 'isimported', 'engine', 'transmission', 'fuel', 'paint', 'brand'])

In [19]:
from sklearn.preprocessing import MinMaxScaler

mms_train = MinMaxScaler()

norm_encoded_train_X = mms_train.fit_transform(encoded_train_X, encoded_train_X['target'])

mms_test = MinMaxScaler()

norm_encoded_test_X = mms_test.fit_transform(encoded_test_X)

In [20]:
norm_encoded_train_X

array([[1.85116912e-02, 9.00000000e-01, 8.86608916e-02, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.01284080e-05, 9.75000000e-01, 2.17992837e-01, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [8.41579545e-02, 8.00000000e-01, 6.35965611e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [9.80450147e-02, 7.75000000e-01, 2.01517217e-02, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.48153287e-01, 8.25000000e-01, 9.30053755e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 4.50000000e-01, 6.42649433e-02, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [21]:
from pycaret.regression import *

  import mlflow


In [22]:
py_reg = setup(encoded_train_X, target = 'target', session_id = 42, silent = True)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Original Data,"(1015, 58)"
3,Missing Values,False
4,Numeric Features,57
5,Categorical Features,0
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(710, 55)"


In [23]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,2732456.507,47868803187167.61,6359277.4648,0.6419,0.4228,0.3619,0.343
catboost,CatBoost Regressor,2826945.6474,53794799102991.56,6601994.1807,0.6171,0.4767,0.3755,0.687
gbr,Gradient Boosting Regressor,2899316.3453,55864454066822.24,6836748.866,0.5843,0.4542,0.4095,0.051
lightgbm,Light Gradient Boosting Machine,3407479.4242,57393366491188.24,7105278.7768,0.5436,0.5949,0.5134,0.036
et,Extra Trees Regressor,3079015.838,70608739824155.73,7740404.5994,0.442,0.4651,0.422,0.364
ridge,Ridge Regression,4410251.2394,71072932009535.84,7947563.2577,0.427,0.8313,1.0675,0.012
lr,Linear Regression,4474568.8507,71963038611917.9,8002230.4344,0.4176,0.8307,1.0918,0.67
llar,Lasso Least Angle Regression,4485901.6569,72196811167642.64,8032707.6036,0.4103,0.8298,1.0965,0.013
lasso,Lasso Regression,4491281.031,72277317957377.1,8038062.3797,0.4092,0.8308,1.0989,0.016
omp,Orthogonal Matching Pursuit,4341858.8997,74933009187699.16,8220488.1007,0.3781,0.759,0.9621,0.01


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=42, verbose=0, warm_start=False)

In [24]:
rf = create_model('rf', verbose=False)
gbr = create_model('gbr', verbose=False)
catboost = create_model('catboost', verbose=False)

In [25]:
blended_model = blend_models(estimator_list = [rf, gbr, catboost])

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2839085.9177,24621413922858.01,4961996.9693,0.7843,0.437,0.3773
1,2460719.1883,24659131029439.33,4965796.1124,0.6231,0.452,0.4348
2,2309092.3406,21459098895649.1,4632396.6686,0.6174,0.3473,0.2995
3,2936406.1209,101617323760680.69,10080541.8386,0.5691,0.4391,0.3545
4,3902893.6969,168687713852233.8,12987983.4406,0.5547,0.474,0.3495
5,2865002.8312,33360276081414.867,5775835.531,0.7574,0.3896,0.3399
6,1594977.7855,9289106444641.979,3047803.5443,0.7886,0.298,0.2708
7,1933036.4167,13483651847932.428,3672009.2385,0.8099,0.3666,0.3328
8,4028810.5353,73451658476872.02,8570394.3011,0.4579,0.5179,0.4629
9,2390445.7454,24087855852691.223,4907938.0449,0.5318,0.4412,0.4203


In [26]:
final_model = finalize_model(blended_model)
prediction = predict_model(final_model, data = encoded_test_X)

In [30]:
y_pred = prediction['Label']

In [31]:
submission = pd.read_csv('sample_submission.csv')
submission['target'] = y_pred
submission.head()

Unnamed: 0,id,target
0,0,18289900.0
1,1,5452166.0
2,2,6524041.0
3,3,1544654.0
4,4,2550157.0


In [32]:
submission.to_csv('submit5.csv', index=False)