In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

        
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/DontGetKicked/example_entry.csv
/kaggle/input/DontGetKicked/training.zip
/kaggle/input/DontGetKicked/Carvana_Data_Dictionary.txt
/kaggle/input/DontGetKicked/test.zip
/kaggle/input/DontGetKicked/training.csv
/kaggle/input/DontGetKicked/test.csv


In [2]:
pd.options.display.max_columns = 99

In [3]:
train = pd.read_csv('/kaggle/input/DontGetKicked/training.csv')
test = pd.read_csv('/kaggle/input/DontGetKicked/test.csv')

In [4]:
def preprocessing(data):
    train = data.copy()
    train.drop(['PRIMEUNIT','AUCGUART'],axis=1, inplace = True)
    for i in range(0,4):   # 결측치 처리
        m = train[train['WheelTypeID']== i].Model.unique()
        train.loc[(train['Model'].isin(m)) & (np.nan != np.nan), 'WheelTypeID']= i
    
    for i in ['Alloy','Covers','Special']: # 결측치 처리
        m = train[train['WheelType']== i].Model.unique()
        train.loc[(train['Model'].isin(m)) & (np.nan != np.nan), 'WheelType']= i
        
    k = train.groupby(['Model','Trim']).count().reset_index()
    t = train.loc[train['Trim'] != train['Trim'],'Model'].unique()

    for col in tqdm(t):
        m = k[k['Model'] == col].RefId.max()
        p = k[(k['Model'] == col)&(k['RefId'] == m)].Trim.values
        if not p.size == 0: 
            train.loc[(train['Trim'] != train['Trim']) & (train['Model'] == col),'Trim'] = p[0]
    
    k = train.groupby(['Make','Trim']).count().reset_index()
    t = train.loc[train['Trim'] != train['Trim'],'Make'].unique()

    for col in tqdm(t):
        m = k[k['Make'] == col].RefId.max()
        p = k[(k['Make'] == col)&(k['RefId'] == m)].Trim.values
        if not p.size == 0: 
            train.loc[(train['Trim'] != train['Trim']) & (train['Make'] == col),'Trim'] = p[0] 
    
    train.loc[train['Trim']!=train['Trim'],'Trim'] = 'Bas'
    
    k = train.groupby(['Model','SubModel']).count().reset_index()
    t = train.loc[train['SubModel'] != train['SubModel'],'Model'].unique()

    for col in tqdm(t):
        m = k[k['Model'] == col].RefId.max()
        p = k[(k['Model'] == col)&(k['RefId'] == m)].Trim.values
        if not p.size == 0: 
            train.loc[(train['SubModel'] != train['SubModel']) & (train['Model'] == col),'SubModel'] = p[0]
    
    k = train.groupby('Model')['MMRCurrentAuctionAveragePrice','MMRCurrentRetailAveragePrice','MMRCurrentRetailCleanPrice','MMRCurrentAuctionCleanPrice'].mean().reset_index()
    t = train[train.MMRCurrentAuctionAveragePrice.isnull()].Model.unique()

    for col in tqdm(t):
        train.loc[(train['MMRCurrentAuctionAveragePrice'] != train['MMRCurrentAuctionAveragePrice']) & (train['Model'] == col),'MMRCurrentAuctionAveragePrice'] = k[k.Model==col].MMRCurrentAuctionAveragePrice.values[0]
        train.loc[(train['MMRCurrentRetailAveragePrice'] != train['MMRCurrentRetailAveragePrice']) & (train['Model'] == col),'MMRCurrentRetailAveragePrice'] = k[k.Model==col].MMRCurrentRetailAveragePrice.values[0]
        train.loc[(train['MMRCurrentRetailCleanPrice'] != train['MMRCurrentRetailCleanPrice']) & (train['Model'] == col),'MMRCurrentRetailCleanPrice'] = k[k.Model==col].MMRCurrentRetailCleanPrice.values[0]
        train.loc[(train['MMRCurrentAuctionCleanPrice'] != train['MMRCurrentAuctionCleanPrice']) & (train['Model'] == col),'MMRCurrentAuctionCleanPrice'] = k[k.Model==col].MMRCurrentAuctionCleanPrice.values[0]
    
     # 결측치 처리
    train.loc[train['Nationality']!=train['Nationality'], 'Nationality'] = 'OTHER'
    train.loc[train['WheelType']!=train['WheelType'], 'WheelType'] = 'Alloy'
    train.loc[train['Color']!=train['Color'], 'Color'] = 'SILVER'
    train.loc[train['Size']!=train['Size'], 'Size'] = 'MEDIUM'
    train.loc[train['TopThreeAmericanName']!=train['TopThreeAmericanName'], 'TopThreeAmericanName'] = 'OTHER'
    train.loc[train['Transmission']!=train['Transmission'], 'Transmission'] = 'AUTO'    
    
    train.loc[train.MMRCurrentAuctionAveragePrice != train.MMRCurrentAuctionAveragePrice, 'MMRCurrentAuctionAveragePrice'] = train.VehBCost * 0.8# 평균 값으로 샀다고 가정
    train.loc[train.MMRCurrentRetailAveragePrice != train.MMRCurrentRetailAveragePrice, 'MMRCurrentRetailAveragePrice'] = train.VehBCost * 0.8
    train.loc[train.MMRCurrentRetailCleanPrice != train.MMRCurrentRetailCleanPrice, 'MMRCurrentRetailCleanPrice'] = train.VehBCost * 0.8
    train.loc[train.MMRCurrentAuctionCleanPrice != train.MMRCurrentAuctionCleanPrice, 'MMRCurrentAuctionCleanPrice'] = train.VehBCost * 0.8

    train.loc[train.MMRAcquisitionAuctionAveragePrice != train.MMRAcquisitionAuctionAveragePrice, 'MMRAcquisitionAuctionAveragePrice'] = train.VehBCost  # 평균 값으로 샀다고 가정
    train.loc[train.MMRAcquisitionRetailAveragePrice != train.MMRAcquisitionRetailAveragePrice, 'MMRAcquisitionRetailAveragePrice'] = train.VehBCost 
    train.loc[train.MMRAcquisitonRetailCleanPrice != train.MMRAcquisitonRetailCleanPrice, 'MMRAcquisitonRetailCleanPrice'] = train.VehBCost 
    train.loc[train.MMRAcquisitionAuctionCleanPrice != train.MMRAcquisitionAuctionCleanPrice, 'MMRAcquisitionAuctionCleanPrice'] = train.VehBCost
    
    train = train.fillna(train.mode())
    
    return train

PCA, Oversampling, SelectModel, Scaling 전부 시도 해봤지만 결과는 좋지 못함

상관계수가 높은 건 학습에 악영향을 끼친다. 삭제 요망
변수들 중 서로 상관 관계가 높은 것들 중에도 하나씩 삭제하면 점수가 오르는 것 같다

중요도가 높은 변수끼리 파생 변수를 만들면 성능이 올라간다?

In [5]:
def featuring(data,is_test):
    train1 = data.copy()
    train1 = train1.drop(['VNZIP1','RefId','BYRNO','WheelTypeID'], axis=1)
    
    train1['BuyPerAvg'] = train1['VehBCost'] / (train1['MMRAcquisitionAuctionAveragePrice']+1)  # 평균 상태 경매 가격 대비
    train1['BuyPerClean'] = train1['VehBCost'] /  (train1['MMRAcquisitionAuctionCleanPrice']+1) # 좋은 상태 경매 가격 대비
    train1['BuyPerRetailAvg'] = train1['VehBCost'] / (train1['MMRAcquisitionRetailAveragePrice']+1) # 평균 상태 소매 가격 대비
    train1['BuyPerRetailClean'] = train1['VehBCost'] / (train1['MMRAcquisitonRetailCleanPrice']+1) # 좋은 상태 소매 가격 대비

    train1['AuctionPerRetailAvg'] = train1['MMRAcquisitionAuctionAveragePrice'] / (train1['MMRAcquisitionRetailAveragePrice']+1) # 평균 상태 경매와 소매 대비
    train1['AuctionPerRetailClean'] = train1['MMRAcquisitionAuctionCleanPrice'] / (train1['MMRAcquisitonRetailCleanPrice']+1) # 좋은 상태 경매와 소매 대비

    train1['BuyPerCurrActionAvg'] = train1['MMRAcquisitionAuctionAveragePrice'] / (train1['MMRCurrentAuctionAveragePrice']+1) # 과거 대비 현재
    train1['BuyPerCurrActionClean'] = train1['MMRAcquisitionAuctionCleanPrice'] / (train1['MMRCurrentAuctionCleanPrice']+1)
    train1['BuyPerCurrRetailAvg'] = train1['MMRAcquisitionRetailAveragePrice'] / (train1['MMRCurrentRetailAveragePrice']+1)
    train1['BuyPerCurrRetailClean'] = train1['MMRAcquisitonRetailCleanPrice'] / (train1['MMRCurrentRetailCleanPrice']+1)

    train1['VehOdoPerYears'] = train1['VehOdo'] / (train1['VehicleAge']+1)
    train1.loc[train1['VehOdoPerYears'] != train1['VehOdoPerYears'], 'VehOdoPerYears'] = train1['VehOdo']
    
    train1['PurchDate'] = pd.to_datetime(train1['PurchDate']) 
    train1['mon'] = train1['PurchDate'].dt.year.astype('str') + '/' + train1['PurchDate'].dt.month.astype('str')
    train1['year'] = train1['PurchDate'].dt.year.astype('str')
    
    # ===================================================== 0724
    avg = train1.groupby(['VNST','Model'])['VehBCost','WarrantyCost'].mean().reset_index()
    train1 = train1.merge(avg,how='left',on=['VNST','Model'],suffixes=['','PerVNST'])
    train1['WarrantPerVNSTPerc'] = train1['WarrantyCost'] / (train1['WarrantyCostPerVNST']+1)
    train1['VehBCostPerVNSTPerc'] = train1['VehBCost'] / (train1['VehBCostPerVNST']+1) 
    
    train1['WarrantyPerc'] = train1['WarrantyCost'] / train1['VehBCost']
    
    train1['WarrantyPerAuctionAvg'] = train1['WarrantyCost'] / (train1['MMRAcquisitionAuctionAveragePrice']+1)
    train1['WarrantyPerAuctionClean'] = train1['WarrantyCost'] / (train1['MMRAcquisitionAuctionCleanPrice']+1)
    train1['WarrantyPerRetailAvg'] = train1['WarrantyCost'] / (train1['MMRAcquisitionRetailAveragePrice']+1)
    train1['WarrantyPerRetailClean'] = train1['WarrantyCost'] / (train1['MMRAcquisitonRetailCleanPrice']+1)
     
    train1['VehOdoPerAuctionAvg'] = train1['VehOdo'] / (train1['MMRAcquisitionAuctionAveragePrice']+1)
    train1['VehOdoPerAuctionClean'] = train1['VehOdo'] / (train1['MMRAcquisitionAuctionCleanPrice']+1)
    train1['VehOdoPerRetailAvg'] = train1['VehOdo'] / (train1['MMRAcquisitionRetailAveragePrice']+1)
    train1['VehOdoPerRetailClean'] = train1['VehOdo'] / (train1['MMRAcquisitonRetailCleanPrice']+1)
     
    # ======================================================= 0727
    
    train1 = train1.drop(['WarrantyCostPerVNST','AuctionPerRetailAvg','BuyPerClean'],axis=1)
    
    train1['CurrAvgCleanMeanA'] = (train1['BuyPerCurrActionAvg'] + train1['BuyPerCurrActionClean']).mean()
    train1['CurrAvgCelanMeanR'] = (train1['BuyPerCurrRetailClean'] + train1['BuyPerCurrRetailAvg']).mean()
    
    train1['CurrAvgCleanDivA'] = train1['BuyPerCurrActionClean'] - train1['BuyPerCurrActionAvg']   # 과거 대비 현재 비율
    train1['CurrAvgCleanDivR'] = train1['BuyPerCurrRetailClean'] - train1['BuyPerCurrRetailAvg']
    
    train1['CurrAcReDivC'] = train1['BuyPerCurrRetailClean'] - train1['BuyPerCurrActionClean']
    train1['CurrAcReDivA'] = train1['BuyPerCurrRetailAvg'] - train1['BuyPerCurrActionAvg']
    
    train1['CurrAvgCleanPerA'] = train1['BuyPerCurrActionClean'] / (train1['BuyPerCurrActionAvg']+0.1) 
    train1['CurrAvgCleanPerR'] = train1['BuyPerCurrRetailClean'] / (train1['BuyPerCurrRetailAvg']+0.1)
    
    train1['CurrAcRePerC'] = train1['BuyPerCurrRetailClean'] / (train1['BuyPerCurrActionClean']+0.1)
    train1['CurrAcRePerA'] = train1['BuyPerCurrRetailAvg'] / (train1['BuyPerCurrActionAvg']+0.1)
    # ======================================================= 0728
    
    #avg = train1.groupby('Model')['CurrAvgCleanDivA','CurrAvgCleanDivR','CurrAcReDivC',
    #                              'CurrAcReDivA','CurrAvgCleanPerA','CurrAvgCleanPerR','CurrAcRePerC','CurrAcRePerA'].mean().reset_index()
    #train1 =  train1.merge(avg,how='left',on='Model',suffixes=['','_Modelmean'])
    
    # ======================================================== 0731
    
    train1['CurrBuyAvgMean'] = (train1['BuyPerCurrActionAvg'] + train1['BuyPerCurrRetailAvg']).mean()
    train1['CurrBuyCleanMean'] = (train1['BuyPerCurrRetailClean'] + train1['BuyPerCurrActionClean']).mean()
    
    
    train1['BuyCurrActionAvgMean'] = (train1['MMRAcquisitionAuctionAveragePrice'] + train1['MMRCurrentAuctionAveragePrice'])/2
    train1['BuyCurrActionCleanMean'] = (train1['MMRAcquisitionAuctionCleanPrice'] + train1['MMRCurrentAuctionCleanPrice'])/2
    train1['BuyCurrRetailAvgMean'] = (train1['MMRAcquisitionRetailAveragePrice'] + train1['MMRCurrentRetailAveragePrice'])/2
    train1['BuyCurrRetailCleanMean'] = (train1['MMRAcquisitonRetailCleanPrice'] + train1['MMRCurrentRetailCleanPrice'])/2
    
    
    # BuyPerCurrActionAvg,BuyPerCurrActionClean,BuyPerCurrRetailClean,BuyPerCurrRetailAvg 중요도 기여도 모두 높음
    # mon, PurchDate 는 빼면 성능 저하
    # WarrantyCostPerVNST는 안빼면 성능 저하
    #0.0093 ± 0.0022	WarrantyPerc # 15    보증 / 가격
    #0.0079 ± 0.0007	VehBCost #  18
    #0.0070 ± 0.0042	Trim # 19
    #0.0069 ± 0.0053	VehicleAge # 14
    #0.0062 ± 0.0020	VehOdo # 12
    #0.0057 ± 0.0009	VehYear # 13
    #0.0055 ± 0.0023	VehBCostPerVNSTPerc #  10    가격 / 주 평균 가격
    #0.0044 ± 0.0034	VehOdoPerYears # 8     주행거리 / 연식
    return train1

In [6]:
#train_df

In [7]:
train_df1 = preprocessing(train)
test_df1 = preprocessing(test)

100%|██████████| 152/152 [00:01<00:00, 79.94it/s]
100%|██████████| 18/18 [00:00<00:00, 33.86it/s]
100%|██████████| 7/7 [00:00<00:00, 30.84it/s]
100%|██████████| 101/101 [00:05<00:00, 17.14it/s]
100%|██████████| 127/127 [00:01<00:00, 112.81it/s]
100%|██████████| 16/16 [00:00<00:00, 48.18it/s]
100%|██████████| 5/5 [00:00<00:00, 43.61it/s]
100%|██████████| 82/82 [00:03<00:00, 24.32it/s]


In [8]:
#train_df.corr()[['CurrAvgCleanMeanA','CurrAvgCelanMeanR']]

#BuyPerCurrActionAvg                  0.989101
#BuyPerCurrRetailAvg                  0.979584
#BuyPerCurrRetailClean                0.981653

In [9]:
#train_df
#train_df.corr()['BuyPerCurrActionAvg']

#BuyPerCurrActionClean                0.989101
#BuyPerCurrRetailAvg                  0.985075
#BuyPerCurrRetailClean                0.988058

In [10]:
train_df = featuring(train_df1,False)
test_df = featuring(test_df1,True)

category = train_df.columns[train_df.dtypes == 'object'].to_list()

train_df['SubModel'] = train_df['SubModel'].astype('str')
test_df['SubModel'] = test_df['SubModel'].astype('str')

for col in category:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col].astype('str'))
    
    for c in test_df[col].unique():
        if c not in le.classes_:
            le.classes_ = np.append(le.classes_,c)
    
    test_df[col] = le.transform(test_df[col].astype('str'))
    
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')
    
y = train_df['IsBadBuy']
X = train_df.drop(['IsBadBuy'],axis=1)

X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=42)

In [11]:
#!pip install autogluon

In [12]:
#from autogluon.tabular import TabularDataset, TabularPredictor

#train_auto = TabularDataset(train_df)
#tab = TabularPredictor(label='IsBadBuy',eval_metric='f1')
#tab.fit(train_df)

In [13]:
#from sklearn.model_selection import GridSearchCV
#from catboost import CatBoostClassifier
#cat =  CatBoostClassifier(random_state=42,eval_metric='F1',cat_features=category)
#param = {
#    'learning_rate': [0.01,0.05,0.1,0.3],
#    'n_estimators': [500,1000,2000,3000,5000],
#    'max_depth': [4,6,8]
#}
#
#grid = GridSearchCV(estimator=cat,param_grid=param,n_jobs=-1,cv=2)
#grid.fit(X,y,verbose=100,eval_set=[(X_val,y_val)])
#print(grid.best_params_)

In [14]:
#print(X.dtypes)

In [15]:
category = X.columns[X.dtypes == 'category'].to_list()

cat = CatBoostClassifier(learning_rate=0.03,random_state=42, n_estimators=1000, max_depth=8,eval_metric='F1')  # learning_rate = 0.01, n_estimators=3000, max_depth=8
#cat.fit(X_train,y_train,eval_set=[(X_val,y_val)],cat_features=category,verbose=100,early_stopping_rounds=100) # 0.2692...
cat.fit(X,y,cat_features=category,verbose=100)

0:	learn: 0.0024474	total: 338ms	remaining: 5m 37s
100:	learn: 0.1664973	total: 25.2s	remaining: 3m 44s
200:	learn: 0.2240705	total: 52.5s	remaining: 3m 28s
300:	learn: 0.2502404	total: 1m 19s	remaining: 3m 4s
400:	learn: 0.2692746	total: 1m 47s	remaining: 2m 39s
500:	learn: 0.2867343	total: 2m 14s	remaining: 2m 13s
600:	learn: 0.3060201	total: 2m 41s	remaining: 1m 47s
700:	learn: 0.3229071	total: 3m 8s	remaining: 1m 20s
800:	learn: 0.3356465	total: 3m 36s	remaining: 53.7s
900:	learn: 0.3501720	total: 4m 4s	remaining: 26.8s
999:	learn: 0.3630482	total: 4m 31s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x78b1fc3cb3a0>

In [16]:
"""
def plot_feature_importance(importance,names,model_type):
    
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)
    
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    plt.figure(figsize=(8,8))

    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')
plot_feature_importance(cat.get_feature_importance(),X.columns,'CATBOOST')
"""


"\ndef plot_feature_importance(importance,names,model_type):\n    \n    feature_importance = np.array(importance)\n    feature_names = np.array(names)\n    \n    data={'feature_names':feature_names,'feature_importance':feature_importance}\n    fi_df = pd.DataFrame(data)\n    \n    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)\n\n    plt.figure(figsize=(8,8))\n\n    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])\n\n    plt.title(model_type + ' Feature Importance')\n    plt.xlabel('Feature Importance')\n    plt.ylabel('Feature Names')\nplot_feature_importance(cat.get_feature_importance(),X.columns,'CATBOOST')\n"

In [17]:
#import eli5
#from eli5.sklearn import PermutationImportance

#perm = PermutationImportance(cat, scoring = "f1", random_state = 42).fit(X_val,y_val)
#eli5.show_weights(perm,top=80,feature_names = X_val.columns.tolist())
#m = perm.feature_importances_ > 0
#X = X[X.columns[m]]
#X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=42)

In [18]:
p_cat = cat.predict(X_val)
print(f1_score(p_cat,y_val))

0.34447539461467036


In [19]:
sub = pd.read_csv('/kaggle/input/DontGetKicked/example_entry.csv')

In [20]:
pred = cat.predict_proba(test_df)[:,1]
sub['IsBadBuy'] = pred
sub.to_csv('submission_cat.csv',index=False)

In [21]:
test_df = test_df.drop('PurchDate',axis=1)

X = X.drop('PurchDate',axis=1)
X_train = X_train.drop('PurchDate',axis=1)
X_val = X_val.drop('PurchDate',axis=1)

In [22]:
"""
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

lgb =  LGBMClassifier(boost_from_average=False,random_state=42)
param = {
    'learning_rate': [0.01,0.05,0.1,0.3],
    'n_estimators': [500,1000,2000,3000,5000],
    'max_depth': [4,6,8]
}
grid = GridSearchCV(estimator=lgb,param_grid=param,n_jobs=-1,cv=2)
grid.fit(X,y,verbose=100,eval_set=[(X_val,y_val)],eval_metric='F1')
print(grid.best_params_)
"""

"\nfrom lightgbm import LGBMClassifier\nfrom sklearn.model_selection import GridSearchCV\n\nlgb =  LGBMClassifier(boost_from_average=False,random_state=42)\nparam = {\n    'learning_rate': [0.01,0.05,0.1,0.3],\n    'n_estimators': [500,1000,2000,3000,5000],\n    'max_depth': [4,6,8]\n}\ngrid = GridSearchCV(estimator=lgb,param_grid=param,n_jobs=-1,cv=2)\ngrid.fit(X,y,verbose=100,eval_set=[(X_val,y_val)],eval_metric='F1')\nprint(grid.best_params_)\n"

In [23]:
from xgboost import XGBClassifier
#xgb = XGBClassifier(random_state=42, learning_rate=0.01, max_depth=8, n_estimators=800,eval_metric='logloss',enable_categorical=True, tree_method='gpu_hist')
#xgb.fit(X_train,y_train,eval_set=[(X_val,y_val)],early_stopping_rounds=100,verbose=100)
#xgb.fit(X,y) # 0.3129

In [24]:
#p_xgb = xgb.predict(X_val)
#print(f1_score(p_xgb,y_val))

In [25]:
#pred_xgb = xgb.predict_proba(test_df)[:,1]
#sub3 = sub.copy()
#sub3['IsBadBuy'] = pred_xgb

In [26]:
lgb = LGBMClassifier(boost_from_average = False,random_state=42, n_estimators= 2000, learning_rate=0.01, max_depth=4)#,learning_rate=0.1)
#lgb.fit(X_train,y_train,eval_set=[(X_val,y_val)], eval_metric='F1',verbose=100, early_stopping_rounds=100) # {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 2000}
lgb.fit(X,y) # 0.3108

In [27]:
p_lgb = lgb.predict(X_val)
print(f1_score(p_lgb,y_val))

0.344731977818854


In [28]:
pred_light = lgb.predict_proba(test_df)[:,1]
sub1 = sub.copy()
sub1['IsBadBuy'] = pred_light
#sub1.to_csv('submission_lgb.csv',index=False)

In [29]:
#sub2 = sub.copy()
#sub2['IsBadBuy'] = sub['IsBadBuy'] * 0.8 + sub3['IsBadBuy'] * 0.2
#sub2.to_csv('sub_cat8_xg2.csv',index=False)

In [30]:
sub2 = sub.copy()
sub2['IsBadBuy'] = sub['IsBadBuy'] * 0.8 + sub1['IsBadBuy'] * 0.2
sub2.to_csv('sub_cat8_lgb2.csv',index=False)

In [31]:
#sub4 = sub.copy()
#sub4['IsBadBuy'] = sub['IsBadBuy'] * 0.8 + sub1['IsBadBuy'] * 0.1 + sub3['IsBadBuy'] * 0.1
#sub4.to_csv('sub_cat8_lgb1_xgb1.csv',index=False)

In [32]:
#sub2 = sub.copy()
#sub2['IsBadBuy'] = sub['IsBadBuy'] * 0.9 + sub1['IsBadBuy'] * 0.1
#sub2.to_csv('sub_cat_9.csv',index=False)