In [2]:
run -i import_file.py

## 준영 BEST 1

In [4]:
X_train  = pd.read_csv('X_train1.csv',encoding='cp949').drop(columns='Unnamed: 0')
X_test  = pd.read_csv('X_test1.csv',encoding='cp949').drop(columns='Unnamed: 0')
y_train = pd.read_csv('y_train.csv')['Salary']

### CATBOOST

In [6]:
categorical_features = list(X_train.select_dtypes(include="object").columns)
numeric_features = list(X_test.select_dtypes(exclude="object").columns)

In [7]:
def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):   
  kfold = KFold(n_splits = n_folds,shuffle=True, random_state = 20)
  

  train_fold_predict = np.zeros((X_train.shape[0],1))

  test_predict = np.zeros((X_test.shape[0], n_folds))
  print('model : ',model.__class__.__name__)
  
  for cnt, (train_index, valid_index) in enumerate(kfold.split(X_train)):
    X_train_ = X_train.iloc[train_index]
    y_train_ = y_train.iloc[train_index]
    X_valid = X_train.iloc[valid_index]

    # 학습
    model.fit(X_train_,y_train_)
    # 해당 폴드에서 학습된 모델에다가 검증 데이터 (X_valid)로 예측 후 저장
    train_fold_predict[valid_index,:] = model.predict(X_valid).reshape(-1,1)
    # 해당 폴드에서 생성된 모델에게 원본 테스트 데이터 (X_test)를 이용해서 예측하고 저장
    test_predict[:,cnt] = model.predict(X_test)
  
  # for 문이 끝나면 test_pred는 평균을 내서 하나로 합친다.
  test_predict_mean = np.mean(test_predict, axis=1).reshape(-1,1)
  
  return train_fold_predict, test_predict_mean

In [8]:
model = CatBoostRegressor(cat_features=categorical_features, verbose=False ,random_state=0)

In [9]:
cat_train , cat_test = get_stacking_data(model, X_train, y_train, X_test)

model :  CatBoostRegressor


In [10]:
cat_data = pd.concat([pd.DataFrame(cat_train), pd.DataFrame(cat_test)],axis=1)
cat_data.columns = ['modelC_cat_train','modelC_cat_test']
cat_data.to_csv('modelC_cat_data.csv')

### LGBM

In [11]:
def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):    
  kfold = KFold(n_splits = n_folds,shuffle=True, random_state = 1004)
  

  train_fold_predict = np.zeros((X_train.shape[0],1))

  test_predict = np.zeros((X_test.shape[0], n_folds))
  print('model : ',model.__class__.__name__)
  
  for cnt, (train_index, valid_index) in enumerate(kfold.split(X_train)):
    X_train_ = X_train.iloc[train_index]
    y_train_ = y_train.iloc[train_index]
    X_valid = X_train.iloc[valid_index]

    # 학습
    model.fit(X_train_,y_train_)
    # 해당 폴드에서 학습된 모델에다가 검증 데이터 (X_valid)로 예측 후 저장
    train_fold_predict[valid_index,:] = model.predict(X_valid).reshape(-1,1)
    # 해당 폴드에서 생성된 모델에게 원본 테스트 데이터 (X_test)를 이용해서 예측하고 저장
    test_predict[:,cnt] = model.predict(X_test)
  
  # for 문이 끝나면 test_pred는 평균을 내서 하나로 합친다.
  test_predict_mean = np.mean(test_predict, axis=1).reshape(-1,1)
  
  return train_fold_predict, test_predict_mean

In [12]:
# X_train.columns = list(map(str,X_train.columns))
# X_test.columns = list(map(str,X_test.columns))
# X_train.columns = X_train.columns+'temp'
# X_test.columns = X_test.columns+'temp'
categorical_features = list(X_train.select_dtypes(include="object").columns)
numeric_features = list(X_test.select_dtypes(exclude="object").columns)
X_train = X_train[numeric_features+categorical_features]
X_test = X_test[numeric_features+categorical_features]

In [13]:
def remove_outlier(X, q=0.05):  
    df = pd.DataFrame(X)
    return df.apply(lambda x: x.clip(x.quantile(q), x.quantile(1-q)), axis=0).values

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
#         ("outlier", FunctionTransformer(remove_outlier, kw_args={'q':0.05})),
#         ("scaler", PowerTransformer()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown='ignore', dtype=int)),
    ]
)

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer),
        ("selector", SelectPercentile(percentile=100)),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LGBMRegressor(random_state=0)),
    ]
)

set_config(display="diagram")  # To view the text pipeline, change to display='text'.
model

In [14]:
lgbm_train, lgbm_test = get_stacking_data(model, X_train, y_train, X_test)

model :  Pipeline


In [15]:
lgbm_data = pd.concat([pd.DataFrame(lgbm_train), pd.DataFrame(lgbm_test)],axis=1)
lgbm_data.columns = ['modelC_lgbm_train','modelC_lgbm_test']
lgbm_data.to_csv('modelC_lgbm_data.csv')

In [16]:
from sklearn.model_selection import cross_val_score, ShuffleSplit

sscv = ShuffleSplit(test_size=0.3334, n_splits=5,random_state=0)
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=sscv)

print("Default LM CV scores: ", np.sqrt(-1*scores))
print("Default LM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))

Default LM CV scores:  [837.29305265 822.68512059 848.10438903 841.73869491 827.7603641 ]
Default LM CV mean = 835.57 with std = 124.10


### Ridge

In [17]:
def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):      
  kfold = KFold(n_splits = n_folds,shuffle=True, random_state = 25)
  

  train_fold_predict = np.zeros((X_train.shape[0],1))

  test_predict = np.zeros((X_test.shape[0], n_folds))
  print('model : ',model.__class__.__name__)
  
  for cnt, (train_index, valid_index) in enumerate(kfold.split(X_train)):
    X_train_ = X_train.iloc[train_index]
    y_train_ = y_train.iloc[train_index]
    X_valid = X_train.iloc[valid_index]

    # 학습
    model.fit(X_train_,y_train_)
    # 해당 폴드에서 학습된 모델에다가 검증 데이터 (X_valid)로 예측 후 저장
    train_fold_predict[valid_index,:] = model.predict(X_valid).reshape(-1,1)
    # 해당 폴드에서 생성된 모델에게 원본 테스트 데이터 (X_test)를 이용해서 예측하고 저장
    test_predict[:,cnt] = model.predict(X_test)
  
  # for 문이 끝나면 test_pred는 평균을 내서 하나로 합친다.
  test_predict_mean = np.mean(test_predict, axis=1).reshape(-1,1)
  
  return train_fold_predict, test_predict_mean

In [18]:
def remove_outlier(X, q=0.05):  
    df = pd.DataFrame(X)
    return df.apply(lambda x: x.clip(x.quantile(q), x.quantile(1-q)), axis=0).values

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
#         ("outlier", FunctionTransformer(remove_outlier, kw_args={'q':0.05})),
#         ("scaler", PowerTransformer()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown='ignore', dtype=int)),
    ]
)

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer),
        ("selector", SelectPercentile(percentile=100)),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", Ridge(alpha=1.0)),
    ]
)

set_config(display="diagram")  # To view the text pipeline, change to display='text'.
model

In [19]:
ridge_train, ridge_test = get_stacking_data(model, X_train, y_train, X_test)

model :  Pipeline


In [20]:
ridge_data = pd.concat([pd.DataFrame(ridge_train), pd.DataFrame(ridge_test)],axis=1)
ridge_data.columns = ['modelC_ridge_train','modelC_ridge_test']
ridge_data.to_csv('modelC_ridge_data.csv')

In [21]:
from sklearn.model_selection import cross_val_score, ShuffleSplit

sscv = ShuffleSplit(test_size=0.3334, n_splits=5,random_state=0)
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=sscv)

print("Default LM CV scores: ", np.sqrt(-1*scores))
print("Default LM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))

Default LM CV scores:  [874.79451406 862.40047736 881.82715962 878.14297538 865.96435707]
Default LM CV mean = 872.66 with std = 113.07


## 준영 BEST 2

In [29]:
X_train  = pd.read_csv('X_train2.csv',encoding='cp949').drop(columns='Unnamed: 0')
X_test = pd.read_csv('X_test2.csv',encoding='cp949').drop(columns='Unnamed: 0')
y_train = pd.read_csv('y_train.csv')['Salary']

In [30]:
X_test['직무태그'] = X_test['직무태그'].fillna('없음')
X_test['첫직무태그'] = X_test['첫직무태그'].fillna('없음')

### CATBOOST

In [35]:
def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):      
  kfold = KFold(n_splits = n_folds,shuffle=True, random_state = 27)
  

  train_fold_predict = np.zeros((X_train.shape[0],1))

  test_predict = np.zeros((X_test.shape[0], n_folds))
  print('model : ',model.__class__.__name__)
  
  for cnt, (train_index, valid_index) in enumerate(kfold.split(X_train)):
    X_train_ = X_train.iloc[train_index]
    y_train_ = y_train.iloc[train_index]
    X_valid = X_train.iloc[valid_index]

    # 학습
    model.fit(X_train_,y_train_)
    # 해당 폴드에서 학습된 모델에다가 검증 데이터 (X_valid)로 예측 후 저장
    train_fold_predict[valid_index,:] = model.predict(X_valid).reshape(-1,1)
    # 해당 폴드에서 생성된 모델에게 원본 테스트 데이터 (X_test)를 이용해서 예측하고 저장
    test_predict[:,cnt] = model.predict(X_test)
  
  # for 문이 끝나면 test_pred는 평균을 내서 하나로 합친다.
  test_predict_mean = np.mean(test_predict, axis=1).reshape(-1,1)
  
  return train_fold_predict, test_predict_mean

In [36]:
categorical_features = list(X_train.select_dtypes(include="object").columns)
numeric_features = list(X_test.select_dtypes(exclude="object").columns)
model = CatBoostRegressor(cat_features=categorical_features, verbose=False ,random_state=0)

In [37]:
cat_train , cat_test = get_stacking_data(model, X_train, y_train, X_test)

model :  CatBoostRegressor


In [38]:
cat_data = pd.concat([pd.DataFrame(cat_train), pd.DataFrame(cat_test)],axis=1)
cat_data.columns = ['modelD_cat_train','modelD_cat_test']
cat_data.to_csv('modelD_cat_data.csv')

### LGBM

In [39]:
# X_train.columns = list(map(str,X_train.columns))
# X_test.columns = list(map(str,X_test.columns))
# X_train.columns = X_train.columns+'temp'
# X_test.columns = X_test.columns+'temp'
categorical_features = list(X_train.select_dtypes(include="object").columns)
numeric_features = list(X_test.select_dtypes(exclude="object").columns)
X_train = X_train[numeric_features+categorical_features]
X_test = X_test[numeric_features+categorical_features]

In [None]:
def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):      # 위에서 설명한 3-fold stacking 입니당
  kfold = KFold(n_splits = n_folds,shuffle=True, random_state = 27)
  

  train_fold_predict = np.zeros((X_train.shape[0],1))

  test_predict = np.zeros((X_test.shape[0], n_folds))
  print('model : ',model.__class__.__name__)
  
  for cnt, (train_index, valid_index) in enumerate(kfold.split(X_train)):
    X_train_ = X_train.iloc[train_index]
    y_train_ = y_train.iloc[train_index]
    X_valid = X_train.iloc[valid_index]

    # 학습
    model.fit(X_train_,y_train_)
    # 해당 폴드에서 학습된 모델에다가 검증 데이터 (X_valid)로 예측 후 저장
    train_fold_predict[valid_index,:] = model.predict(X_valid).reshape(-1,1)
    # 해당 폴드에서 생성된 모델에게 원본 테스트 데이터 (X_test)를 이용해서 예측하고 저장
    test_predict[:,cnt] = model.predict(X_test)
  
  # for 문이 끝나면 test_pred는 평균을 내서 하나로 합친다.
  test_predict_mean = np.mean(test_predict, axis=1).reshape(-1,1)
  
  return train_fold_predict, test_predict_mean

In [43]:
def remove_outlier(X, q=0.05):  
    df = pd.DataFrame(X)
    return df.apply(lambda x: x.clip(x.quantile(q), x.quantile(1-q)), axis=0).values

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
#         ("outlier", FunctionTransformer(remove_outlier, kw_args={'q':0.05})),
#         ("scaler", PowerTransformer()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown='ignore', dtype=int)),
    ]
)

column_transformer = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

preprocessor = Pipeline(
    steps=[
        ("column", column_transformer),
        ("selector", SelectPercentile(percentile=100)),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LGBMRegressor(random_state=0)),
    ]
)

set_config(display="diagram")  # To view the text pipeline, change to display='text'.
model

In [41]:
lgbm_train, lgbm_test = get_stacking_data(model, X_train, y_train, X_test)

model :  Pipeline


In [42]:
lgbm_data = pd.concat([pd.DataFrame(lgbm_train), pd.DataFrame(lgbm_test)],axis=1)
lgbm_data.columns = ['modelD_lgbm_train','modelD_lgbm_test']
lgbm_data.to_csv('modelD_lgbm_data.csv')

In [44]:
from sklearn.model_selection import cross_val_score, ShuffleSplit

sscv = ShuffleSplit(test_size=0.3334, n_splits=5,random_state=0)
scores = cross_val_score(model, X_train, y_train, scoring='neg_mean_squared_error', cv=sscv)

print("Default LM CV scores: ", np.sqrt(-1*scores))
print("Default LM CV mean = %.2f" % np.sqrt(-1*scores.mean()), "with std = %.2f" % np.sqrt(scores.std()))

Default LM CV scores:  [848.59829142 834.45323841 857.62415867 844.47914948 834.11045214]
Default LM CV mean = 843.90 with std = 122.63
