In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy as sc
import seaborn as sbn

df=pd.read_csv('trainimp.csv')
df_test=pd.read_csv('testimp.csv')

In [2]:
X_train = df.drop("SalePrice", axis=1)
y_train = df.SalePrice

X_test = df_test.copy()

In [3]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from catboost import CatBoostRegressor


In [5]:
## FOR CLASSIFICATION
from sklearn.metrics import precision_recall_curve,auc,fbeta_score
from sklearn.model_selection import StratifiedKFold


def get_scores(model,X,y):
  pred=np.round(model.predict(X))
  probs=model.predict_proba(X)[:,1]
  precision,recall,_=precision_recall_curve(y,probs)
  accu=accuracy_score(y,pred)
  pr_auc=auc(recall,precision)
  f2=fbeta_score(y,pred,beta=2)
  return pred,accu,pr_auc,f2


#Train model with KFold cross-validation

def train_model(model,X,y):
  accu_list,pr_auc_list,f2_list=[],[],[]
  kf=StratifiedKFold(n_splits=5,shuffle=False)
  for train,val in kf.split(X,y):
    X_train,y_train=X[train],y[train]
    X_val,y_val=X[val],y[val]
    model.fit(X_train,y_train)
    _,accu,pr_auc,f2=get_scores(model,X_val,y_val)
    accu_list.append(accu)
    pr_auc_list.append(pr_auc)
    f2_list.append(f2)
  print(f'Training Accuracy: {np.mean(accu_list):.3f}')
  print(f'Training PR_AUC: {np.mean(pr_auc_list):.3f}')
  print(f'Training F2: {np.mean(f2_list):.3f}')
  return model



In [17]:
from sklearn.feature_selection import RFE
folds = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

# step-2: specify range of hyperparameters to tune
hyper_params = [{

    
}]


# step-3: perform grid search
# 3.1 specify model
lm = CatBoostRegressor()
lm.fit(X_train, y_train)
rfe = RFE(lm)             

# 3.2 call GridSearchCV()
clf = GridSearchCV(estimator = rfe, 
                        param_grid = hyper_params, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True)      

# fit the model
clf.fit(X_train, y_train)                  

ValueError: Setting a random_state has no effect since shuffle is False. You should leave random_state to its default (None), or set shuffle=True.

In [16]:
y_pred = clf.predict(X_test_scaled)
submission = pd.read_csv("sample_submission.csv")
submission["SalePrice"] = y_pred[:1459]
submission.to_csv('submission.csv', index=False)



In [14]:
!kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m ""

Successfully submitted to House Prices - Advanced Regression Techniques



  0%|          | 0.00/34.4k [00:00<?, ?B/s]
 23%|██▎       | 8.00k/34.4k [00:00<00:00, 70.8kB/s]
100%|██████████| 34.4k/34.4k [00:01<00:00, 21.1kB/s]
