In [2]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Common imports for data
import numpy as np
import pandas as pd
import os

# for metrics, preprocessing
import sklearn


# models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot 
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt


In [3]:
from google.colab import drive
drive.mount("/content/gdrive")
#download the data
os.chdir('/content/gdrive/My Drive/Data_Edv_P_P5/')
os.listdir()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


['Property_test_share.csv', 'Property_train.csv']

In [4]:
import numpy as np
import pandas as pd
train=pd.read_csv("Property_train.csv")
test=pd.read_csv('Property_test_share.csv')

In [5]:
train=pd.read_csv("Property_train.csv", na_values=["missing"])
test=pd.read_csv('Property_test_share.csv',na_values=["missing"])

In [6]:
train.columns
df=train.copy()

In [7]:
abs(df.corr()["Junk"]).sort_values(ascending=False)

Junk                     1.000000
PropertyAge              0.166533
BuildYear                0.158773
PriceIndex5              0.107274
PriceIndex1              0.106897
PriceIndex7              0.102728
PriceIndex6              0.102215
PriceIndex2              0.100668
PriceIndex8              0.098974
PriceIndex9              0.097962
PriceIndex3              0.086481
PriceIndex4              0.082716
NormalisedPopulation     0.079284
InsurancePremiumIndex    0.055588
Zip                      0.005595
ExpeditedListing         0.004631
Name: Junk, dtype: float64

In [8]:
price_cols=["PriceIndex1","PriceIndex2","PriceIndex3","PriceIndex4","PriceIndex5",
            "PriceIndex6","PriceIndex7","PriceIndex8","PriceIndex9","NormalisedPopulation",
            "InsurancePremiumIndex","PropertyAge","BuildYear"]

In [9]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

  import pandas.util.testing as tm


In [10]:
df_num=train[price_cols].copy()
df_new=df_num[df_num["PriceIndex8"].isnull()==df_num["PriceIndex9"].isnull()]
df_new=df_new.drop(["PriceIndex6","PriceIndex4","PriceIndex8","PriceIndex1",
                    "PriceIndex7","PriceIndex2","BuildYear","PriceIndex5","PriceIndex3",
                    "NormalisedPopulation"],axis=1)
calc_vif(df_new)

Unnamed: 0,variables,VIF
0,PriceIndex9,5.077907
1,InsurancePremiumIndex,5.603358
2,PropertyAge,5.363255


### Column Transformer

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

ListDate_ix = 0

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    def __init__(self, add_Adow_sin = True): # no *args or **kargs
        self.add_Adow_sin = add_Adow_sin
        
    def fit(self, X, y=None):
        return self # nothing else to do 
    
    def transform(self, X):
        Ldate = pd.to_datetime(X[:,ListDate_ix], format='%m/%d/%Y')
        
        L_day=Ldate.day
        L_dofweek=Ldate.dayofweek
        L_month=Ldate.month
        
        sin_L_day= np.sin(2 * np.pi * (L_day-L_day.min())/(L_day.max()-L_day.min()))
        cos_L_day= np.sin(2 * np.pi * (L_day-L_day.min())/(L_day.max()-L_day.min()))
        
        sin_L_dofweek = np.sin(2 * np.pi * (L_dofweek-L_dofweek.min())/(L_dofweek.max()-L_dofweek.min()))
        cos_L_dofweek = np.cos(2 * np.pi * (L_dofweek-L_dofweek.min())/(L_dofweek.max()-L_dofweek.min()))
        
        sin_L_month = np.sin(2 * np.pi * (L_month-L_month.min())/(L_month.max()-L_month.min()))
        cos_L_month = np.cos(2 * np.pi * (L_month-L_month.min())/(L_month.max()-L_month.min()))
        
        return np.c_[sin_L_month,cos_L_month]

attr_adder = CombinedAttributesAdder()

### Date.Month 

In [12]:
# date cols
# for second attempt i opened std scaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

date_cols=["ListDate"]
df_train_dates=train[date_cols].copy()

date_pipeline = Pipeline([            
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('attribs_adder', CombinedAttributesAdder()),
    ('std_scaler', StandardScaler()),
])

df_train_dates_tr = date_pipeline.fit_transform(df_train_dates)
print(df_train_dates_tr.shape)
calc_vif(pd.DataFrame(df_train_dates_tr))

(62035, 2)


Unnamed: 0,variables,VIF
0,0,1.000027
1,1,1.000027


### Num.pipeline

In [13]:
# numcols
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_cols=["PriceIndex9","InsurancePremiumIndex"]#,"PropertyAge"]
df_train_num=train[num_cols].copy()

num_pipeline = Pipeline([            
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('std_scaler', StandardScaler()),
])

df_train_num_tr = num_pipeline.fit_transform(df_train_num)
df_train_num_tr.shape

(62035, 2)

## Binary pipeline

# not using 
bin_cols=["Region","ExpeditedListing"]

df_bincols=train[bin_cols].copy()

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
impute=SimpleImputer(strategy="most_frequent")

binary_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ("binary_cols",OneHotEncoder(drop="first")),
])

df_train_binary_tr=binary_pipeline.fit_transform(df_bincols)
print(df_train_binary_tr.shape)

In [14]:
train.columns

Index(['Junk', 'InteriorsStyle', 'PriceIndex8', 'ListDate', 'Material',
       'PriceIndex9', 'Agency', 'AreaIncomeType', 'EnvRating', 'PriceIndex7',
       'ExpeditedListing', 'PriceIndex4', 'PriceIndex1', 'PriceIndex6',
       'PRIMEUNIT', 'Channel', 'Zip', 'InsurancePremiumIndex', 'PlotType',
       'Architecture', 'PriceIndex3', 'Region', 'PriceIndex5', 'SubModel',
       'Facade', 'State', 'NormalisedPopulation', 'BuildYear', 'RegionType',
       'PropertyAge', 'PriceIndex2'],
      dtype='object')

## Cat Pipeline

In [15]:
cat_cols=['EnvRating','PRIMEUNIT','PropertyAge',"Region","ExpeditedListing",'Facade','State',"RegionType",
          "Agency","Channel","Architecture","PlotType"]

df_catcols=train[cat_cols].copy()

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
impute=SimpleImputer(strategy="constant")

cats_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="constant",fill_value='missing')),
    ("cat_cols",OneHotEncoder()),
])

cats_pipeline.fit_transform(df_catcols)

<62035x134 sparse matrix of type '<class 'numpy.float64'>'
	with 744420 stored elements in Compressed Sparse Row format>

## FullPipeline

In [16]:
from sklearn.compose import ColumnTransformer 

date_attribs =["ListDate"]
num_attribs = num_cols
cat_attribs = cat_cols
#binary_attribs=bin_cols
     
full_pipeline = ColumnTransformer([
    ("date.time", date_pipeline, date_attribs), #6
    ("num_cols", num_pipeline, num_attribs),    # 11
    #("binary_cols",binary_pipeline,binary_attribs),#2
    ("cat_cols", cats_pipeline, cat_attribs), #110
])


In [17]:
df=train.copy()
full_pipeline.fit_transform(df)

<62035x138 sparse matrix of type '<class 'numpy.float64'>'
	with 992560 stored elements in Compressed Sparse Row format>

## Prepare Data

In [18]:
# for second attempt i opened std scaler
X_train = train.drop(["Junk"],axis=1).copy()
y_train = train["Junk"].copy()

X_test = test.copy()

In [19]:
X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

X_train_prepared.shape, X_test_prepared.shape

((62035, 138), (10948, 138))

In [22]:
## Testing on Models

In [23]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=0,max_iter=1000)
log_reg.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

roc-auc:   [0.65960316 0.67226111 0.69328401 0.68457238 0.68468887 0.66309964
 0.67472115 0.69461875 0.68246795 0.68559678]


In [24]:
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(dt_clf, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

roc-auc:   [0.53090486 0.5370374  0.5344405  0.53454002 0.53958852 0.53433611
 0.54051873 0.53646052 0.54307984 0.53070445]


In [25]:
from sklearn.ensemble import GradientBoostingClassifier
gbm_clf=GradientBoostingClassifier(loss="deviance")

gbm_clf.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(gbm_clf, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

roc-auc:   [0.66119787 0.68328314 0.69758244 0.69012364 0.68990527 0.6677524
 0.68333591 0.70234862 0.68415384 0.6853791 ]


In [27]:
display(train["Junk"].value_counts()/len(train))
0.877456/0.122544

0    0.877456
1    0.122544
Name: Junk, dtype: float64

7.160334247290769

In [28]:
xgb_clf=XGBClassifier(scale_pos_weight=7,max_delta_step=2)

xgb_clf.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_clf, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

roc-auc:   [0.65937331 0.68044648 0.69831889 0.69155817 0.68876709 0.66882415
 0.6855934  0.70325877 0.68222065 0.68949846]


In [30]:
xgb_clf=XGBClassifier(max_delta_step=2, n_estimators=1000,scale_pos_weight=3 )

xgb_clf.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_clf, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

roc-auc:   [0.661969   0.68171732 0.69925884 0.6911393  0.69496826 0.6746531
 0.68483627 0.70137683 0.68860427 0.69128129]


In [31]:
xgb_clf=XGBClassifier(max_delta_step=2, n_estimators=1000,scale_pos_weight=3 )

xgb_clf.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_clf, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

roc-auc:   [0.66336648 0.67772608 0.7009466  0.689607   0.68917014 0.67217672
 0.68776906 0.69976551 0.68894935 0.69139165]


In [20]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidate = np.flatnonzero(results['rank_test_score'] == i)[0]
        print("Model with rank: {0}".format(i))
        print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
            results['mean_test_score'][candidate],
            results['std_test_score'][candidate]))
        print("Parameters: {0}".format(results['params'][candidate]))
        print("")

from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

xgb_params = {"n_estimators":[100,500,1000,1500],}

xgb1=XGBClassifier(subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)

grid_search=GridSearchCV(xgb1,cv=5,param_grid=xgb_params,

scoring='roc_auc',verbose=False,n_jobs=-1)

grid_search.fit(X_train_prepared, y_train)

report(grid_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.69065 (std: 0.00928)
Parameters: {'n_estimators': 500}

Model with rank: 2
Mean validation score: 0.68797 (std: 0.00875)
Parameters: {'n_estimators': 1000}

Model with rank: 3
Mean validation score: 0.68556 (std: 0.00920)
Parameters: {'n_estimators': 100}



In [21]:
xgb_params={"gamma":[0,2,5,8,10],
            "max_depth": [2,3,4,5,6,7,8],
            "min_child_weight":[0.5,1,2,5,10]
}

xgb2=XGBClassifier(n_estimators=500,subsample=0.8,
                   colsample_bylevel=0.8,colsample_bytree=0.8)

random_search=RandomizedSearchCV(xgb2,param_distributions=xgb_params,n_iter=10,
                                 cv=5,scoring='roc_auc',
                                 n_jobs=-1,verbose=False)

random_search.fit(X_train_prepared, y_train)
report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.69183 (std: 0.00973)
Parameters: {'min_child_weight': 1, 'max_depth': 6, 'gamma': 8}

Model with rank: 2
Mean validation score: 0.69155 (std: 0.00982)
Parameters: {'min_child_weight': 1, 'max_depth': 5, 'gamma': 10}

Model with rank: 3
Mean validation score: 0.69133 (std: 0.00926)
Parameters: {'min_child_weight': 2, 'max_depth': 6, 'gamma': 8}



In [22]:
xgb_params={
    'max_delta_step':[0,1,3,6,10],
    'scale_pos_weight':[1,2,3,4]
}

xgb3=XGBClassifier(n_estimators=500,min_child_weight=1,gamma=8,max_depth=6,
                  subsample=0.8,colsample_bylevel=0.8,colsample_bytree=0.8)

from sklearn.model_selection import GridSearchCV

grid_search=GridSearchCV(xgb3,param_grid=xgb_params,
                         cv=5,scoring='roc_auc',n_jobs=-1,verbose=False)
grid_search.fit(X_train_prepared, y_train)
report(grid_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.69224 (std: 0.00872)
Parameters: {'max_delta_step': 1, 'scale_pos_weight': 1}

Model with rank: 2
Mean validation score: 0.69194 (std: 0.00985)
Parameters: {'max_delta_step': 3, 'scale_pos_weight': 1}

Model with rank: 3
Mean validation score: 0.69183 (std: 0.00973)
Parameters: {'max_delta_step': 0, 'scale_pos_weight': 1}



In [25]:

xgb_params={'subsample': [i/10 for i in range(5,11)],
            'colsample_bytree':[i/10 for i in range(5,11)],
            'colsample_bylevel':[i/10 for i in range(5,11)]
            }

xgb4=XGBClassifier(n_estimators=500,min_child_weight=1,gamma=6,max_depth=8, scale_pos_weight=1,max_delta_step=1)

random_search=RandomizedSearchCV(xgb4,param_distributions=xgb_params,
                                 cv=5,n_iter=7,scoring='roc_auc',
                                n_jobs=-1,verbose=False)

random_search.fit(X_train_prepared, y_train)
report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.69204 (std: 0.00937)
Parameters: {'subsample': 1.0, 'colsample_bytree': 0.6, 'colsample_bylevel': 0.8}

Model with rank: 2
Mean validation score: 0.69203 (std: 0.00976)
Parameters: {'subsample': 1.0, 'colsample_bytree': 0.6, 'colsample_bylevel': 1.0}

Model with rank: 3
Mean validation score: 0.69182 (std: 0.00932)
Parameters: {'subsample': 1.0, 'colsample_bytree': 1.0, 'colsample_bylevel': 0.9}



In [27]:
xgb4

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=6,
              learning_rate=0.1, max_delta_step=1, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [29]:

xgb_params={"reg_lambda":[i/10 for i in range(0,50)],
            'reg_alpha':[i/10 for i in range(0,50)]
            }

xgb5=XGBClassifier(n_estimators=500,min_child_weight=1,gamma=6,max_depth=8, scale_pos_weight=1,max_delta_step=1,
                   subsample=1.0, colsample_bytree=0.6, colsample_bylevel=0.8)

random_search=RandomizedSearchCV(xgb5,param_distributions=xgb_params,
                                 cv=5,n_iter=10,scoring='roc_auc',
                                 n_jobs=-1,verbose=False)

random_search.fit(X_train_prepared, y_train)

report(random_search.cv_results_,3)

Model with rank: 1
Mean validation score: 0.69346 (std: 0.00975)
Parameters: {'reg_lambda': 0.1, 'reg_alpha': 2.2}

Model with rank: 2
Mean validation score: 0.69287 (std: 0.00870)
Parameters: {'reg_lambda': 2.3, 'reg_alpha': 1.0}

Model with rank: 3
Mean validation score: 0.69250 (std: 0.00944)
Parameters: {'reg_lambda': 0.3, 'reg_alpha': 3.5}



In [30]:
final_xgb=XGBClassifier(n_estimators=500,min_child_weight=1,gamma=6,max_depth=8, scale_pos_weight=1,max_delta_step=1,
                        subsample=1.0, colsample_bytree=0.6, colsample_bylevel=0.8,reg_lambda=0.1,reg_alpha=2.2)

final_xgb.fit(X_train_prepared, y_train)

from sklearn.model_selection import cross_val_score
scores = cross_val_score(final_xgb, X_train_prepared, y_train,scoring="roc_auc", cv=10)
print("roc-auc:  ",scores)

roc-auc:   [0.67355357 0.68781227 0.70585012 0.6992267  0.69866914 0.67615685
 0.69163145 0.70860726 0.69127694 0.69505461]


In [33]:
y_test_pred=final_xgb.predict_proba(X_test_prepared)[:,1]

mysubmission=pd.DataFrame(y_test_pred,columns=["Junk"])
pd.DataFrame(mysubmission).to_csv("Submi_P5_EdvPy_KSV_Xgb_S05.csv",index=False)

In [34]:
from google.colab import files
files.download("Submi_P5_EdvPy_KSV_Xgb_S05.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>