In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/snap-ml-pipeline/logs.log
/kaggle/input/snap-ml-pipeline/__results__.html
/kaggle/input/snap-ml-pipeline/__notebook__.ipynb
/kaggle/input/snap-ml-pipeline/best
/kaggle/input/snap-ml-pipeline/__output__.json
/kaggle/input/snap-ml-pipeline/custom.css


# Open the best features dataset

In [2]:
df = pd.read_csv('../input/snap-ml-pipeline/best')
df

Unnamed: 0,numero_disposition,valeur_fonciere,surface_terrain,longitude,latitude,adresse_numero,nom_commune
0,1,900000.0,500.0,2.587084,48.830320,9003.0,11020
1,1,250000.0,500.0,2.292128,48.867776,36.0,7060
2,1,2170000.0,3369.0,5.363495,43.426052,9750.0,1537
3,1,770000.0,4847.0,-0.489854,47.348189,1.0,1371
4,1,80000.0,500.0,7.446989,43.876029,1.0,9726
...,...,...,...,...,...,...,...
88246,0,222000.0,100.0,4.334271,43.823261,45.0,1638
88247,0,100000.0,1643.0,2.366521,48.902254,9001.0,421
88248,0,750000.0,186.0,4.334271,43.823261,28.0,7487
88249,0,1.0,737.0,4.334271,43.823261,5001.0,9866


# Import the package

In [3]:
import numpy as np
import pandas as pd
import warnings

## Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt

## Sklearn Libraries
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score, confusion_matrix, roc_curve, auc, \
            classification_report, recall_score, precision_recall_curve

# Define random state
random_state = 2018
np.random.seed(random_state)
warnings.filterwarnings('ignore')

# Choose a writing style

In [4]:
# latex parameter
font = {
    'family': 'serif', 
    'serif': ['Computer Modern Roman'],
    'weight' : 'regular',
    'size'   : 14
    }

plt.rc('font', **font)

# Choose the X features and the y label we try to classify 

In [5]:
X = df.drop(['numero_disposition'], axis = 1) 
y = df['numero_disposition']
X.shape,y.shape

((88251, 6), (88251,))

# Write the model with the right metrics recall and f1-score

In [6]:
class Create_ensemble(object):
    def __init__(self, n_splits, base_models):
        self.n_splits = n_splits
        self.base_models = base_models

    def predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(StratifiedKFold(n_splits=self.n_splits, shuffle=True, 
                                     random_state = random_state).split(X, y))

        train_pred = np.zeros((X.shape[0], len(self.base_models)))
        test_pred = np.zeros((T.shape[0], len(self.base_models)* self.n_splits))
        f1_scores = np.zeros((len(self.base_models), self.n_splits))
        recall_scores = np.zeros((len(self.base_models), self.n_splits))
        
        test_col = 0
        for i, clf in enumerate(self.base_models):
            
            for j, (train_idx, valid_idx) in enumerate(folds):
                
                X_train = X[train_idx]
                Y_train = y[train_idx]
                X_valid = X[valid_idx]
                Y_valid = y[valid_idx]
                
                clf.fit(X_train, Y_train)
                
                valid_pred = clf.predict(X_valid)
                recall  = recall_score(Y_valid, valid_pred, average='macro')
                f1 = f1_score(Y_valid, valid_pred, average='macro')
                
                recall_scores[i][j] = recall
                f1_scores[i][j] = f1
                
                train_pred[valid_idx, i] = valid_pred
                test_pred[:, test_col] = clf.predict(T)
                test_col += 1
                
                print( "Model- {} and CV- {} recall: {}, f1_score: {}".format(i, j, recall, f1))
            
        return train_pred, test_pred, recall_scores, f1_scores

In [7]:
X = df.drop(['numero_disposition'], axis=1)
y = df['numero_disposition'].values

# Split the dataset into a train and a test 

In [8]:

from sklearn.model_selection import train_test_split

X_train,X_test,y_train, y_test = train_test_split(X, y, test_size=0.3)

# Check the shape of our new datasets 

In [9]:
X_train.shape, y_train.shape,X_test,y_test

((61775, 6),
 (61775,),
        valeur_fonciere  surface_terrain  longitude   latitude  adresse_numero  \
 12808          25000.0            132.0   3.497037  50.365464           109.0   
 78713        1092000.0           6041.0   1.780656  49.149405            34.0   
 52971         890000.0            172.0  -0.563106  44.822946            94.0   
 47158        1293990.0          68555.0   0.506377  46.554423             2.0   
 46625         125000.0            500.0   5.930174  43.123358            10.0   
 ...                ...              ...        ...        ...             ...   
 35434         160420.0            239.0   6.137887  47.588363             9.0   
 33909         750000.0            500.0   2.366345  48.823562            66.0   
 2899          275000.0            500.0   3.057417  50.633764            70.0   
 12482         170000.0           2231.0   2.894867  48.170115          9999.0   
 49963        1051772.0            500.0   2.249509  48.819924            

# Choose the best model : Random forest with the same parameters from the ML pipeline

In [10]:
rdf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False,
            random_state=3515,
            verbose=0, warm_start=False)

# Use a synthetic data augmentation method to balance the logistic/ non logistic from 93%/7% to 50%/50%

In [11]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=27)
X_train, y_train = sm.fit_sample(X_train, y_train)

# Use cross validation with 5 fold  using recall and F1-score metrics to score the model avoiding overfitting. Also, tune the model parameters using grid-search CV on 540 combinaison of our random forest parameters.

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

cv = StratifiedKFold(n_splits = 5, shuffle=True, random_state = random_state)

n_splits = 5
base_models = [rdf]
lgb_stack = Create_ensemble(n_splits = n_splits, base_models = base_models)    

rdf = RandomForestClassifier(random_state = random_state) 
scoring = {'Recall': make_scorer(recall_score),
           'f1_score': make_scorer(f1_score)
          }

params = {'max_depth': [6, 8, 10, 20], 
              'min_samples_split': [5, 10, 15],
              'min_samples_leaf' : [4, 8, 12],
              'n_estimators' : [100, 200, 300]
             }


grid_clf = GridSearchCV(estimator = rdf, param_grid = params, cv = cv, n_jobs=-1, verbose=4)
grid_clf.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  90 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 213 tasks      | elapsed: 32.5min
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed: 66.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 111.7min finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=2018, shuffle=True),
             estimator=RandomForestClassifier(random_state=2018), n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10, 20],
                         'min_samples_leaf': [4, 8, 12],
                         'min_samples_split': [5, 10, 15],
                         'n_estimators': [100, 200, 300]},
             verbose=4)

# Check the final parameters of our tune model

In [13]:
print(grid_clf.best_estimator_)
print(grid_clf.best_params_)

RandomForestClassifier(max_depth=20, min_samples_leaf=4, min_samples_split=5,
                       n_estimators=300, random_state=2018)
{'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 300}


# This is our final machine learning model 

In [14]:
rdf1 = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=5,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=-1,
            oob_score=False,
            random_state=random_state,
            verbose=0, warm_start=False)

In [15]:
base_models = [rdf1]
n_splits = 5
lgb_stack = Create_ensemble(n_splits = n_splits, base_models = base_models) 

In [16]:
train_pred, test_pred, recall_scores, f1_scores = lgb_stack.predict(X_train, y_train, X_test)

Model- 0 and CV- 0 recall: 0.9021407739745614, f1_score: 0.9020123905647739
Model- 0 and CV- 1 recall: 0.9012475732424745, f1_score: 0.9011308067729105
Model- 0 and CV- 2 recall: 0.8990330606833288, f1_score: 0.8989058602378429
Model- 0 and CV- 3 recall: 0.9047746576406321, f1_score: 0.9046690106646975
Model- 0 and CV- 4 recall: 0.8989877509356924, f1_score: 0.8988756006632403


# Let's evaluate our final model using metrics (recall,F1-score),classification report and confusion matrix (check for mistakes in the classification)

In [17]:
print('1. The F-1 score of the model {}\n'.format(f1_score(y_train, train_pred, average='macro')))
print('2. The recall score of the model {}\n'.format(recall_score(y_train, train_pred, average='macro')))
print('3. Classification report \n {} \n'.format(classification_report(y_train, train_pred)))
print('4. Confusion matrix \n {} \n'.format(confusion_matrix(y_train, train_pred)))

1. The F-1 score of the model 0.9011189033286693

2. The recall score of the model 0.9012367731618522

3. Classification report 
               precision    recall  f1-score   support

           0       0.88      0.94      0.90     58782
           1       0.93      0.87      0.90     58782

    accuracy                           0.90    117564
   macro avg       0.90      0.90      0.90    117564
weighted avg       0.90      0.90      0.90    117564
 

4. Confusion matrix 
 [[55006  3776]
 [ 7835 50947]] 



# This our final prediction we can create a validation dataset from the data we didn't use in the begining.

In [18]:
tpred = pd.DataFrame(test_pred)
final_tpred = tpred.mode(axis=1)

In [19]:
!pip install jovian
import jovian

Collecting jovian
  Downloading jovian-0.2.32-py2.py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.3 MB/s eta 0:00:011
Collecting uuid
  Downloading uuid-1.30.tar.gz (5.8 kB)
Building wheels for collected packages: uuid
  Building wheel for uuid (setup.py) ... [?25ldone
[?25h  Created wheel for uuid: filename=uuid-1.30-py3-none-any.whl size=6501 sha256=36674b6988c7e6354304ea904f00b2ba5d16179d7ad1d0b3dea02e68589a68f5
  Stored in directory: /root/.cache/pip/wheels/2a/ea/87/dd57f1ecb4f0752f3e1dbf958ebf8b36d920d190425bcdc24d
Successfully built uuid
Installing collected packages: uuid, jovian
Successfully installed jovian-0.2.32 uuid-1.30
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


<IPython.core.display.Javascript object>

In [20]:
jovian.commit(project='SNAPKEY RF SMOTE best model')

<IPython.core.display.Javascript object>

[jovian] Attempting to save notebook..[0m
[jovian] Detected Kaggle notebook...[0m
[jovian] Please enter your API key ( from https://jovian.ai/ ):[0m
API KEY: ········
[jovian] Uploading notebook to https://jovian.ai/yeonathan/SNAPKEY RF SMOTE best model[0m


<IPython.core.display.Javascript object>