# Non Linear Regression and Classification with Descision Tree


In [4]:
# Import Data
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [16]:
df = pd.read_csv('data/ad.data', header=None)
explanatory_variable_columns = set(df.columns.values)
response_variable_column = df[len(df.columns.values)-1]
# The last column describes the targets
explanatory_variable_columns.remove(len(df.columns.values)-1)
y = [1 if e == 'ad.' else 0 for e in response_variable_column]
x = df[list(explanatory_variable_columns)]

In [21]:
x.replace(to_replace=' *\?', value=-1, regex=True, inplace=True)

In [22]:
X_train, X_test, y_train, y_test = train_test_split(x, y)

In [23]:
pipeline = Pipeline([
('clf', DecisionTreeClassifier(criterion='entropy'))
])

In [24]:
parameters = {
'clf__max_depth': (150, 155, 160),
'clf__min_samples_split': (1, 2, 3),
'clf__min_samples_leaf': (1, 2, 3)
}

In [25]:
# fitting the model

grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1,
verbose=1, scoring='f1')
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:   22.7s finished


GridSearchCV(estimator=Pipeline(steps=[('clf',
                                        DecisionTreeClassifier(criterion='entropy'))]),
             n_jobs=-1,
             param_grid={'clf__max_depth': (150, 155, 160),
                         'clf__min_samples_leaf': (1, 2, 3),
                         'clf__min_samples_split': (1, 2, 3)},
             scoring='f1', verbose=1)

In [26]:
grid_search.best_score_

0.8787175067150406

In [27]:
grid_search.best_estimator_.get_params()


{'memory': None,
 'steps': [('clf',
   DecisionTreeClassifier(criterion='entropy', max_depth=155, min_samples_split=3))],
 'verbose': False,
 'clf': DecisionTreeClassifier(criterion='entropy', max_depth=155, min_samples_split=3),
 'clf__ccp_alpha': 0.0,
 'clf__class_weight': None,
 'clf__criterion': 'entropy',
 'clf__max_depth': 155,
 'clf__max_features': None,
 'clf__max_leaf_nodes': None,
 'clf__min_impurity_decrease': 0.0,
 'clf__min_impurity_split': None,
 'clf__min_samples_leaf': 1,
 'clf__min_samples_split': 3,
 'clf__min_weight_fraction_leaf': 0.0,
 'clf__presort': 'deprecated',
 'clf__random_state': None,
 'clf__splitter': 'best'}

In [28]:
grid_search.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [31]:
print(classification_report(y_test, grid_search.predict(X_test)))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       698
           1       0.90      0.93      0.91       122

    accuracy                           0.97       820
   macro avg       0.94      0.95      0.95       820
weighted avg       0.97      0.97      0.97       820

