<a href="https://colab.research.google.com/github/vigilant-umbrella/automatic-quality-estimation/blob/main/dt_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored on Drive can be read like a local file.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

In [None]:
# Use the absolute path under the Drive mount point ('/content/drive') rather
# than a path relative to the current directory. A relative './drive/...' path
# only resolves on the first run; re-running the cell after the working
# directory has already changed would raise FileNotFoundError.
os.chdir('/content/drive/My Drive/wikiHow')

# Loading Data

In [None]:
# Read the dataset from the current working directory and preview the first
# rows. The last column, 'percent_helpful', is the regression target used below.
df = pd.read_csv('wikihow.csv')
df.head()

Unnamed: 0,character_count,word_count,method_count,mean_method_size,mean_paragraph_size,size_largest_method,size_shortest_method,std_method_size,step_count,mean_steps_per_method,...,Kincaid,ARI,Coleman_Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,percent_helpful
0,7107,1508,2,3357.5,379.823529,4321,2394,963.5,16,8.0,...,7.52376,8.15975,9.908877,67.453457,10.820887,36.866542,10.376433,3.264706,10.086871,78
1,5180,1176,2,2419.5,317.0,3171,1668,751.5,14,7.0,...,4.951055,5.505902,7.708411,81.614246,8.897157,33.807519,9.052403,2.652632,9.111023,99
2,3822,902,3,1114.333333,263.333333,1521,881,288.598144,11,3.666667,...,6.020412,6.72745,7.309495,81.109399,9.797251,31.8102,9.310165,2.527273,8.038576,88
3,11614,2429,3,3604.333333,393.111111,4148,2832,561.058721,26,8.666667,...,7.798307,8.728719,10.376311,66.362137,11.577962,38.743177,10.914639,3.584906,9.965255,82
4,9633,2004,4,2310.5,535.411765,2753,1922,329.19637,16,4.0,...,8.716429,9.423549,10.661749,61.827048,13.316999,42.773535,12.116729,4.327869,9.123631,100


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(19917, 45)

In [None]:
# Separate the features from the regression target.
# 'Unnamed: 0' appears to be the row index that was written into the CSV
# (it counts 0, 1, 2, ... in the preview), not a property of an article, so it
# is removed to keep the tree from splitting on row order. errors='ignore'
# keeps this cell working if the CSV is later re-exported without that column.
X = df.drop(columns=['percent_helpful']).drop(columns=['Unnamed: 0'], errors='ignore')
y = df['percent_helpful']

In [None]:
# Hold out 20% of the rows as a final test set; the fixed seed keeps the
# split reproducible across kernel restarts.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def create_pipeline():
    """Build a fresh, unfitted preprocessing + decision-tree regression pipeline.

    Steps, in order:
        'imputer' -- fills missing feature values with the column median.
        'scaling' -- min-max scales the features.
        'model'   -- a DecisionTreeRegressor (seeded with random_state=42)
                     wrapped in a TransformedTargetRegressor that min-max
                     scales the target before fitting and inverts the scaling
                     on prediction.

    Returns:
        sklearn.pipeline.Pipeline: the assembled pipeline. Tree
        hyperparameters are reachable as ``model__regressor__<name>``.
    """
    tree = DecisionTreeRegressor(random_state=42)
    target_scaled_tree = TransformedTargetRegressor(
        regressor=tree,
        transformer=MinMaxScaler(),
    )

    return Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaling', MinMaxScaler()),
            ('model', target_scaled_tree),
        ]
    )

In [None]:
# Hyperparameter grid for the DecisionTreeRegressor nested inside the pipeline
# ('model' step -> TransformedTargetRegressor.regressor).
# 4 * 2 * 4 * 4 * 3 = 384 candidate combinations.
search_space = {
    'model__regressor__criterion': ['squared_error' , 'friedman_mse', 'absolute_error', 'poisson'],
    'model__regressor__splitter': ['best', 'random'],
    'model__regressor__max_depth': [None, 3, 5, 7],
    # ints are absolute sample counts, floats are fractions of the training set
    'model__regressor__min_samples_split': [2, 5, 0.3, 0.5],
    # None = consider all features. It replaces 'auto', which meant the same
    # thing for DecisionTreeRegressor but was deprecated in scikit-learn 1.1
    # and removed in 1.3, where it raises InvalidParameterError.
    'model__regressor__max_features': [None, 'sqrt', 'log2'],
}

In [None]:
# 10-fold cross-validation; shuffling with a fixed seed keeps the fold
# assignment reproducible between runs.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search over every combination in search_space. Scores are
# negative mean absolute error, so values closer to zero are better.
searcher = GridSearchCV(
    create_pipeline(),
    search_space,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    cv=cv,
    # verbose=10 printed START/END lines for every one of the thousands of
    # fits and overflowed the cell output; 1 keeps only a short progress summary.
    verbose=1,
)

searcher.fit(X_full_train, y_full_train)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[CV 6/10; 129/384] START model__regressor__criterion=friedman_mse, model__regressor__max_depth=3, model__regressor__max_features=sqrt, model__regressor__min_samples_split=2, model__regressor__splitter=best
[CV 6/10; 129/384] END model__regressor__criterion=friedman_mse, model__regressor__max_depth=3, model__regressor__max_features=sqrt, model__regressor__min_samples_split=2, model__regressor__splitter=best;, score=-5.600 total time=   3.7s
[CV 4/10; 130/384] START model__regressor__criterion=friedman_mse, model__regressor__max_depth=3, model__regressor__max_features=sqrt, model__regressor__min_samples_split=2, model__regressor__splitter=random
[CV 4/10; 130/384] END model__regressor__criterion=friedman_mse, model__regressor__max_depth=3, model__regressor__max_features=sqrt, model__regressor__min_samples_split=2, model__regressor__splitter=random;, score=-5.560 total time=   2.5s
[CV 2/10; 131/384] START model__regressor__

GridSearchCV(cv=KFold(n_splits=10, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('imputer',
                                        SimpleImputer(strategy='median')),
                                       ('scaling', MinMaxScaler()),
                                       ('model',
                                        TransformedTargetRegressor(regressor=DecisionTreeRegressor(random_state=42),
                                                                   transformer=MinMaxScaler()))]),
             n_jobs=-1,
             param_grid={'model__regressor__criterion': ['squared_error',
                                                         'friedman_mse',
                                                         'absolute_error',
                                                         'poisson'],
                         'model__regressor__max_depth': [None, 3, 5, 7],
                         'model__regressor__max_features': ['auto', 'sqrt',
          

In [None]:
# Mean cross-validated score of the best candidate. The search was scored with
# 'neg_mean_absolute_error', so this is the negated MAE (closer to 0 is better).
searcher.best_score_

-5.441506421995226

In [None]:
# Hyperparameter combination that achieved the best cross-validated score.
searcher.best_params_

{'model__regressor__criterion': 'absolute_error',
 'model__regressor__max_depth': 5,
 'model__regressor__max_features': 'auto',
 'model__regressor__min_samples_split': 5,
 'model__regressor__splitter': 'best'}