<a href="https://colab.research.google.com/github/vigilant-umbrella/automatic-quality-estimation/blob/main/dt_feature_set_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be
# accessed from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

import pandas as pd
from itertools import combinations
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [None]:
# Work from the project folder on the mounted Drive. The path is absolute
# (under the /content/drive mount point) so that re-running this cell is safe:
# a relative path would be resolved against the already-changed working
# directory and fail on the second execution.
os.chdir('/content/drive/My Drive/wikiHow')

# Loading Data

In [None]:
# Read the dataset from 'wikihow.csv' (resolved against the current working
# directory) and preview its first rows.
df = pd.read_csv('wikihow.csv')
df.head()

Unnamed: 0,character_count,word_count,method_count,mean_method_size,mean_paragraph_size,size_largest_method,size_shortest_method,std_method_size,step_count,mean_steps_per_method,...,Kincaid,ARI,Coleman_Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,percent_helpful
0,7107,1508,2,3357.5,379.823529,4321,2394,963.5,16,8.0,...,7.52376,8.15975,9.908877,67.453457,10.820887,36.866542,10.376433,3.264706,10.086871,78
1,5180,1176,2,2419.5,317.0,3171,1668,751.5,14,7.0,...,4.951055,5.505902,7.708411,81.614246,8.897157,33.807519,9.052403,2.652632,9.111023,99
2,3822,902,3,1114.333333,263.333333,1521,881,288.598144,11,3.666667,...,6.020412,6.72745,7.309495,81.109399,9.797251,31.8102,9.310165,2.527273,8.038576,88
3,11614,2429,3,3604.333333,393.111111,4148,2832,561.058721,26,8.666667,...,7.798307,8.728719,10.376311,66.362137,11.577962,38.743177,10.914639,3.584906,9.965255,82
4,9633,2004,4,2310.5,535.411765,2753,1922,329.19637,16,4.0,...,8.716429,9.423549,10.661749,61.827048,13.316999,42.773535,12.116729,4.327869,9.123631,100


In [None]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

(19917, 45)

In [None]:
# Separate the regression target ('percent_helpful') from the remaining columns.
y = df['percent_helpful']
X = df.drop(columns='percent_helpful')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Dataset column names grouped by feature family. The keys name the groups that
# are combined during evaluation; the values list the columns in each group.
features = {
    # Article layout: methods, steps, paragraphs, references and images.
    'structure': [
        'method_count',
        'mean_method_size',
        'mean_paragraph_size',
        'size_largest_method',
        'size_shortest_method',
        'std_method_size',
        'step_count',
        'mean_steps_per_method',
        'introduction_size',
        'summary_size',
        'references_count',
        'references_count_per_text_length',
        'references_count_per_method',
        'image_count',
        'image_count_per_method',
    ],
    'stability': [
        'num_votes',
        'is_expert',
        'views',
        'co_authors',
        'warnings',
        'tips',
    ],
    # Part-of-speech usage and sentence-opening patterns.
    'style': [
        'to_be_verb',
        'aux_verb',
        'conjunction',
        'pronoun',
        'preposition',
        'nominalization',
        'sentence_beginning_pronoun',
        'sentence_beginning_interrogative',
        'sentence_beginning_article',
        'sentence_beginning_subordination',
        'sentence_beginning_conjunction',
        'sentence_beginning_preposition',
    ],
    # Readability indices.
    'readability': [
        'Kincaid',
        'ARI',
        'Coleman_Liau',
        'FleschReadingEase',
        'GunningFogIndex',
        'LIX',
        'SMOGIndex',
        'RIX',
        'DaleChallIndex',
    ],
    'length': [
        'character_count',
        'word_count',
    ],
}

In [None]:
def create_pipeline():
    """Build a fresh, unfitted preprocessing + decision-tree regression pipeline.

    The pipeline applies, in order:

    * ``'imputer'`` - ``SimpleImputer`` filling missing values with the median.
    * ``'scaling'`` - ``MinMaxScaler`` applied to the input features.
    * ``'model'``   - a ``DecisionTreeRegressor`` wrapped in a
      ``TransformedTargetRegressor`` that fits on a ``MinMaxScaler``-transformed
      target.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new pipeline instance on every call, so callers never share
        fitted state.
    """
    imputer = SimpleImputer(strategy='median')

    scaler = MinMaxScaler()

    best_params = {
        'criterion': 'absolute_error',
        'max_depth': 5,
        # None considers all features at every split. This is what the former
        # 'auto' value meant for regression trees; 'auto' was deprecated in
        # scikit-learn 1.1 and removed in 1.3, so None keeps the same model
        # while remaining valid on current releases.
        'max_features': None,
        'min_samples_split': 5,
        'splitter': 'best'
    }

    model = DecisionTreeRegressor(
        random_state=42,
        **best_params
        )

    # Fit the tree on a min-max scaled target; TransformedTargetRegressor
    # applies the inverse transform to the predictions.
    model = TransformedTargetRegressor(
        regressor=model,
        transformer=MinMaxScaler()
    )

    pipeline = Pipeline(
        steps=[
            ('imputer', imputer),
            ('scaling', scaler),
            ('model', model)
        ]
    )

    return pipeline

In [None]:
# Every non-empty combination of feature groups, ordered by combination size.
# The upper bound is derived from the number of groups instead of being
# hard-coded, so adding or removing a group in `features` cannot silently
# drop the larger combinations.
feature_combinations = [
    combination
    for size in range(1, len(features) + 1)
    for combination in combinations(features.keys(), size)
]

In [None]:
# Maps '+'-joined feature-group names to the fold-averaged cross-validation
# metrics obtained with that combination.
scores = {}

# The splitter does not depend on the feature combination, so it is built once.
# With a fixed integer seed it yields the same shuffled 10 folds on every use,
# which means all combinations are evaluated on identical splits.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

for feature_combination in feature_combinations:
    # Flatten the column lists of all feature groups in this combination.
    features_to_use = [
        column
        for feature_type in feature_combination
        for column in features[feature_type]
    ]

    score = cross_validate(
        create_pipeline(),
        X_full_train[features_to_use],
        y_full_train,
        scoring=['neg_mean_absolute_error', 'neg_root_mean_squared_error'],
        cv=cv,
        n_jobs=-1,
        verbose=5,
        return_train_score=True
    )

    # Collapse the per-fold arrays (timings and train/test scores) to means.
    mean_score = {key: np.mean(value) for key, value in score.items()}

    scores['+'.join(feature_combination)] = mean_score

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   28.4s remaining:   42.6s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   28.8s remaining:   12.4s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   40.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   40.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   11.0s remaining:   16.5s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   11.4s remaining:    4.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:   20.1s remaining:   30.1s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   20.6s r

In [None]:
# Display the fold-averaged metrics collected for each feature-group combination.
scores

{'structure': {'fit_time': 23.329437589645387,
  'score_time': 0.00459444522857666,
  'test_neg_mean_absolute_error': -5.678681078841639,
  'train_neg_mean_absolute_error': -5.5597326930104405,
  'test_neg_root_mean_squared_error': -6.995288554025625,
  'train_neg_root_mean_squared_error': -6.883922292539795},
 'stability': {'fit_time': 9.830549192428588,
  'score_time': 0.005188751220703125,
  'test_neg_mean_absolute_error': -5.455507411266827,
  'train_neg_mean_absolute_error': -5.35258773436171,
  'test_neg_root_mean_squared_error': -6.910276977113331,
  'train_neg_root_mean_squared_error': -6.774311155097692},
 'style': {'fit_time': 17.89471502304077,
  'score_time': 0.0038750886917114256,
  'test_neg_mean_absolute_error': -5.7070862879552235,
  'train_neg_mean_absolute_error': -5.6004657310241175,
  'test_neg_root_mean_squared_error': -7.0115894350970605,
  'train_neg_root_mean_squared_error': -6.915324184070217},
 'readability': {'fit_time': 24.498447799682616,
  'score_time': 0.

In [None]:
# One row per feature-group combination, ranked by mean training MAE.
# Scores are negated errors (higher is better), so descending order lists the
# lowest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='train_neg_mean_absolute_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
structure+stability+style+readability+length,88.800827,0.013441,-5.443953,-5.326178,-6.930904,-6.75105
structure+stability+readability,69.422508,0.011487,-5.438148,-5.326701,-6.918638,-6.753706
structure+stability+style+readability,86.493473,0.014563,-5.441506,-5.326785,-6.92571,-6.754105
structure+stability+readability+length,72.036992,0.012454,-5.439967,-5.326897,-6.923264,-6.75397
structure+stability+length,45.112963,0.009208,-5.450259,-5.334393,-6.921734,-6.757815
structure+stability,37.819728,0.010091,-5.45142,-5.334442,-6.931883,-6.757387
structure+stability+style,55.487765,0.011615,-5.454308,-5.33523,-6.941166,-6.76024
structure+stability+style+length,59.582564,0.013216,-5.455469,-5.336004,-6.940435,-6.764348
stability+readability+length,47.765892,0.007808,-5.448228,-5.341249,-6.897737,-6.76437
stability+style+readability,58.271242,0.010316,-5.451084,-5.342267,-6.89968,-6.771904


In [None]:
# One row per feature-group combination, ranked by mean validation-fold MAE.
# Scores are negated errors (higher is better), so descending order lists the
# lowest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='test_neg_mean_absolute_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
structure+stability+readability,69.422508,0.011487,-5.438148,-5.326701,-6.918638,-6.753706
structure+stability+readability+length,72.036992,0.012454,-5.439967,-5.326897,-6.923264,-6.75397
structure+stability+style+readability,86.493473,0.014563,-5.441506,-5.326785,-6.92571,-6.754105
structure+stability+style+readability+length,88.800827,0.013441,-5.443953,-5.326178,-6.930904,-6.75105
stability+readability+length,47.765892,0.007808,-5.448228,-5.341249,-6.897737,-6.76437
structure+stability+length,45.112963,0.009208,-5.450259,-5.334393,-6.921734,-6.757815
stability+style+readability,58.271242,0.010316,-5.451084,-5.342267,-6.89968,-6.771904
structure+stability,37.819728,0.010091,-5.45142,-5.334442,-6.931883,-6.757387
stability+readability,39.323726,0.007753,-5.451772,-5.34332,-6.925282,-6.77177
stability+style+readability+length,61.532429,0.010123,-5.453375,-5.344059,-6.904521,-6.777818


In [None]:
# One row per feature-group combination, ranked by mean training RMSE.
# Scores are negated errors (higher is better), so descending order lists the
# lowest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='train_neg_root_mean_squared_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
structure+stability+style+readability+length,88.800827,0.013441,-5.443953,-5.326178,-6.930904,-6.75105
structure+stability+readability,69.422508,0.011487,-5.438148,-5.326701,-6.918638,-6.753706
structure+stability+readability+length,72.036992,0.012454,-5.439967,-5.326897,-6.923264,-6.75397
structure+stability+style+readability,86.493473,0.014563,-5.441506,-5.326785,-6.92571,-6.754105
structure+stability,37.819728,0.010091,-5.45142,-5.334442,-6.931883,-6.757387
structure+stability+length,45.112963,0.009208,-5.450259,-5.334393,-6.921734,-6.757815
structure+stability+style,55.487765,0.011615,-5.454308,-5.33523,-6.941166,-6.76024
structure+stability+style+length,59.582564,0.013216,-5.455469,-5.336004,-6.940435,-6.764348
stability+readability+length,47.765892,0.007808,-5.448228,-5.341249,-6.897737,-6.76437
stability+readability,39.323726,0.007753,-5.451772,-5.34332,-6.925282,-6.77177


In [None]:
# One row per feature-group combination, ranked by mean validation-fold RMSE.
# Scores are negated errors (higher is better), so descending order lists the
# lowest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='test_neg_root_mean_squared_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
stability+readability+length,47.765892,0.007808,-5.448228,-5.341249,-6.897737,-6.76437
stability+style+readability,58.271242,0.010316,-5.451084,-5.342267,-6.89968,-6.771904
stability+style+readability+length,61.532429,0.010123,-5.453375,-5.344059,-6.904521,-6.777818
stability,9.830549,0.005189,-5.455507,-5.352588,-6.910277,-6.774311
structure+stability+readability,69.422508,0.011487,-5.438148,-5.326701,-6.918638,-6.753706
stability+style,25.568363,0.007597,-5.472076,-5.357044,-6.920278,-6.787477
structure+stability+length,45.112963,0.009208,-5.450259,-5.334393,-6.921734,-6.757815
structure+stability+readability+length,72.036992,0.012454,-5.439967,-5.326897,-6.923264,-6.75397
stability+readability,39.323726,0.007753,-5.451772,-5.34332,-6.925282,-6.77177
structure+stability+style+readability,86.493473,0.014563,-5.441506,-5.326785,-6.92571,-6.754105
