<a href="https://colab.research.google.com/github/vigilant-umbrella/automatic-quality-estimation/blob/main/knn_feature_set_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset folder stored there can be reached from later cells.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

import pandas as pd
from itertools import combinations
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [None]:
# Work from the dataset folder on the mounted Drive. An absolute path keeps
# this cell idempotent: the relative './drive/...' form only resolves while the
# working directory is still /content, so re-running the cell after the first
# chdir would raise FileNotFoundError.
os.chdir('/content/drive/My Drive/wikiHow')

# Loading Data

In [None]:
# Read the dataset from the current working directory (set by the os.chdir
# cell) and preview the first rows.
df = pd.read_csv('wikihow.csv')
df.head()

Unnamed: 0,character_count,word_count,method_count,mean_method_size,mean_paragraph_size,size_largest_method,size_shortest_method,std_method_size,step_count,mean_steps_per_method,...,Kincaid,ARI,Coleman_Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,percent_helpful
0,7107,1508,2,3357.5,379.823529,4321,2394,963.5,16,8.0,...,7.52376,8.15975,9.908877,67.453457,10.820887,36.866542,10.376433,3.264706,10.086871,78
1,5180,1176,2,2419.5,317.0,3171,1668,751.5,14,7.0,...,4.951055,5.505902,7.708411,81.614246,8.897157,33.807519,9.052403,2.652632,9.111023,99
2,3822,902,3,1114.333333,263.333333,1521,881,288.598144,11,3.666667,...,6.020412,6.72745,7.309495,81.109399,9.797251,31.8102,9.310165,2.527273,8.038576,88
3,11614,2429,3,3604.333333,393.111111,4148,2832,561.058721,26,8.666667,...,7.798307,8.728719,10.376311,66.362137,11.577962,38.743177,10.914639,3.584906,9.965255,82
4,9633,2004,4,2310.5,535.411765,2753,1922,329.19637,16,4.0,...,8.716429,9.423549,10.661749,61.827048,13.316999,42.773535,12.116729,4.327869,9.123631,100


In [None]:
# Quick size check of the loaded frame: (rows, columns).
df.shape

(19917, 45)

In [None]:
# Separate the regression target ('percent_helpful') from the predictor columns.
y = df['percent_helpful']
X = df.drop(columns=['percent_helpful'])

In [None]:
# Hold out 20% of the rows. The cross-validation in this notebook section only
# touches the training portion (X_full_train / y_full_train).
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Predictor column names, grouped into named feature families.
# The group names (dict keys) are the units that get combined -- and '+'-joined
# into labels -- when feature subsets are cross-validated; each list is used to
# select the corresponding columns from the training frame.
features = {'structure': ['method_count',
                          'mean_method_size',
                          'mean_paragraph_size',
                          'size_largest_method',
                          'size_shortest_method',
                          'std_method_size',
                          'step_count',
                          'mean_steps_per_method',
                          'introduction_size',
                          'summary_size',
                          'references_count',
                          'references_count_per_text_length',
                          'references_count_per_method',
                          'image_count',
                          'image_count_per_method'],

            'stability': ['num_votes',
                          'is_expert',
                          'views',
                          'co_authors',
                          'warnings',
                          'tips'],

            'style': ['to_be_verb',
                      'aux_verb',
                      'conjunction',
                      'pronoun',
                      'preposition',
                      'nominalization',
                      'sentence_beginning_pronoun',
                      'sentence_beginning_interrogative',
                      'sentence_beginning_article',
                      'sentence_beginning_subordination',
                      'sentence_beginning_conjunction',
                      'sentence_beginning_preposition'],

            'readability': ['Kincaid',
                            'ARI',
                            'Coleman_Liau',
                            'FleschReadingEase',
                            'GunningFogIndex',
                            'LIX',
                            'SMOGIndex',
                            'RIX',
                            'DaleChallIndex'],

            'length': ['character_count',
                       'word_count']}

In [None]:
def create_pipeline():
    """Build a fresh, unfitted KNN regression pipeline.

    Steps, in order:
      1. 'imputer' -- fill missing feature values with the column median.
      2. 'scaling' -- min-max scale the features.
      3. 'model'   -- k-nearest-neighbours regressor wrapped in a
         ``TransformedTargetRegressor`` so the target is min-max scaled
         as well.

    Returns:
        sklearn.pipeline.Pipeline: a new pipeline object on every call.
    """
    # Fixed KNN hyperparameters; how they were selected is not shown in this
    # notebook section.
    knn = KNeighborsRegressor(n_neighbors=7, weights='uniform', p=5, n_jobs=-1)

    return Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaling', MinMaxScaler()),
            (
                'model',
                TransformedTargetRegressor(
                    regressor=knn,
                    transformer=MinMaxScaler()
                ),
            ),
        ]
    )

In [None]:
# Enumerate every non-empty subset of the feature groups, smallest subsets
# first. The upper bound comes from the dict itself, so adding or removing a
# group in `features` cannot silently leave combinations out (the previous
# hard-coded range(1, 6) assumed exactly five groups).
feature_combinations = [
    combination
    for r in range(1, len(features) + 1)
    for combination in combinations(features.keys(), r)
]

In [None]:
# Cross-validate the pipeline on every feature-group combination and keep the
# fold-averaged metrics, keyed by the '+'-joined group names.
scores = {}

# The splitter and the metric list are the same for every combination, so
# build them once. With an integer random_state, KFold produces the same
# shuffled folds on each split() call, so all combinations are compared on
# identical folds.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
scoring = ['neg_mean_absolute_error', 'neg_root_mean_squared_error']

for feature_combination in feature_combinations:
    # Flatten the selected groups into one list of column names.
    features_to_use = [
        column
        for feature_type in feature_combination
        for column in features[feature_type]
    ]

    # NOTE(review): the regressor built by create_pipeline() also sets
    # n_jobs=-1; confirm the nested parallelism is intended.
    score = cross_validate(
        create_pipeline(),
        X_full_train[features_to_use],
        y_full_train,
        scoring=scoring,
        cv=cv,
        n_jobs=4,
        verbose=5,
        return_train_score=True
    )

    # cross_validate returns one array per metric (plus fit/score times);
    # collapse each array to its mean over the folds.
    mean_score = {key: np.mean(value) for key, value in score.items()}

    scores['+'.join(feature_combination)] = mean_score

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:   22.5s remaining:   15.0s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   27.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    4.9s remaining:    3.3s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    6.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:   22.4s remaining:   15.0s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   28.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:    5.1s remaining:    3.4s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    6.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Don

[CV] END  neg_mean_absolute_error: (train=-5.157, test=-6.124) neg_root_mean_squared_error: (train=-6.389, test=-7.572) total time=   1.4s
[CV] END  neg_mean_absolute_error: (train=-5.186, test=-5.954) neg_root_mean_squared_error: (train=-6.428, test=-7.328) total time=   1.3s
[CV] END  neg_mean_absolute_error: (train=-5.047, test=-5.742) neg_root_mean_squared_error: (train=-6.218, test=-7.126) total time=   0.2s
[CV] END  neg_mean_absolute_error: (train=-5.041, test=-5.793) neg_root_mean_squared_error: (train=-6.225, test=-7.105) total time=   0.3s
[CV] END  neg_mean_absolute_error: (train=-5.001, test=-6.011) neg_root_mean_squared_error: (train=-6.190, test=-7.293) total time=   0.4s
[CV] END  neg_mean_absolute_error: (train=-5.196, test=-5.972) neg_root_mean_squared_error: (train=-6.418, test=-7.447) total time=   1.2s
[CV] END  neg_mean_absolute_error: (train=-5.185, test=-6.085) neg_root_mean_squared_error: (train=-6.410, test=-7.462) total time=   1.4s
[CV] END  neg_mean_absolute

[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:  3.8min remaining:  2.5min
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  4.8min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


[CV] END  neg_mean_absolute_error: (train=-5.157, test=-6.038) neg_root_mean_squared_error: (train=-6.387, test=-7.444) total time=   1.3s
[CV] END  neg_mean_absolute_error: (train=-5.172, test=-6.027) neg_root_mean_squared_error: (train=-6.403, test=-7.391) total time=   1.2s
[CV] END  neg_mean_absolute_error: (train=-5.185, test=-5.844) neg_root_mean_squared_error: (train=-6.413, test=-7.350) total time=   0.7s
[CV] END  neg_mean_absolute_error: (train=-5.044, test=-5.815) neg_root_mean_squared_error: (train=-6.220, test=-7.209) total time=   0.4s
[CV] END  neg_mean_absolute_error: (train=-5.034, test=-5.921) neg_root_mean_squared_error: (train=-6.206, test=-7.346) total time=   0.5s
[CV] END  neg_mean_absolute_error: (train=-5.200, test=-5.985) neg_root_mean_squared_error: (train=-6.420, test=-7.448) total time=   1.2s
[CV] END  neg_mean_absolute_error: (train=-5.209, test=-5.894) neg_root_mean_squared_error: (train=-6.426, test=-7.304) total time=   1.3s
[CV] END  neg_mean_absolute

[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:  3.3min remaining:  2.2min


[CV] END  neg_mean_absolute_error: (train=-5.172, test=-5.980) neg_root_mean_squared_error: (train=-6.389, test=-7.429) total time=   1.3s
[CV] END  neg_mean_absolute_error: (train=-5.173, test=-5.990) neg_root_mean_squared_error: (train=-6.400, test=-7.389) total time=   1.2s
[CV] END  neg_mean_absolute_error: (train=-5.020, test=-5.962) neg_root_mean_squared_error: (train=-6.214, test=-7.336) total time=   0.2s
[CV] END  neg_mean_absolute_error: (train=-5.052, test=-5.903) neg_root_mean_squared_error: (train=-6.230, test=-7.238) total time=   0.3s
[CV] END  neg_mean_absolute_error: (train=-5.052, test=-5.657) neg_root_mean_squared_error: (train=-6.240, test=-7.084) total time=   0.4s
[CV] END  neg_mean_absolute_error: (train=-5.215, test=-5.937) neg_root_mean_squared_error: (train=-6.429, test=-7.440) total time=   1.2s
[CV] END  neg_mean_absolute_error: (train=-5.196, test=-6.006) neg_root_mean_squared_error: (train=-6.427, test=-7.350) total time=   1.2s
[CV] END  neg_mean_absolute

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  4.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


[CV] END  neg_mean_absolute_error: (train=-5.168, test=-5.952) neg_root_mean_squared_error: (train=-6.390, test=-7.477) total time=   1.0s
[CV] END  neg_mean_absolute_error: (train=-5.159, test=-6.020) neg_root_mean_squared_error: (train=-6.390, test=-7.433) total time=   0.6s
[CV] END  neg_mean_absolute_error: (train=-5.178, test=-5.992) neg_root_mean_squared_error: (train=-6.407, test=-7.317) total time=   1.0s
[CV] END  neg_mean_absolute_error: (train=-5.053, test=-5.599) neg_root_mean_squared_error: (train=-6.232, test=-7.031) total time=   0.4s
[CV] END  neg_mean_absolute_error: (train=-5.034, test=-5.836) neg_root_mean_squared_error: (train=-6.218, test=-7.143) total time=   0.5s
[CV] END  neg_mean_absolute_error: (train=-5.190, test=-6.072) neg_root_mean_squared_error: (train=-6.408, test=-7.434) total time=   1.2s
[CV] END  neg_mean_absolute_error: (train=-5.184, test=-6.130) neg_root_mean_squared_error: (train=-6.415, test=-7.559) total time=   1.1s
[CV] END  neg_mean_absolute

[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:  4.4min remaining:  2.9min
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  5.5min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:  8.2min remaining:  5.5min
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 10.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:  6.6min remaining:  4.4min
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  8.2min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:  6.0min remaining:  4.0min
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:  7.5min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed:  7.3min remaining:  4.9min
[Parallel(n_jobs=4)]

In [None]:
# Raw results: '+'-joined feature-group label -> {metric or timing name: mean over the CV folds}.
scores

{'structure': {'fit_time': 0.154203200340271,
  'score_time': 0.9495717048645019,
  'test_neg_mean_absolute_error': -5.992104792352544,
  'train_neg_mean_absolute_error': -5.170620117158258,
  'test_neg_root_mean_squared_error': -7.413112647608689,
  'train_neg_root_mean_squared_error': -6.39972369097},
 'stability': {'fit_time': 0.08696749210357665,
  'score_time': 0.26792943477630615,
  'test_neg_mean_absolute_error': -5.824005650955229,
  'train_neg_mean_absolute_error': -5.03778724902835,
  'test_neg_root_mean_squared_error': -7.190957851225372,
  'train_neg_root_mean_squared_error': -6.219286446981696},
 'style': {'fit_time': 0.11443240642547607,
  'score_time': 1.0502061367034912,
  'test_neg_mean_absolute_error': -6.006822682854624,
  'train_neg_mean_absolute_error': -5.19739699185636,
  'test_neg_root_mean_squared_error': -7.415818330902856,
  'train_neg_root_mean_squared_error': -6.420816595687822},
 'readability': {'fit_time': 0.10536589622497558,
  'score_time': 0.2343692779

In [None]:
# Rank the feature-group combinations by mean training-fold MAE.
# The scores are negated errors, so descending order lists the lowest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='train_neg_mean_absolute_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
stability,0.086967,0.267929,-5.824006,-5.037787,-7.190958,-6.219286
structure+stability+readability,0.276352,14.893114,-5.840213,-5.052509,-7.245012,-6.273821
structure+stability+readability+length,0.279695,15.937382,-5.853028,-5.056195,-7.253605,-6.277214
stability+length,0.127769,0.453894,-5.879155,-5.085708,-7.269821,-6.296826
structure+stability+style+readability+length,0.484235,24.596392,-5.875293,-5.093329,-7.285113,-6.305577
stability+style+readability+length,0.263776,14.800686,-5.889988,-5.093384,-7.275318,-6.30465
structure+stability+style+readability,0.324623,22.24231,-5.869061,-5.093753,-7.277793,-6.306346
structure+stability+style,0.364998,17.507655,-5.896605,-5.094117,-7.298796,-6.315168
stability+style+readability,0.20538,13.633919,-5.889574,-5.094615,-7.277708,-6.306563
structure+stability+style+length,0.297524,17.464146,-5.89466,-5.096721,-7.298226,-6.31698


In [None]:
# Rank the feature-group combinations by mean held-out-fold (CV "test") MAE.
# The scores are negated errors, so descending order lists the lowest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='test_neg_mean_absolute_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
stability,0.086967,0.267929,-5.824006,-5.037787,-7.190958,-6.219286
structure+stability+readability,0.276352,14.893114,-5.840213,-5.052509,-7.245012,-6.273821
structure+stability+readability+length,0.279695,15.937382,-5.853028,-5.056195,-7.253605,-6.277214
structure+stability+style+readability,0.324623,22.24231,-5.869061,-5.093753,-7.277793,-6.306346
stability+readability,0.315723,1.64271,-5.874695,-5.098534,-7.265863,-6.308878
structure+stability+style+readability+length,0.484235,24.596392,-5.875293,-5.093329,-7.285113,-6.305577
stability+length,0.127769,0.453894,-5.879155,-5.085708,-7.269821,-6.296826
stability+style+readability,0.20538,13.633919,-5.889574,-5.094615,-7.277708,-6.306563
stability+style+readability+length,0.263776,14.800686,-5.889988,-5.093384,-7.275318,-6.30465
structure+stability+style+length,0.297524,17.464146,-5.89466,-5.096721,-7.298226,-6.31698


In [None]:
# Rank the feature-group combinations by mean training-fold RMSE.
# The scores are negated errors, so descending order lists the lowest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='train_neg_root_mean_squared_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
stability,0.086967,0.267929,-5.824006,-5.037787,-7.190958,-6.219286
structure+stability+readability,0.276352,14.893114,-5.840213,-5.052509,-7.245012,-6.273821
structure+stability+readability+length,0.279695,15.937382,-5.853028,-5.056195,-7.253605,-6.277214
stability+length,0.127769,0.453894,-5.879155,-5.085708,-7.269821,-6.296826
stability+style+readability+length,0.263776,14.800686,-5.889988,-5.093384,-7.275318,-6.30465
structure+stability+style+readability+length,0.484235,24.596392,-5.875293,-5.093329,-7.285113,-6.305577
structure+stability+style+readability,0.324623,22.24231,-5.869061,-5.093753,-7.277793,-6.306346
stability+style+readability,0.20538,13.633919,-5.889574,-5.094615,-7.277708,-6.306563
stability+readability,0.315723,1.64271,-5.874695,-5.098534,-7.265863,-6.308878
structure+stability,0.258455,10.951226,-5.906954,-5.101915,-7.311054,-6.31335


In [None]:
# Rank the feature-group combinations by mean held-out-fold (CV "test") RMSE.
# The scores are negated errors, so descending order lists the lowest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='test_neg_root_mean_squared_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
stability,0.086967,0.267929,-5.824006,-5.037787,-7.190958,-6.219286
structure+stability+readability,0.276352,14.893114,-5.840213,-5.052509,-7.245012,-6.273821
structure+stability+readability+length,0.279695,15.937382,-5.853028,-5.056195,-7.253605,-6.277214
stability+readability,0.315723,1.64271,-5.874695,-5.098534,-7.265863,-6.308878
stability+length,0.127769,0.453894,-5.879155,-5.085708,-7.269821,-6.296826
stability+style+readability+length,0.263776,14.800686,-5.889988,-5.093384,-7.275318,-6.30465
stability+style+readability,0.20538,13.633919,-5.889574,-5.094615,-7.277708,-6.306563
structure+stability+style+readability,0.324623,22.24231,-5.869061,-5.093753,-7.277793,-6.306346
structure+stability+style+readability+length,0.484235,24.596392,-5.875293,-5.093329,-7.285113,-6.305577
stability+readability+length,0.172382,8.73528,-5.900828,-5.113435,-7.291149,-6.334128
