<a href="https://colab.research.google.com/github/vigilant-umbrella/automatic-quality-estimation/blob/main/lr_feature_set_scores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab only: mount Google Drive at /content/drive so files stored there can be
# read through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

import pandas as pd
from itertools import combinations
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

In [None]:
# Switch to the project folder on the mounted Drive. The absolute path (under
# the /content/drive mount point) keeps this cell re-runnable: a path relative
# to the current directory only resolves on the first run, because the cell
# itself changes the current directory.
os.chdir('/content/drive/My Drive/wikiHow')

# Loading Data

In [None]:
# Load wikihow.csv from the current working directory and preview the first rows.
df = pd.read_csv('wikihow.csv')
df.head()

Unnamed: 0,character_count,word_count,method_count,mean_method_size,mean_paragraph_size,size_largest_method,size_shortest_method,std_method_size,step_count,mean_steps_per_method,...,Kincaid,ARI,Coleman_Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,percent_helpful
0,7107,1508,2,3357.5,379.823529,4321,2394,963.5,16,8.0,...,7.52376,8.15975,9.908877,67.453457,10.820887,36.866542,10.376433,3.264706,10.086871,78
1,5180,1176,2,2419.5,317.0,3171,1668,751.5,14,7.0,...,4.951055,5.505902,7.708411,81.614246,8.897157,33.807519,9.052403,2.652632,9.111023,99
2,3822,902,3,1114.333333,263.333333,1521,881,288.598144,11,3.666667,...,6.020412,6.72745,7.309495,81.109399,9.797251,31.8102,9.310165,2.527273,8.038576,88
3,11614,2429,3,3604.333333,393.111111,4148,2832,561.058721,26,8.666667,...,7.798307,8.728719,10.376311,66.362137,11.577962,38.743177,10.914639,3.584906,9.965255,82
4,9633,2004,4,2310.5,535.411765,2753,1922,329.19637,16,4.0,...,8.716429,9.423549,10.661749,61.827048,13.316999,42.773535,12.116729,4.327869,9.123631,100


In [None]:
# (number of rows, number of columns) of the loaded table.
df.shape

(19917, 45)

In [None]:
# 'percent_helpful' is the regression target; every other column is a candidate feature.
y = df['percent_helpful']
X = df.drop(columns=['percent_helpful'])

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split reproducible.
X_full_train, X_test, y_full_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature group name -> list of column names belonging to that group.
# The insertion order of the groups matters: it determines the order in which
# group combinations are generated and how their '+'-joined labels read.
features = {
    'structure': [
        'method_count',
        'mean_method_size',
        'mean_paragraph_size',
        'size_largest_method',
        'size_shortest_method',
        'std_method_size',
        'step_count',
        'mean_steps_per_method',
        'introduction_size',
        'summary_size',
        'references_count',
        'references_count_per_text_length',
        'references_count_per_method',
        'image_count',
        'image_count_per_method',
    ],
    'stability': [
        'num_votes',
        'is_expert',
        'views',
        'co_authors',
        'warnings',
        'tips',
    ],
    'style': [
        'to_be_verb',
        'aux_verb',
        'conjunction',
        'pronoun',
        'preposition',
        'nominalization',
        'sentence_beginning_pronoun',
        'sentence_beginning_interrogative',
        'sentence_beginning_article',
        'sentence_beginning_subordination',
        'sentence_beginning_conjunction',
        'sentence_beginning_preposition',
    ],
    'readability': [
        'Kincaid',
        'ARI',
        'Coleman_Liau',
        'FleschReadingEase',
        'GunningFogIndex',
        'LIX',
        'SMOGIndex',
        'RIX',
        'DaleChallIndex',
    ],
    'length': [
        'character_count',
        'word_count',
    ],
}

In [None]:
def create_pipeline():
    """Build a new, unfitted preprocessing + regression pipeline.

    The pipeline has three named steps, applied in this order:

    * ``'imputer'`` - replaces missing feature values with the column median.
    * ``'scaling'`` - min-max scales the features.
    * ``'model'``   - a linear regression fitted without an intercept, wrapped
      in a ``TransformedTargetRegressor`` whose target transformer is another
      ``MinMaxScaler``.

    Returns:
        sklearn.pipeline.Pipeline: a fresh pipeline instance on every call.
    """
    target_scaled_regressor = TransformedTargetRegressor(
        regressor=LinearRegression(fit_intercept=False),
        transformer=MinMaxScaler(),
    )

    return Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaling', MinMaxScaler()),
            ('model', target_scaled_regressor),
        ]
    )

In [None]:
# Every non-empty combination of feature groups, from single groups up to all
# groups together, ordered by combination size. The upper bound is derived from
# the number of groups in `features`, so adding or removing a group does not
# require touching this cell.
feature_combinations = [
    combination
    for group_count in range(1, len(features) + 1)
    for combination in combinations(features.keys(), group_count)
]

In [None]:
# Cross-validate the pipeline on every combination of feature groups and store,
# per combination, the mean over folds of each entry returned by cross_validate
# (fit/score times plus train/test MAE and RMSE, reported as negated values).
scores = {}

# The splitter does not depend on the feature combination, so it is built once.
# With shuffle=True and an integer random_state, each cross_validate call below
# is evaluated on the same ten folds.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

for feature_combination in feature_combinations:
    # Flatten the column lists of the selected groups, keeping group order.
    features_to_use = [
        column
        for feature_type in feature_combination
        for column in features[feature_type]
    ]

    score = cross_validate(
        create_pipeline(),
        X_full_train[features_to_use],
        y_full_train,
        scoring=['neg_mean_absolute_error', 'neg_root_mean_squared_error'],
        cv=cv,
        n_jobs=-1,
        verbose=5,
        return_train_score=True
    )

    # One array per key (one value per fold) -> a single mean per key,
    # stored under a label such as 'structure+style'.
    scores['+'.join(feature_combination)] = {
        key: np.mean(value) for key, value in score.items()
    }

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    2.1s remaining:    3.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    2.3s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    2.4s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed:    0.1s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.1s r

In [None]:
# Raw results: combination label -> {metric or timing name: mean over the folds}.
scores

{'structure': {'fit_time': 0.15064926147460939,
  'score_time': 0.009574389457702637,
  'test_neg_mean_absolute_error': -6.198166685777413,
  'train_neg_mean_absolute_error': -6.186717659867623,
  'test_neg_root_mean_squared_error': -7.81866056809812,
  'train_neg_root_mean_squared_error': -7.788883179595098},
 'stability': {'fit_time': 0.04818096160888672,
  'score_time': 0.005311203002929687,
  'test_neg_mean_absolute_error': -11.227583187701569,
  'train_neg_mean_absolute_error': -11.219385165580363,
  'test_neg_root_mean_squared_error': -15.40378917562557,
  'train_neg_root_mean_squared_error': -15.383913474156547},
 'style': {'fit_time': 0.06917593479156495,
  'score_time': 0.008000349998474121,
  'test_neg_mean_absolute_error': -13.159598880075686,
  'train_neg_mean_absolute_error': -13.143793227149377,
  'test_neg_root_mean_squared_error': -16.307174390679766,
  'train_neg_root_mean_squared_error': -16.284315680006248},
 'readability': {'fit_time': 0.07654948234558105,
  'score_

In [None]:
# One row per feature-group combination, ranked by mean training MAE.
# Scores are negated errors, so descending order puts the smallest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='train_neg_mean_absolute_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
structure+stability+style+readability+length,0.708886,0.02496,-5.542329,-5.523301,-6.832905,-6.806767
structure+stability+style+readability,0.692496,0.025592,-5.542823,-5.524834,-6.834691,-6.809983
stability+style+readability+length,0.515175,0.018419,-5.557888,-5.546429,-6.839863,-6.826501
stability+style+readability,0.39399,0.024247,-5.557266,-5.547388,-6.840561,-6.828992
structure+stability+readability+length,0.427441,0.019667,-5.563184,-5.547664,-6.850294,-6.828937
structure+stability+readability,0.536549,0.026432,-5.562669,-5.548941,-6.850977,-6.831519
stability+readability+length,0.260568,0.017493,-5.577214,-5.570214,-6.856587,-6.848251
stability+readability,0.276341,0.015122,-5.576509,-5.570394,-6.857023,-6.850182
structure+style+readability+length,0.268374,0.011768,-5.598069,-5.580835,-6.885022,-6.861656
structure+style+readability,0.22729,0.00876,-5.598715,-5.582364,-6.887439,-6.86541


In [None]:
# One row per feature-group combination, ranked by mean validation-fold MAE.
# Scores are negated errors, so descending order puts the smallest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='test_neg_mean_absolute_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
structure+stability+style+readability+length,0.708886,0.02496,-5.542329,-5.523301,-6.832905,-6.806767
structure+stability+style+readability,0.692496,0.025592,-5.542823,-5.524834,-6.834691,-6.809983
stability+style+readability,0.39399,0.024247,-5.557266,-5.547388,-6.840561,-6.828992
stability+style+readability+length,0.515175,0.018419,-5.557888,-5.546429,-6.839863,-6.826501
structure+stability+readability,0.536549,0.026432,-5.562669,-5.548941,-6.850977,-6.831519
structure+stability+readability+length,0.427441,0.019667,-5.563184,-5.547664,-6.850294,-6.828937
stability+readability,0.276341,0.015122,-5.576509,-5.570394,-6.857023,-6.850182
stability+readability+length,0.260568,0.017493,-5.577214,-5.570214,-6.856587,-6.848251
structure+style+readability+length,0.268374,0.011768,-5.598069,-5.580835,-6.885022,-6.861656
structure+style+readability,0.22729,0.00876,-5.598715,-5.582364,-6.887439,-6.86541


In [None]:
# One row per feature-group combination, ranked by mean training RMSE.
# Scores are negated errors, so descending order puts the smallest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='train_neg_root_mean_squared_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
structure+stability+style+readability+length,0.708886,0.02496,-5.542329,-5.523301,-6.832905,-6.806767
structure+stability+style+readability,0.692496,0.025592,-5.542823,-5.524834,-6.834691,-6.809983
stability+style+readability+length,0.515175,0.018419,-5.557888,-5.546429,-6.839863,-6.826501
structure+stability+readability+length,0.427441,0.019667,-5.563184,-5.547664,-6.850294,-6.828937
stability+style+readability,0.39399,0.024247,-5.557266,-5.547388,-6.840561,-6.828992
structure+stability+readability,0.536549,0.026432,-5.562669,-5.548941,-6.850977,-6.831519
stability+readability+length,0.260568,0.017493,-5.577214,-5.570214,-6.856587,-6.848251
stability+readability,0.276341,0.015122,-5.576509,-5.570394,-6.857023,-6.850182
structure+style+readability+length,0.268374,0.011768,-5.598069,-5.580835,-6.885022,-6.861656
structure+style+readability,0.22729,0.00876,-5.598715,-5.582364,-6.887439,-6.86541


In [None]:
# One row per feature-group combination, ranked by mean validation-fold RMSE.
# Scores are negated errors, so descending order puts the smallest error first.
(
    pd.DataFrame(scores)
    .transpose()
    .sort_values(by='test_neg_root_mean_squared_error', ascending=False)
)

Unnamed: 0,fit_time,score_time,test_neg_mean_absolute_error,train_neg_mean_absolute_error,test_neg_root_mean_squared_error,train_neg_root_mean_squared_error
structure+stability+style+readability+length,0.708886,0.02496,-5.542329,-5.523301,-6.832905,-6.806767
structure+stability+style+readability,0.692496,0.025592,-5.542823,-5.524834,-6.834691,-6.809983
stability+style+readability+length,0.515175,0.018419,-5.557888,-5.546429,-6.839863,-6.826501
stability+style+readability,0.39399,0.024247,-5.557266,-5.547388,-6.840561,-6.828992
structure+stability+readability+length,0.427441,0.019667,-5.563184,-5.547664,-6.850294,-6.828937
structure+stability+readability,0.536549,0.026432,-5.562669,-5.548941,-6.850977,-6.831519
stability+readability+length,0.260568,0.017493,-5.577214,-5.570214,-6.856587,-6.848251
stability+readability,0.276341,0.015122,-5.576509,-5.570394,-6.857023,-6.850182
structure+style+readability+length,0.268374,0.011768,-5.598069,-5.580835,-6.885022,-6.861656
structure+style+readability,0.22729,0.00876,-5.598715,-5.582364,-6.887439,-6.86541
