<a href="https://colab.research.google.com/github/sumedhakoranga/wikihow_most_helpful_article_predictor/blob/main/XGBoost_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialization

In [None]:
import pandas as pd
import numpy as np

from itertools import product
from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the article dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="wikihow.csv")

### Filling the missing values with the median

In [None]:
# Fill gaps in the listed columns with each column's own median.
# Columns that are absent from the loaded frame are skipped instead of raising
# a KeyError: the column listing displayed further down in this notebook shows
# neither 'views' nor 'co_authors'.
# NOTE(review): the medians are computed on the full dataset, before the
# train/validation/test split in the next cell, so held-out rows influence the
# fill values. The pipeline's own median SimpleImputer is fitted on training
# data only - consider relying on it instead.
for attribute in ['references_count', 'references_count_per_text_length', 'references_count_per_method', 'views', 'co_authors']:
  if attribute not in df.columns:
    continue
  df[attribute] = df[attribute].fillna(df[attribute].median())

In [None]:
# 60/20/20 split: hold out 20% of the rows for testing, then take 25% of the
# remaining 80% (20% of the total) for validation.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Renumber the shuffled training rows from 0 (the old index is discarded),
# then preview the first five rows.
df_train = df_train.reset_index(drop=True)
df_train.head(n=5)

Unnamed: 0,character_count,word_count,method_count,mean_method_size,mean_paragraph_size,size_largest_method,size_shortest_method,std_method_size,step_count,mean_steps_per_method,...,Kincaid,ARI,Coleman_Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,percent_helpful
0,6896,1580,3,2178.0,361.5,3170,1290,771.021833,17,5.666667,...,4.130898,5.298938,7.465196,87.431663,7.621044,29.242484,7.984351,2.085938,8.526041,79
1,4776,1152,2,2252.5,326.846154,2730,1775,477.5,12,6.0,...,5.270994,6.325446,6.77824,86.584125,9.117579,31.56131,8.593363,2.485714,9.236377,100
2,12400,2673,4,2986.5,547.619048,4872,1922,1120.930306,20,5.0,...,8.686773,9.329607,9.81538,64.522111,12.649886,41.164557,11.590693,4.16,9.458808,89
3,3183,615,3,896.0,198.384615,1005,792,87.028731,12,4.0,...,8.376681,8.976534,12.177431,56.484075,12.498326,42.140124,11.331372,3.627451,9.292553,94
4,6335,1359,3,1973.0,411.0,2363,1600,311.725306,13,4.333333,...,7.365559,7.992772,9.627021,68.854407,10.741838,36.494037,10.307981,3.21978,8.850482,83


In [None]:
# Every column except the last one; later cells use this slice as the feature set.
df.columns[:-1]

Index(['character_count', 'word_count', 'method_count', 'mean_method_size',
       'mean_paragraph_size', 'size_largest_method', 'size_shortest_method',
       'std_method_size', 'step_count', 'mean_steps_per_method',
       'introduction_size', 'summary_size', 'references_count',
       'references_count_per_text_length', 'references_count_per_method',
       'image_count', 'image_count_per_method', 'num_votes', 'is_expert',
       'conjunction', 'pronoun', 'preposition', 'nominalization',
       'sentence_beginning_pronoun', 'sentence_beginning_interrogative',
       'sentence_beginning_article', 'sentence_beginning_subordination',
       'sentence_beginning_conjunction', 'sentence_beginning_preposition',
       'Kincaid', 'ARI', 'Coleman_Liau', 'FleschReadingEase',
       'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex'],
      dtype='object')

In [None]:
# Target column name, kept as a one-element list: indexing a frame with it
# yields a one-column DataFrame from which later cells pull the Series.
regression_target = ['percent_helpful']

## Splitting the dataset into features and target

In [None]:
# Every column but the last is a feature; 'percent_helpful' is the target.
feature_columns = df.columns[:-1]

X_train, X_val = df_train[feature_columns], df_val[feature_columns]
y_train = df_train[regression_target]['percent_helpful']
y_val = df_val[regression_target]['percent_helpful']

In [None]:
# Sanity-check the feature/target frames built in the previous cell instead of
# rebuilding them a second time.
assert len(X_train) == len(y_train), "training features and target differ in length"
assert len(X_val) == len(y_val), "validation features and target differ in length"
assert X_train.columns.equals(X_val.columns), "train/validation feature columns differ"
assert 'percent_helpful' not in X_train.columns, "target column leaked into the features"

In [None]:
# Preview the first rows of the training feature matrix.
X_train.head()

Unnamed: 0,character_count,word_count,method_count,mean_method_size,mean_paragraph_size,size_largest_method,size_shortest_method,std_method_size,step_count,mean_steps_per_method,...,sentence_beginning_preposition,Kincaid,ARI,Coleman_Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex
0,6896,1580,3,2178.0,361.5,3170,1290,771.021833,17,5.666667,...,6,4.130898,5.298938,7.465196,87.431663,7.621044,29.242484,7.984351,2.085938,8.526041
1,4776,1152,2,2252.5,326.846154,2730,1775,477.5,12,6.0,...,2,5.270994,6.325446,6.77824,86.584125,9.117579,31.56131,8.593363,2.485714,9.236377
2,12400,2673,4,2986.5,547.619048,4872,1922,1120.930306,20,5.0,...,11,8.686773,9.329607,9.81538,64.522111,12.649886,41.164557,11.590693,4.16,9.458808
3,3183,615,3,896.0,198.384615,1005,792,87.028731,12,4.0,...,2,8.376681,8.976534,12.177431,56.484075,12.498326,42.140124,11.331372,3.627451,9.292553
4,6335,1359,3,1973.0,411.0,2363,1600,311.725306,13,4.333333,...,5,7.365559,7.992772,9.627021,68.854407,10.741838,36.494037,10.307981,3.21978,8.850482


In [None]:
# Preview the first rows of the training target.
y_train.head()

0     79
1    100
2     89
3     94
4     83
Name: percent_helpful, dtype: int64

## Creating a Pipeline

In [None]:
def create_new_pipeline(params):
    """Build a fresh, unfitted impute -> scale -> XGBoost regression pipeline.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``XGBRegressor`` (for example
        ``n_estimators``, ``max_depth``, ``learning_rate``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Steps, in order: ``'preprocessing'`` (median imputation of the feature
        columns), ``'scaling'`` (``StandardScaler``) and ``'model'``
        (``XGBRegressor``).

    Notes
    -----
    The feature columns are read from the notebook-level ``df`` as every
    column except the last.
    """
    feature_columns = df.columns[:-1]

    # Median-impute every feature column; no other column is passed through.
    impute_features = ColumnTransformer(
        transformers=[
            ('numerical', SimpleImputer(strategy='median'), feature_columns),
        ]
    )

    regressor = XGBRegressor(
        objective='reg:squarederror',
        n_jobs=-1,
        random_state=42,
        **params
    )

    return Pipeline(
        steps=[
            ('preprocessing', impute_features),
            ('scaling', StandardScaler()),
            ('model', regressor),
        ]
    )

## Hyperparameter Tuning

In [None]:
# Grid of candidate hyperparameters; the key order matters because the search
# loop unpacks the Cartesian product of the values positionally.
# The tree-count and depth grids are integer-valued: the previous
# np.linspace(1, 4, num=3) produced 2.5, which is not a valid tree depth.
# NOTE(review): the learning-rate grid reaches 10.0, above the [0, 1] range
# XGBoost documents for eta - confirm this is intended.
search_space = {
    'n_estimators': np.linspace(2, 6, num=5, dtype=int),   # 2, 3, 4, 5, 6
    'max_depth': np.linspace(1, 4, num=4, dtype=int),      # 1, 2, 3, 4
    'learning_rate': np.logspace(-3, 1, num=5),            # 0.001 ... 10
    'reg_alpha': np.logspace(-1, 1, num=3),                # 0.1, 1, 10
    'reg_lambda': np.logspace(-1, 1, num=3)                # 0.1, 1, 10
}

In [None]:
# Exhaustive grid search: fit one pipeline per hyperparameter combination on
# the training split and keep the combination with the lowest validation RMSE.
best_score = float('inf')
best_params = {}

# Materialise the grid so tqdm knows the total and can show a real progress bar.
param_grid = list(product(*search_space.values()))

for n_estimators, max_depth, learning_rate, reg_alpha, reg_lambda in tqdm(param_grid):
    params = {
        'n_estimators': int(n_estimators),
        # Use the depth from the grid; it was previously hard-coded to 6, so
        # this axis of the search space was never explored.
        'max_depth': int(max_depth),
        'learning_rate': float(learning_rate),
        'reg_alpha': float(reg_alpha),
        'reg_lambda': float(reg_lambda)
    }

    pipeline = create_new_pipeline(params)

    pipeline.fit(X_train, y_train)

    # RMSE on the validation split. np.sqrt(MSE) is used instead of
    # squared=False, which newer scikit-learn releases no longer accept;
    # arguments follow the documented (y_true, y_pred) order.
    score = np.sqrt(mean_squared_error(y_val, pipeline.predict(X_val)))
    if score < best_score:
        best_score = score
        best_params = params

675it [03:27,  3.26it/s]


In [None]:
# Hyperparameter combination with the lowest validation RMSE found by the search.
best_params

{'n_estimators': 6,
 'max_depth': 6,
 'learning_rate': 0.1,
 'reg_alpha': 10.0,
 'reg_lambda': 0.1}

In [None]:
# Validation RMSE achieved by the best combination.
best_score

6.831316474384651

## Training

In [None]:
# Refit data: the training and validation rows together (everything except
# the held-out test split).
X, y = (
    df_full_train[df.columns[:-1]],
    df_full_train[regression_target]['percent_helpful'],
)

In [None]:
# Fresh, unfitted pipeline configured with the best hyperparameters found.
pipeline = create_new_pipeline(params=best_params)

In [None]:
# Fit the final model on the combined training + validation rows.
pipeline.fit(X=X, y=y)

## Evaluation

In [None]:
# Scoring only on X/y measures fit on the very rows the model was trained on.
# Report that training RMSE next to the RMSE on the held-out test split, which
# no earlier cell has used for fitting or tuning.
X_test = df_test[df.columns[:-1]]
y_test = df_test['percent_helpful']

# np.sqrt(MSE) instead of squared=False, which newer scikit-learn releases no
# longer accept; arguments follow the documented (y_true, y_pred) order.
pd.Series({
    'train_rmse': np.sqrt(mean_squared_error(y, pipeline.predict(X))),
    'test_rmse': np.sqrt(mean_squared_error(y_test, pipeline.predict(X_test))),
})

6.619280786065114