<a href="https://colab.research.google.com/github/sumedhakoranga/wikihow_most_helpful_article_predictor/blob/main/Linear_Regression_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Initialization

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.compose import TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from itertools import product

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the dataset from a CSV file in the current working directory.
DATA_PATH = "wikihow.csv"
df = pd.read_csv(DATA_PATH)

### Filling missing values with the median

In [None]:
# Columns whose missing entries are replaced by that column's own median.
# NOTE(review): the medians are computed over every row of `df`, i.e. before
# the train/validation/test split performed further down, so held-out rows
# contribute to the fill values — confirm this is acceptable.
median_filled_columns = ['references_count', 'references_count_per_text_length', 'references_count_per_method', 'views', 'co_authors']

for attribute in median_filled_columns:
  column_median = df[attribute].median()
  df[attribute] = df[attribute].fillna(column_median)

## Preparing data

In [None]:
# Two-stage split sharing one seed: first hold out 20% of all rows as the test
# set, then carve 25% of the remainder off as the validation set.
SPLIT_SEED = 42
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=SPLIT_SEED)

In [None]:
# Display every column label except the last one; the same slice is used
# later in this notebook to select the model's input features.
df.columns[:-1]

Index(['character_count', 'word_count', 'method_count', 'mean_method_size',
       'mean_paragraph_size', 'size_largest_method', 'size_shortest_method',
       'std_method_size', 'step_count', 'mean_steps_per_method',
       'introduction_size', 'summary_size', 'references_count',
       'references_count_per_text_length', 'references_count_per_method',
       'image_count', 'image_count_per_method', 'num_votes', 'is_expert',
       'conjunction', 'pronoun', 'preposition', 'nominalization',
       'sentence_beginning_pronoun', 'sentence_beginning_interrogative',
       'sentence_beginning_article', 'sentence_beginning_subordination',
       'sentence_beginning_conjunction', 'sentence_beginning_preposition',
       'Kincaid', 'ARI', 'Coleman_Liau', 'FleschReadingEase',
       'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex'],
      dtype='object')

In [None]:
# Name of the column to predict, kept as a one-element list.
regression_target = ['percent_helpful']

In [None]:
# Features are every column but the last; the target is the
# 'percent_helpful' column taken from the same split.
feature_columns = df.columns[:-1]

X_train, X_val = df_train[feature_columns], df_val[feature_columns]
y_train = df_train[regression_target]['percent_helpful']
y_val = df_val[regression_target]['percent_helpful']

## Creating a Pipeline

In [None]:
def create_pipeline():
    """Build a new, unfitted preprocessing + regression pipeline.

    Steps, in order:
      * 'imputer' - SimpleImputer using the median strategy
      * 'scaling' - MinMaxScaler applied to the features
      * 'model'   - LinearRegression wrapped in a TransformedTargetRegressor
                    whose target transformer is a MinMaxScaler

    Returns:
        Pipeline: a fresh pipeline object on every call.
    """
    target_scaled_regressor = TransformedTargetRegressor(
        regressor=LinearRegression(),
        transformer=MinMaxScaler(),
    )

    return Pipeline(
        steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaling', MinMaxScaler()),
            ('model', target_scaled_regressor),
        ]
    )

In [None]:
# Instantiate an unfitted pipeline.
pipeline = create_pipeline()

## Hyperparameter Tuning

In [None]:
# Candidate values for the hyperparameter grid: each listed option is tried
# both switched off and switched on.
search_space = {
    option: [False, True]
    for option in ('fit_intercept', 'normalize')
}

In [None]:
# Exhaustive search over `search_space`, scored by RMSE on the validation
# split (lower is better). Ties keep the first combination encountered.
best_score = float('inf')
best_params = {}

for fit_intercept, normalize in product(*search_space.values()):
    params = {
        'fit_intercept': fit_intercept,
        'normalize': normalize
    }

    pipeline = create_pipeline()

    # Push the candidate values into the LinearRegression nested inside the
    # 'model' step. Without this, every iteration fits the same default model
    # and the search cannot distinguish between candidates.
    # Options the installed scikit-learn does not expose (e.g. `normalize` on
    # recent releases) are skipped rather than raising.
    available = pipeline.get_params()
    overrides = {
        f'model__regressor__{name}': value
        for name, value in params.items()
    }
    pipeline.set_params(
        **{key: value for key, value in overrides.items() if key in available}
    )

    pipeline.fit(X_train, y_train)

    # RMSE computed as sqrt(MSE): `squared=False` is no longer accepted by
    # recent scikit-learn. Arguments follow the documented (y_true, y_pred) order.
    score = np.sqrt(mean_squared_error(y_val, pipeline.predict(X_val)))
    if score < best_score:
        best_score = score
        best_params = params

In [None]:
best_params

{'fit_intercept': False, 'normalize': False}

In [None]:
best_score

6.8886347921865125

## Training the Linear Regression model

In [None]:
# Data for the final fit, taken from the full training portion (the frame
# that was split into the training and validation sets).
feature_columns = df.columns[:-1]
X, y = df_full_train[feature_columns], df_full_train[regression_target]['percent_helpful']

In [None]:
# Fresh, unfitted pipeline for the final model.
# NOTE(review): this is built with the default settings; `best_params` from
# the search is not applied here — confirm whether that is intended.
pipeline = create_pipeline()

In [None]:
# Fit every pipeline step (imputer, scaler, model) on X and y.
pipeline.fit(X, y)

## Validation

In [None]:
# Report RMSE on the held-out test split, which was never passed to a fit
# call. Scoring on X, y (the data the pipeline was just fitted on) would only
# measure in-sample fit, not generalization.
X_test = df_test[df.columns[:-1]]
y_test = df_test[regression_target]['percent_helpful']

# sqrt(MSE) instead of `squared=False`, which recent scikit-learn no longer
# accepts; arguments follow the documented (y_true, y_pred) order.
np.sqrt(mean_squared_error(y_test, pipeline.predict(X_test)))

6.807932063756883