In [52]:
import pandas as pd
import json
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

import spacy
nlp = spacy.load('en_core_web_md')

In [2]:
original = pd.read_csv('~/Downloads/wikihowSep.csv')
original.head()

Unnamed: 0,overview,headline,text,sectionLabel,title
0,So you're a new or aspiring artist and your c...,\nSell yourself first.,"Before doing anything else, stop and sum up y...",Steps,How to Sell Fine Art Online
1,"If you want to be well-read, then, in the wor...",\nRead the classics before 1600.,Reading the classics is the very first thing ...,Reading the Classics,How to Be Well Read
2,So you're a new or aspiring artist and your c...,\nJoin online artist communities.,Depending on what scale you intend to sell yo...,Steps,How to Sell Fine Art Online
3,So you're a new or aspiring artist and your c...,\nMake yourself public.,Get yourself out there as best as you can by ...,Steps,How to Sell Fine Art Online
4,So you're a new or aspiring artist and your c...,\nBlog about your artwork.,"Given the hundreds of free blogging websites,...",Steps,How to Sell Fine Art Online


In [3]:
# drop rows where at least 1 element is missing
df = original.dropna()

In [4]:
df.shape

(1383743, 5)

In [5]:
def process_instructions(dataframe) -> dict:
    # dictionary of title (article) to text (list of steps)
    wikihow = dict()

    for idx, row in df.iterrows():
        title = row['title']
        text = row['text']

        if title and text:
            if title in wikihow:
                wikihow[title].append(text)
            else:
                wikihow[title] = [text]
    
    return wikihow

In [6]:
# list of list format of the custom corpus for gensim modeling 
def get_text(database: dict) -> list:
    all_text = list()
    
    for article, instructions in database.items():
        text = [article] + instructions
        all_text.append(text)
    
    return all_text

In [7]:
# pairs of (instruction, rank)
def get_instruction_rank(database: dict) -> list:
    instruction_rank = list()
    
    for article, instructions in database.items():
        for idx, step in enumerate(instructions, start=1):
            instruction_rank.append((step, idx / len(instructions)))
    
    return instruction_rank

In [24]:
# convert text instructions to a vector
def convert_vector(instruction_rank: list, num_examples: int) -> list:
    vector_rank = list()
    
    for step in instruction_rank[:num_examples]:
        instruction, rank = step[0], step[1]
        
        doc = nlp(instruction)
        vector_rank.append((doc.vector, rank))
    
    return vector_rank

In [36]:
# split dataset into X, y (train, test)
def split_data(vector_rank: list):
    X = [item[0] for item in vector_rank]
    y = [item[1] for item in vector_rank] 
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    
    return X_train, X_test, y_train, y_test

In [8]:
wikihow = process_instructions(dataframe=df)

In [9]:
# number of articles
len(wikihow.keys())

211223

In [13]:
with open('wikihow.json', 'w') as f:
    json.dump(wikihow, f)

In [14]:
instruction_rank = get_instruction_rank(database=wikihow)

In [31]:
%%time
vector_rank = convert_vector(instruction_rank, num_examples=100000)

CPU times: user 19min 51s, sys: 1min 6s, total: 20min 58s
Wall time: 20min 58s


In [32]:
with open('vector_rank.pickle', 'wb') as handle:
    pickle.dump(vector_rank, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [33]:
len(vector_rank)

100000

In [37]:
X_train, X_test, y_train, y_test = split_data(vector_rank)

In [40]:
len(X_train), len(y_train), len(X_test), len(y_test)

(80000, 80000, 20000, 20000)

In [43]:
rf = RandomForestRegressor()

In [45]:
%%time
rf.fit(X_train, y_train)

CPU times: user 34min 16s, sys: 1.59 s, total: 34min 18s
Wall time: 34min 19s


RandomForestRegressor()

In [49]:
rank_predictions = rf.predict(X_test)

rf_mse = mean_squared_error(y_test, rank_predictions)
rf_rmse = np.sqrt(rf_mse)

In [50]:
rf_rmse

0.2823636343517341

In [53]:
mean_absolute_error(y_test, rank_predictions)

0.24373600022702457