<a href="https://colab.research.google.com/github/vigilant-umbrella/automatic-quality-estimation/blob/main/data_creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so files kept there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%shell
pip install -q wikihowunofficialapi readability syntok

  Building wheel for readability (setup.py) ... [?25l[?25hdone




In [None]:
import os
import sys

import pandas as pd
import wikihowunofficialapi as wha

import readability
import syntok.segmenter as segmenter

import numpy as np

In [None]:
# Work from the wikiHow folder on the Drive mounted at /content/drive above.
# An absolute path keeps this cell re-runnable: the relative form stops
# resolving once the working directory has already been changed.
os.chdir('/content/drive/My Drive/wikiHow')

In [None]:
if sys.version_info[1] >= 8:
  data = pd.read_pickle('wikihow.pickle')
  data.head()
else:
  ! pip install -q pickle5
  import pickle5
  with open('wikihow.pickle', 'rb') as f:
    data = pickle5.load(f)

data.head()

[?25l[K     |█▎                              | 10 kB 26.3 MB/s eta 0:00:01[K     |██▋                             | 20 kB 8.8 MB/s eta 0:00:01[K     |███▉                            | 30 kB 7.5 MB/s eta 0:00:01[K     |█████▏                          | 40 kB 3.5 MB/s eta 0:00:01[K     |██████▍                         | 51 kB 3.5 MB/s eta 0:00:01[K     |███████▊                        | 61 kB 4.2 MB/s eta 0:00:01[K     |█████████                       | 71 kB 4.4 MB/s eta 0:00:01[K     |██████████▎                     | 81 kB 4.4 MB/s eta 0:00:01[K     |███████████▌                    | 92 kB 4.9 MB/s eta 0:00:01[K     |████████████▉                   | 102 kB 4.1 MB/s eta 0:00:01[K     |██████████████                  | 112 kB 4.1 MB/s eta 0:00:01[K     |███████████████▍                | 122 kB 4.1 MB/s eta 0:00:01[K     |████████████████▋               | 133 kB 4.1 MB/s eta 0:00:01[K     |██████████████████              | 143 kB 4.1 MB/s eta 0:00:01[K    

Unnamed: 0,info
0,How to Get Rid of Acne Scars on Your Chest
1,How to Get a Copy of Your Approved I‐140 Notice
2,How to Get Sweat Stains out of Clothing
3,How to Recognize the Warning Signs of a Stroke
4,How to Manage a Restaurant


# Utils

In [None]:
def tokenize(text):
    """Segment ``text`` and lay it out the way the readability README suggests.

    Args:
        text: Raw text handed to ``segmenter.analyze``.

    Returns:
        A string in which tokens are separated by single spaces, sentences by
        a newline and paragraphs by a blank line.
    """
    rendered_paragraphs = []
    for paragraph in segmenter.analyze(text):
        rendered_sentences = []
        for sentence in paragraph:
            rendered_sentences.append(' '.join(token.value for token in sentence))
        rendered_paragraphs.append('\n'.join(rendered_sentences))

    return '\n\n'.join(rendered_paragraphs)

In [None]:
def create_readability_object_full_text(article):
    """Compute readability measures over the full text of an article.

    The text is the title, the intro, and then every method title followed by
    each of its steps' titles and descriptions, all separated by blank lines.

    Args:
        article: Object exposing ``title``, ``intro`` and ``methods``; every
            method exposes ``title`` and ``steps``; every step exposes
            ``title`` and ``description``.

    Returns:
        The result of ``readability.getmeasures`` for the tokenized text.
    """
    pieces = [article.title, article.intro]
    for method in article.methods:
        pieces.append(method.title)
        for step in method.steps:
            pieces.extend((step.title, step.description))

    full_text = '\n\n'.join(pieces)
    return readability.getmeasures(tokenize(full_text), lang='en')

In [None]:
def create_method_data(article):
    """Compute readability measures separately for every method of an article.

    Each method's text is its title followed by the title and description of
    each of its steps, separated by blank lines.

    Args:
        article: Object exposing ``methods``; every method exposes ``title``
            and ``steps``; every step exposes ``title`` and ``description``.

    Returns:
        A list with one ``readability.getmeasures`` result per method, in the
        order of ``article.methods``.
    """
    measures_per_method = []
    for method in article.methods:
        pieces = [method.title]
        for step in method.steps:
            pieces.extend((step.title, step.description))

        method_text = '\n\n'.join(pieces)
        measures_per_method.append(
            readability.getmeasures(tokenize(method_text), lang='en'))

    return measures_per_method

In [None]:
# Read the article with row['info'] rather than row.info: attribute access is
# shadowed by the Series.info method on pandas >= 1.4, which would hand the
# helpers a bound method instead of the stored article.
# Readability measures of each article's full text (one mapping per row).
text_feature_data = data.apply(lambda row: create_readability_object_full_text(row['info']), axis=1)
# Readability measures of each individual method (one list per row).
method_data = data.apply(lambda row: create_method_data(row['info']), axis=1)

# Structure Features

In [None]:
def method_count(obj):
    """Return how many methods the article has.

    Args:
        obj: Article object exposing an ``n_methods`` attribute.

    Returns:
        The value of ``obj.n_methods``.
    """
    n_methods = obj.n_methods
    return n_methods

In [None]:
def mean_method_size(obj):
    """Return the average character count per method.

    Args:
        obj: Sequence of per-method readability measures, each holding a
            ``characters`` entry under ``sentence info``.

    Returns:
        Total characters across the methods divided by the number of methods.
    """
    sizes = [method['sentence info']['characters'] for method in obj]
    total_characters = sum(sizes)
    return total_characters / len(obj)

In [None]:
def mean_paragraph_size(obj):
    """Return the average character count per paragraph.

    The intro counts as one paragraph and every step description as one more.

    Args:
        obj: Article object exposing ``intro`` and ``methods``; every method
            exposes ``steps`` and every step a ``description``.

    Returns:
        The ``characters`` measure of the joined paragraphs divided by the
        number of paragraphs.
    """
    paragraphs = [obj.intro]
    for method in obj.methods:
        for step in method.steps:
            paragraphs.append(step.description)

    joined_text = '\n\n'.join(paragraphs)
    measures = readability.getmeasures(tokenize(joined_text), lang='en')
    return measures['sentence info']['characters'] / len(paragraphs)

In [None]:
def size_largest_method(obj):
    """Return the character count of the largest method.

    Args:
        obj: Sequence of per-method readability measures, each holding a
            ``characters`` entry under ``sentence info``.

    Returns:
        The maximum ``characters`` value among the methods.
    """
    sizes = [method['sentence info']['characters'] for method in obj]
    return max(sizes)

In [None]:
def size_shortest_method(obj):
    """Return the character count of the shortest method.

    Args:
        obj: Sequence of per-method readability measures, each holding a
            ``characters`` entry under ``sentence info``.

    Returns:
        The minimum ``characters`` value among the methods.
    """
    sizes = [method['sentence info']['characters'] for method in obj]
    return min(sizes)

In [None]:
def std_method_size(obj):
    """Return the standard deviation of the methods' character counts.

    Args:
        obj: Sequence of per-method readability measures, each holding a
            ``characters`` entry under ``sentence info``.

    Returns:
        ``np.std`` of the per-method ``characters`` values.
    """
    sizes = []
    for method in obj:
        sizes.append(method['sentence info']['characters'])

    return np.std(sizes)

In [None]:
def step_count(obj):
    """Return the total number of steps across all methods of the article.

    Args:
        obj: Article object exposing ``methods``; every method exposes an
            iterable ``steps``.

    Returns:
        The number of steps summed over every method.
    """
    return sum(1 for method in obj.methods for _ in method.steps)

In [None]:
def introduction_size(obj):
    """Return the character count of the article's introduction.

    Args:
        obj: Article object exposing an ``intro`` attribute.

    Returns:
        The ``characters`` entry of the ``sentence info`` measures computed
        for the tokenized intro.
    """
    measures = readability.getmeasures(tokenize(obj.intro), lang='en')
    sentence_info = measures['sentence info']
    return sentence_info['characters']

In [None]:
def summary_size(obj):
    """Return the character count of the article's summary.

    Args:
        obj: Article object exposing a ``summary`` attribute (may be ``None``).

    Returns:
        0 when the summary is ``None``; otherwise the ``characters`` entry of
        the ``sentence info`` measures computed for the tokenized summary.
    """
    summary = obj.summary
    if summary is not None:
        measures = readability.getmeasures(tokenize(summary), lang='en')
        return measures['sentence info']['characters']

    return 0

In [None]:
def references_count(obj):
    """Return the article's reference count.

    Args:
        obj: Article object exposing a ``references`` attribute.

    Returns:
        The value of ``obj.references``.
    """
    n_references = obj.references
    return n_references

In [None]:
def references_count_per_text_length(obj, text_features=None):
    """The number of references divided by the character count.

    Args:
        obj: Article object exposing a ``references`` attribute.
        text_features: Optional readability measures of the article's full
            text, i.e. a mapping with a ``characters`` entry under
            ``sentence info``. When omitted, the measures are computed with
            ``create_readability_object_full_text(obj)``.

    Returns:
        ``obj.references`` divided by the full-text character count.
    """
    if text_features is None:
        # The article does not carry readability measures itself; previously
        # the article was subscripted directly (obj['sentence info']), mixing
        # the article object with the readability mapping.
        text_features = create_readability_object_full_text(obj)

    return obj.references / text_features['sentence info']['characters']

In [None]:
def references_count_per_method(obj):
    """Return the reference count divided by the method count.

    Args:
        obj: Article object exposing ``references`` and ``n_methods``.

    Returns:
        ``obj.references / obj.n_methods``.
    """
    n_references = obj.references
    n_methods = obj.n_methods
    return n_references / n_methods

In [None]:
def image_count(obj):
    """Return how many steps of the article carry a picture.

    Args:
        obj: Article object exposing ``methods``; every method exposes
            ``steps`` and every step a ``picture`` attribute.

    Returns:
        The number of steps whose ``picture`` is not ``None``.
    """
    return sum(
        1
        for method in obj.methods
        for step in method.steps
        if step.picture is not None
    )

# Stability Features

In [None]:
def num_votes(obj):
    """Return how many people rated the article.

    Args:
        obj: Article object exposing a ``num_votes`` attribute.

    Returns:
        The value of ``obj.num_votes``.
    """
    votes = obj.num_votes
    return votes

In [None]:
def is_expert(obj):
    """Return the article's expert-author flag.

    Args:
        obj: Article object exposing an ``is_expert`` attribute.

    Returns:
        The value of ``obj.is_expert``.
    """
    expert_flag = obj.is_expert
    return expert_flag

In [None]:
def views(obj):
    """Return the article's view count.

    Args:
        obj: Article object exposing a ``views`` attribute.

    Returns:
        The value of ``obj.views``.
    """
    n_views = obj.views
    return n_views

In [None]:
def co_authors(obj):
    """Return the co-author figure recorded on the article.

    Args:
        obj: Article object exposing a ``co_authors`` attribute.

    Returns:
        The value of ``obj.co_authors``.
    """
    n_co_authors = obj.co_authors
    return n_co_authors

In [None]:
def warnings(obj):
    """Return how many warnings the article lists.

    Args:
        obj: Article object exposing a sized ``warnings`` collection.

    Returns:
        ``len(obj.warnings)``.
    """
    article_warnings = obj.warnings
    return len(article_warnings)

In [None]:
def tips(obj):
    """Return how many tips the article lists.

    Args:
        obj: Article object exposing a sized ``tips`` collection.

    Returns:
        ``len(obj.tips)``.
    """
    article_tips = obj.tips
    return len(article_tips)

# Style Features

In [None]:
def to_be_verb(obj):
    """Return the "to be" verb figure from the word-usage measures.

    Args:
        obj: Readability measures mapping with a ``word usage`` section.

    Returns:
        The ``tobeverb`` entry of ``obj['word usage']``.
    """
    word_usage = obj['word usage']
    return word_usage['tobeverb']

In [None]:
def aux_verb(obj):
    """Return the auxiliary-verb figure from the word-usage measures.

    Args:
        obj: Readability measures mapping with a ``word usage`` section.

    Returns:
        The ``auxverb`` entry of ``obj['word usage']``.
    """
    word_usage = obj['word usage']
    return word_usage['auxverb']

In [None]:
def conjunction(obj):
    """Return the conjunction figure from the word-usage measures.

    Args:
        obj: Readability measures mapping with a ``word usage`` section.

    Returns:
        The ``conjunction`` entry of ``obj['word usage']``.
    """
    word_usage = obj['word usage']
    return word_usage['conjunction']

In [None]:
def pronoun(obj):
    """Return the pronoun figure from the word-usage measures.

    Args:
        obj: Readability measures mapping with a ``word usage`` section.

    Returns:
        The ``pronoun`` entry of ``obj['word usage']``.
    """
    word_usage = obj['word usage']
    return word_usage['pronoun']

In [None]:
def preposition(obj):
    """Return the preposition figure from the word-usage measures.

    Args:
        obj: Readability measures mapping with a ``word usage`` section.

    Returns:
        The ``preposition`` entry of ``obj['word usage']``.
    """
    word_usage = obj['word usage']
    return word_usage['preposition']

In [None]:
def nominalization(obj):
    """Return the nominalization figure from the word-usage measures.

    Args:
        obj: Readability measures mapping with a ``word usage`` section.

    Returns:
        The ``nominalization`` entry of ``obj['word usage']``.
    """
    word_usage = obj['word usage']
    return word_usage['nominalization']

In [None]:
def sentence_beginning_pronoun(obj):
    """Return the figure for sentences that start with a pronoun.

    Args:
        obj: Readability measures mapping with a ``sentence beginnings`` section.

    Returns:
        The ``pronoun`` entry of ``obj['sentence beginnings']``.
    """
    beginnings = obj['sentence beginnings']
    return beginnings['pronoun']

In [None]:
def sentence_beginning_interrogative(obj):
    """Return the figure for sentences that start with an interrogative.

    Args:
        obj: Readability measures mapping with a ``sentence beginnings`` section.

    Returns:
        The ``interrogative`` entry of ``obj['sentence beginnings']``.
    """
    beginnings = obj['sentence beginnings']
    return beginnings['interrogative']

In [None]:
def sentence_beginning_article(obj):
    """Return the figure for sentences that start with an article.

    Args:
        obj: Readability measures mapping with a ``sentence beginnings`` section.

    Returns:
        The ``article`` entry of ``obj['sentence beginnings']``.
    """
    beginnings = obj['sentence beginnings']
    return beginnings['article']

In [None]:
def sentence_beginning_subordination(obj):
    """Return the figure for sentences that start with a subordination.

    Args:
        obj: Readability measures mapping with a ``sentence beginnings`` section.

    Returns:
        The ``subordination`` entry of ``obj['sentence beginnings']``.
    """
    beginnings = obj['sentence beginnings']
    return beginnings['subordination']

In [None]:
def sentence_beginning_conjunction(obj):
    """Return the figure for sentences that start with a conjunction.

    Args:
        obj: Readability measures mapping with a ``sentence beginnings`` section.

    Returns:
        The ``conjunction`` entry of ``obj['sentence beginnings']``.
    """
    beginnings = obj['sentence beginnings']
    return beginnings['conjunction']

In [None]:
def sentence_beginning_preposition(obj):
    """Return the figure for sentences that start with a preposition.

    Args:
        obj: Readability measures mapping with a ``sentence beginnings`` section.

    Returns:
        The ``preposition`` entry of ``obj['sentence beginnings']``.
    """
    beginnings = obj['sentence beginnings']
    return beginnings['preposition']

# Readability Features

In [None]:
def Kincaid(obj):
    """Return the Kincaid grade, an indicator of how hard an English passage is to understand.

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``Kincaid`` entry of ``obj['readability grades']``.
    """
    grades = obj['readability grades']
    return grades['Kincaid']

In [None]:
def ARI(obj):
    """Return the Automated Readability Index (ARI), a test of how understandable a text is.

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``ARI`` entry of ``obj['readability grades']``.
    """
    grades = obj['readability grades']
    return grades['ARI']

In [None]:
def Coleman_Liau(obj):
    """Return the Coleman-Liau index, a test gauging how understandable a text is.

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``Coleman-Liau`` entry of ``obj['readability grades']``.
    """
    grades = obj['readability grades']
    return grades['Coleman-Liau']

In [None]:
def FleschReadingEase(obj):
    """Return the Flesch Reading Ease score, a measure of a text's readability.

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``FleschReadingEase`` entry of ``obj['readability grades']``.
    """
    grades = obj['readability grades']
    return grades['FleschReadingEase']

In [None]:
def GunningFogIndex(obj):
    """Return the Gunning Fog index: years of education hypothetically needed to understand the text.

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``GunningFogIndex`` entry of ``obj['readability grades']``.
    """
    grades = obj['readability grades']
    return grades['GunningFogIndex']

In [None]:
def LIX(obj):
    """Return the LIX score, a readability measure for the difficulty of reading a foreign text.

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``LIX`` entry of ``obj['readability grades']``.
    """
    grades = obj['readability grades']
    return grades['LIX']

In [None]:
def SMOGIndex(obj):
    """Return the SMOG index: years of education the average person needs to understand the text.

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``SMOGIndex`` entry of ``obj['readability grades']``.
    """
    grades = obj['readability grades']
    return grades['SMOGIndex']

In [None]:
def RIX(obj):
    """The RIX readability index (long words per sentence).

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``RIX`` entry of ``obj['readability grades']``.
    """

    return obj['readability grades']['RIX']

In [None]:
def DaleChallIndex(obj):
    """Return the Dale-Chall index, a numeric gauge of how hard a text is to comprehend.

    Args:
        obj: Readability measures mapping with a ``readability grades`` section.

    Returns:
        The ``DaleChallIndex`` entry of ``obj['readability grades']``.
    """
    grades = obj['readability grades']
    return grades['DaleChallIndex']

# Length Features

In [None]:
def character_count(obj):
    """Return the character count of the text.

    Args:
        obj: Readability measures mapping with a ``sentence info`` section.

    Returns:
        The ``characters`` entry of ``obj['sentence info']``.
    """
    sentence_info = obj['sentence info']
    return sentence_info['characters']

In [None]:
def word_count(obj):
    """Return the word count of the text.

    Args:
        obj: Readability measures mapping with a ``sentence info`` section.

    Returns:
        The ``words`` entry of ``obj['sentence info']``.
    """
    sentence_info = obj['sentence info']
    return sentence_info['words']

# Target

In [None]:
def percent_helpful(obj):
    """Return the percentage of people who found the article helpful.

    Args:
        obj: Article object exposing a ``percent_helpful`` attribute.

    Returns:
        The value of ``obj.percent_helpful``.
    """
    helpful = obj.percent_helpful
    return helpful

# Creating Features

In [None]:
# Assemble the feature table: one row per article, one column per feature.
# Article-based features read the article with row['info'] rather than
# row.info: attribute access is shadowed by the Series.info method on
# pandas >= 1.4, which would pass a bound method to the feature functions.
final = pd.DataFrame()

# Length Features
final['character_count'] = text_feature_data.apply(character_count)
final['word_count'] = text_feature_data.apply(word_count)

# Structure Features
final['method_count'] = data.apply(lambda row: method_count(row['info']), axis=1)
final['mean_method_size'] = method_data.apply(mean_method_size)
final['mean_paragraph_size'] = data.apply(lambda row: mean_paragraph_size(row['info']), axis=1)
final['size_largest_method'] = method_data.apply(size_largest_method)
final['size_shortest_method'] = method_data.apply(size_shortest_method)
final['std_method_size'] = method_data.apply(std_method_size)
final['step_count'] = data.apply(lambda row: step_count(row['info']), axis=1)
final['mean_steps_per_method'] = final['step_count']/final['method_count']
final['introduction_size'] = data.apply(lambda row: introduction_size(row['info']), axis=1)
final['summary_size'] = data.apply(lambda row: summary_size(row['info']), axis=1)
final['references_count'] = data.apply(lambda row: references_count(row['info']), axis=1)
# Ratios are derived from the columns computed above instead of re-reading the articles.
final['references_count_per_text_length'] = final['references_count']/final['character_count']
final['references_count_per_method'] = final['references_count']/final['method_count']
final['image_count'] = data.apply(lambda row: image_count(row['info']), axis=1)
final['image_count_per_method'] = final['image_count'] / final['method_count']

# Stability Features
final['num_votes'] = data.apply(lambda row: num_votes(row['info']), axis=1)
final['is_expert'] = data.apply(lambda row: is_expert(row['info']), axis=1)
final['views'] = data.apply(lambda row: views(row['info']), axis=1)
final['co_authors'] = data.apply(lambda row: co_authors(row['info']), axis=1)
final['warnings'] = data.apply(lambda row: warnings(row['info']), axis=1)
final['tips'] = data.apply(lambda row: tips(row['info']), axis=1)

# Style Features
final['to_be_verb'] = text_feature_data.apply(to_be_verb)
final['aux_verb'] = text_feature_data.apply(aux_verb)
final['conjunction'] = text_feature_data.apply(conjunction)
final['pronoun'] = text_feature_data.apply(pronoun)
final['preposition'] = text_feature_data.apply(preposition)
final['nominalization'] = text_feature_data.apply(nominalization)
final['sentence_beginning_pronoun'] = text_feature_data.apply(sentence_beginning_pronoun)
final['sentence_beginning_interrogative'] = text_feature_data.apply(sentence_beginning_interrogative)
final['sentence_beginning_article'] = text_feature_data.apply(sentence_beginning_article)
final['sentence_beginning_subordination'] = text_feature_data.apply(sentence_beginning_subordination)
final['sentence_beginning_conjunction'] = text_feature_data.apply(sentence_beginning_conjunction)
final['sentence_beginning_preposition'] = text_feature_data.apply(sentence_beginning_preposition)

# Readability Features
final['Kincaid'] = text_feature_data.apply(Kincaid)
final['ARI'] = text_feature_data.apply(ARI)
final['Coleman_Liau'] = text_feature_data.apply(Coleman_Liau)
final['FleschReadingEase'] = text_feature_data.apply(FleschReadingEase)
final['GunningFogIndex'] = text_feature_data.apply(GunningFogIndex)
final['LIX'] = text_feature_data.apply(LIX)
final['SMOGIndex'] = text_feature_data.apply(SMOGIndex)
final['RIX'] = text_feature_data.apply(RIX)
final['DaleChallIndex'] = text_feature_data.apply(DaleChallIndex)

# Target
final['percent_helpful'] = data.apply(lambda row: percent_helpful(row['info']), axis=1)

In [None]:
# Preview the first rows of the assembled feature table.
final.head()

Unnamed: 0,character_count,word_count,method_count,mean_method_size,mean_paragraph_size,size_largest_method,size_shortest_method,std_method_size,step_count,mean_steps_per_method,...,Kincaid,ARI,Coleman_Liau,FleschReadingEase,GunningFogIndex,LIX,SMOGIndex,RIX,DaleChallIndex,percent_helpful
0,7107,1508,2,3357.5,379.823529,4321,2394,963.5,16,8.0,...,7.52376,8.15975,9.908877,67.453457,10.820887,36.866542,10.376433,3.264706,10.086871,78
1,5180,1176,2,2419.5,317.0,3171,1668,751.5,14,7.0,...,4.951055,5.505902,7.708411,81.614246,8.897157,33.807519,9.052403,2.652632,9.111023,99
2,3822,902,3,1114.333333,263.333333,1521,881,288.598144,11,3.666667,...,6.020412,6.72745,7.309495,81.109399,9.797251,31.8102,9.310165,2.527273,8.038576,88
3,11614,2429,3,3604.333333,393.111111,4148,2832,561.058721,26,8.666667,...,7.798307,8.728719,10.376311,66.362137,11.577962,38.743177,10.914639,3.584906,9.965255,82
4,9633,2004,4,2310.5,535.411765,2753,1922,329.19637,16,4.0,...,8.716429,9.423549,10.661749,61.827048,13.316999,42.773535,12.116729,4.327869,9.123631,100


In [None]:
# (rows, columns) of the feature table.
final.shape

(19917, 45)

In [None]:
# Write the feature table to wikihow.csv in the current working directory, leaving out the index.
final.to_csv('wikihow.csv', index=False)