In [130]:
from __future__ import print_function
import json
import spacy
import string
import itertools
import numpy as np
from text2num import text2num
from textstat.textstat import textstat

# Feature Engineering
This notebook focuses on generating as many hand-crafted features as possible through various methods and techniques. It is simply a workbench we use to define each of the feature functions. In the end we hope to parallelize this process to speed up the preprocessing phase.

We will follow this up with a feature-importance analysis to evaluate which of these features are worth keeping.

In [2]:
# Application Directory Constants
DATA_DIR = '../Data/dataset/'


def _read_jsonl_lines(path):
    """Return the non-blank raw lines of the JSON Lines file at `path`.

    The file is opened in binary mode and closed as soon as it has been read.
    Blank lines are dropped explicitly, so a file without a trailing newline
    does not lose its last record (the previous `[:-1]` slice discarded it).
    """
    with open(path, 'rb') as handle:
        return [line for line in handle.read().splitlines() if line.strip()]


# Load Dataset
# Records are materialized as lists (not lazy `map` objects) so that they can
# be indexed, e.g. train_X[0], on both Python 2 and Python 3.
# NOTE(review): the files are assumed to be UTF-8 encoded JSON Lines -- confirm.
instance_raw = _read_jsonl_lines(DATA_DIR + 'instances_train.jsonl')
train_X = [json.loads(line.decode('utf-8')) for line in instance_raw]

truth_raw = _read_jsonl_lines(DATA_DIR + 'truth_train.jsonl')
train_Y = [json.loads(line.decode('utf-8')) for line in truth_raw]

# Load spaCy Object
nlp = spacy.load('en')

**1. Number of NNP**

In [69]:
def nnp_num(t):
    """Count the tokens of `t` whose spaCy coarse POS tag (`pos_`) is 'PROPN'."""
    return sum(1 for token in nlp(t) if token.pos_ == 'PROPN')

In [72]:
# Example: PROPN count for the 'targetTitle' of the first training record.
print(nnp_num(train_X[0]['targetTitle']))

1


**2. Readability of Target Paragraphs**

The TextStat package provides a large set of readability metrics we can try.

Source: https://github.com/shivam5992/textstat

In [88]:
# Print each textstat readability metric for the first training title,
# one value per line, in the order listed below.
title_text = train_X[0]['targetTitle']
for metric_name in (
    'flesch_reading_ease',
    'smog_index',
    'flesch_kincaid_grade',
    'coleman_liau_index',
    'automated_readability_index',
    'dale_chall_readability_score',
    'difficult_words',
    'linsear_write_formula',
    'gunning_fog',
    'text_standard',  # TODO: Convert this to a numerical rep.
):
    print(getattr(textstat, metric_name)(title_text))

54.22
0
9.9
12.12
11.2
10.98
7
8.5
8.8
8th and 9th grade


**3. Number of Tokens**

In [95]:
# Number of spaCy tokens in the 'targetTitle' of the first training record.
len(nlp(train_X[0]['targetTitle']))

21

**4. Word Length of Post Text**

In [98]:
# Total number of spaCy tokens over every entry of the first record's
# 'targetParagraphs'.
sum(len(nlp(paragraph)) for paragraph in train_X[0]['targetParagraphs'])

861

**5. POS 2-gram NNP NNP**

In [141]:
def pos2_nnp_nnp(title):
    """Count adjacent token pairs in `title` where both tokens are tagged 'PROPN'.

    Tags are spaCy coarse POS tags (`pos_`). Overlapping pairs are counted
    separately, so three consecutive PROPN tokens contribute two pairs.
    """
    tags = [token.pos_ for token in nlp(title)]
    pair_count = 0
    # zip stops at the shorter sequence, pairing every tag with its successor.
    for left, right in zip(tags, tags[1:]):
        if left == 'PROPN' and right == 'PROPN':
            pair_count += 1
    return pair_count

In [142]:
# Example: PROPN-PROPN bigram count for the 'targetTitle' of record index 2.
pos2_nnp_nnp(train_X[2]['targetTitle'])

2

**6. Whether Post Starts With Number**

Here we check for both digit-based numbers (e.g. "10") and spelled-out numbers (e.g. "ten") at the start of the post.

We use a pre-written library to help convert text to numbers and check if they are in fact numbers.

In [117]:
def is_numeric(text):
    """Return True if `text2num` converts `text` to an integer, otherwise False."""
    try:
        return isinstance(text2num(text), int)
    except Exception:
        # Best-effort check: any failure while converting means "not a number".
        return False


def num_start(x):
    """Return True if the string `x` starts with a number.

    The first whitespace-delimited word counts as a number when it begins
    with a digit (e.g. "10", "7-year-old") or when `is_numeric` accepts it as
    a spelled-out number. Empty or whitespace-only input returns False.

    The previous version passed only the first *character* of `x` to
    `is_numeric`, so spelled-out numbers could never match, and it raised
    IndexError on an empty string.
    """
    words = x.split()
    if not words:
        return False
    first_word = words[0]
    if first_word[0].isdigit():
        return True
    # Strip surrounding punctuation and lowercase before the word lookup.
    # NOTE(review): assumes text2num expects lowercase number words -- confirm.
    return is_numeric(first_word.strip(string.punctuation).lower())

In [118]:
# Example: does the 'targetTitle' of record index 2 start with a number?
num_start(train_X[2]['targetTitle'])

False

**7. Average Length of Words in Post**

In [134]:
def mean_wordlen(text):
    """Return the mean token length over an iterable of strings.

    Each string in `text` is tokenized with spaCy and `len()` is taken of
    every resulting token; the result is the mean of all those lengths.

    Returns 0.0 when no tokens are produced (e.g. an empty list or only
    empty strings). Previously that case produced `nan` together with a
    NumPy RuntimeWarning.
    """
    token_lengths = [len(token) for passage in text for token in nlp(passage)]
    if not token_lengths:
        return 0.0
    return np.mean(token_lengths)

In [135]:
# Example: mean token length over the first record's 'targetParagraphs'.
mean_wordlen(train_X[0]['targetParagraphs'])

3.9477351916376309

**8. Number of IN**

In [137]:
def in_num(t):
    """Count the tokens of `t` whose spaCy coarse POS tag (`pos_`) is 'ADP'.

    NOTE(review): the section title names the Penn tag IN, while this counts
    the coarse 'ADP' tag -- confirm the approximation is intended.
    """
    adpositions = [token for token in nlp(t) if token.pos_ == 'ADP']
    return len(adpositions)

In [140]:
# Example: ADP count for the 'targetTitle' of the first training record.
in_num(train_X[0]['targetTitle'])

0

**9. POS 2-gram NNP VBZ**

In [144]:
def pos2_nnp_vbz(title):
    """Count adjacent token pairs in `title` tagged ('PROPN', 'VERB').

    Tags are spaCy coarse POS tags (`pos_`).
    NOTE(review): the name refers to the Penn tag VBZ, but any token tagged
    'VERB' matches here -- confirm the approximation is intended.
    """
    tags = [token.pos_ for token in nlp(title)]
    wanted = ('PROPN', 'VERB')
    # Pair every tag with its successor and count the matching pairs.
    return sum(1 for pair in zip(tags, tags[1:]) if pair == wanted)

In [147]:
# Example: PROPN-VERB bigram count for the first record's 'targetTitle'.
pos2_nnp_vbz(train_X[0]['targetTitle'])

1

**10. POS 2-gram IN NNP**

In [150]:
def pos2_in_nnp(title):
    """Count adjacent token pairs in `title` tagged ('ADP', 'PROPN').

    Tags are spaCy coarse POS tags (`pos_`).
    """
    tags = [token.pos_ for token in nlp(title)]
    matches = 0
    # Walk every position that has a successor and test the pair it starts.
    for index in range(len(tags) - 1):
        if tags[index] == 'ADP' and tags[index + 1] == 'PROPN':
            matches += 1
    return matches

In [154]:
# Example: ADP-PROPN bigram count for the first record's 'targetTitle'.
pos2_in_nnp(train_X[0]['targetTitle'])

0

**11. Length of the Longest Word in Post Text**

In [158]:
def max_wordlen(text):
    """Return the length of the longest token over an iterable of strings.

    Each string in `text` is tokenized with spaCy and `len()` is taken of
    every resulting token; the result is the largest of those lengths.

    Returns 0 when no tokens are produced (e.g. an empty list or only empty
    strings). Previously that case raised ValueError, because `np.max`
    cannot reduce an empty sequence.
    """
    token_lengths = [len(token) for passage in text for token in nlp(passage)]
    if not token_lengths:
        return 0
    return np.max(token_lengths)

In [164]:
# Example: longest token length in the first record's 'postText' field.
max_wordlen(train_X[0]['postText'])

8

**12. Number of WRB**

In [165]:
def wrb_num(t):
    """Count the tokens of `t` whose spaCy coarse POS tag (`pos_`) is 'ADV'.

    NOTE(review): the name refers to the Penn tag WRB (wh-adverb), but every
    token tagged 'ADV' is counted here -- confirm the approximation is intended.
    """
    adverb_count = 0
    for token in nlp(t):
        if token.pos_ == 'ADV':
            adverb_count += 1
    return adverb_count

In [167]:
# Example: ADV count for the 'targetTitle' of the first training record.
wrb_num(train_X[0]['targetTitle'])

2

**13. Count POS Pattern WRB** (Potential redundancy with 12)

**14. Number of NN**

In [168]:
nnp_num = lambda t: sum(map(lambda x: x.pos_ == 'NOUN', nlp(t)))

In [169]:
nnp_num(train_X[0]['targetTitle'])

7

**15. Count POS Pattern NN** (Potential redundancy with 14)

**16. Whether post text starts with 5W1H**

In [183]:
def wh_start(t):
    """Return True if the first spaCy token of `t` is a 5W1H word.

    The lowercased first token is compared against 'who', 'what', 'why',
    'where', 'when' and 'how'. Returns False when `t` yields no tokens
    (e.g. an empty string); previously that case raised IndexError.
    """
    doc = nlp(t)
    if len(doc) == 0:
        return False
    return doc[0].lower_ in ['who', 'what', 'why', 'where', 'when', 'how']

In [184]:
# Example: does the first record's 'targetTitle' start with a 5W1H word?
wh_start(train_X[0]['targetTitle'])

False

**17. Whether a Question Mark (QM) Exists**

In [226]:
def qm_exist(t):
    """Return True if any spaCy token of `t` is exactly the string '?'."""
    for token in nlp(t):
        if str(token) == '?':
            return True
    return False

In [228]:
# Example: does the first record's 'targetTitle' contain a '?' token?
qm_exist(train_X[0]['targetTitle'])

False

**18. Similarity Between Post and Target Title**

We use a cosine-similarity approach built on spaCy's pretrained word vectors rather than the similarity measure reported in the original paper.

In [3]:
def avg_sim(title, body):
    """Return the mean spaCy similarity between `title` and each entry of `body`.

    `title` is a string and `body` an iterable of strings. Each is parsed
    with spaCy and `Doc.similarity` is computed between the title and every
    body entry.

    The scores are collected into a list before averaging: `np.mean` cannot
    reduce the lazy `map` object that Python 3 returns. Returns 0.0 when
    `body` is empty instead of `nan` plus a RuntimeWarning.
    """
    title_doc = nlp(title)
    scores = [title_doc.similarity(nlp(paragraph)) for paragraph in body]
    if not scores:
        return 0.0
    return np.mean(scores)

In [75]:
# Example: mean similarity between the first record's 'targetTitle' and each
# entry of its 'targetParagraphs'.
avg_sim(train_X[0]['targetTitle'], train_X[0]['targetParagraphs'])

0.81674047558606366