In [213]:
from __future__ import print_function
import json
import string
import gensim
import itertools
import numpy as np
from nltk.util import ngrams
from text2num import text2num
from collections import Counter
from nltk.corpus import stopwords
from textstat.textstat import textstat
from nltk import word_tokenize, pos_tag

# Feature Engineering
The following notebook will focus on generating as much hand-crafted features as possible through various methods and techniques. This is simply a workbech we will be using to define each of the function. In the end we hope to parallelize this process to increase the processing speed of this preprocessing phase.

We will follow through this also by performing an analysis on a corresponding feature importance analysis to evaluate whether to utilize them or not.

In [2]:
# Application Directory Constants
DATA_DIR = '../Data/dataset/'
MODEL_DIR = '../Models/'

# Load Dataset
instance_raw = open(DATA_DIR+'instances_train.jsonl', 'rb').read().replace('\"', '"').split('\n')[:-1]
train_X = map(json.loads, instance_raw)

truth_raw = open(DATA_DIR+'truth_train.jsonl', 'rb').read().replace('\"', '"').split('\n')[:-1]
train_Y = map(json.loads, truth_raw)

**1. Number of NNP**

In [20]:
nnp_num = lambda x: sum(map(lambda y: y[1] == 'NNP', pos_tag(word_tokenize(x))))

In [21]:
print(nnp_num(train_X[0]['targetTitle']))

1


**2. Readability of Target Paragraphs**

TextStat package provides us with a huge bulk of useful metrics we can try.

Source: https://github.com/shivam5992/textstat

In [50]:
# Other Features to Try Out
dir(textstat)

['__doc__',
 '__init__',
 '__module__',
 'automated_readability_index',
 'avg_letter_per_word',
 'avg_sentence_length',
 'avg_sentence_per_word',
 'avg_syllables_per_word',
 'char_count',
 'coleman_liau_index',
 'dale_chall_readability_score',
 'difficult_words',
 'flesch_kincaid_grade',
 'flesch_reading_ease',
 'gunning_fog',
 'lexicon_count',
 'linsear_write_formula',
 'polysyllabcount',
 'sentence_count',
 'smog_index',
 'syllable_count',
 'text_standard']

In [51]:
print(textstat.flesch_reading_ease(train_X[0]['targetTitle']))
print(textstat.smog_index(train_X[0]['targetTitle']))
print(textstat.flesch_kincaid_grade(train_X[0]['targetTitle']))
print(textstat.coleman_liau_index(train_X[0]['targetTitle']))
print(textstat.automated_readability_index(train_X[0]['targetTitle']))
print(textstat.dale_chall_readability_score(train_X[0]['targetTitle']))
print(textstat.difficult_words(train_X[0]['targetTitle']))
print(textstat.linsear_write_formula(train_X[0]['targetTitle']))
print(textstat.gunning_fog(train_X[0]['targetTitle']))
print(textstat.text_standard(train_X[0]['targetTitle'])) # TODO: Covert this to numerical rep.

54.22
0
9.9
12.12
11.2
10.98
7
8.5
8.8
8th and 9th grade


**3. Number of Tokens**

In [52]:
len(word_tokenize(train_X[0]['targetTitle']))

20

**4. Word Length of Post Text**

(Proposed Alternative: Word Length of Target Title)

In [69]:
wlen_title = lambda text: len(filter(lambda x: x.isalpha(), word_tokenize(text)))

In [70]:
wlen_title(train_X[0]['targetTitle'])

15

**5. POS 2-gram NNP NNP**

In [50]:
# General POS_2GRAM POS Identifier Function
def pos_2gram(title, p1, p2):
    pos_list = pos_tag(word_tokenize(title))
    return sum(map(lambda x: x[0][1] == p1 and x[1][1] == p2, zip(pos_list[:-1], pos_list[1:])))

In [97]:
pos_2gram(train_X[2]['targetTitle'], 'NNP', 'NNP')

2

**6. Whether Post Starts With Number**

Here we use either numerical numbers or text based numbers to account for those possibilities.

We use a pre-written library to help convert text to numbers and check if they are in fact numbers.

In [73]:
def is_numeric(text):
    try: return type(text2num(text)) == type(0)
    except Exception as e: return False
    
num_start = lambda x: x[0].isdigit() or is_numeric(x[0])

In [84]:
int(num_start(train_X[2]['targetTitle']))

0

**7. Average Length of Words in Post**

In [75]:
def mean_wordlen(text):
    word_lens = map(lambda x: map(lambda y: len(y), word_tokenize(x)), text)
    return np.mean(list(itertools.chain.from_iterable(word_lens)))

In [76]:
mean_wordlen(train_X[0]['targetParagraphs'])

4.2788339670468947

**8. Number of IN**

In [82]:
in_num = lambda t: sum(map(lambda x: x[1] == 'IN', pos_tag(word_tokenize(t))))

In [83]:
in_num(train_X[2]['targetTitle'])

1

**9. POS 2-gram NNP VBZ**

In [99]:
pos_2gram(train_X[0]['targetTitle'], 'NNP', 'VBZ')

1

**10. POS 2-gram IN NNP**

In [100]:
pos_2gram(train_X[2]['targetTitle'], 'IN', 'NNP')

1

**11. Length of the Longest Word in Post Text**

In [102]:
def max_wordlen(text):
    word_lens = map(lambda x: map(lambda y: len(y), word_tokenize(x)), text)
    return np.max(list(itertools.chain.from_iterable(word_lens)))

In [103]:
max_wordlen(train_X[0]['postText'])

8

**12. Number of WRB**

In [120]:
wrb_num = lambda t: sum(map(lambda x: x[1] == 'WRB', pos_tag(word_tokenize(t))))

In [127]:
wrb_num(train_X[0]['targetTitle'])

0

**13. Count POS Pattern WRB** (Potential redundancy with 12)

**14. Number of NN**

In [128]:
nnp_num = lambda t: sum(map(lambda x: x[1] == 'NN', pos_tag(word_tokenize(t))))

In [129]:
nnp_num(train_X[0]['targetTitle'])

5

**15. Count POS Pattern NN** (Potential redundancy with 14)

**16. Whether post text start with 5W1H**

In [134]:
wh_start = lambda t: word_tokenize(t)[0].lower() in ['who', 'what', 'why', 'where', 'when', 'how']

In [135]:
wh_start(train_X[0]['targetTitle'])

False

**17. Whehter exist QM**

In [136]:
qm_exist = lambda t: sum(map(lambda x: str(x) == '?', word_tokenize(t))) > 0

In [137]:
qm_exist(train_X[0]['targetTitle'])

False

**18. Similarity Between Post and Target Title**

We utilized a cosine-similarity based approach based on pretrained word vectors from spaCy rather than the one reported by the original paper.

In [None]:
# dir(gensim.models.doc2vec.Doc2Vec)
model = gensim.models.KeyedVectors.load_word2vec_format(MODEL_DIR+'GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
def avg_sim(title, body):
    title = nlp(title)
    return np.mean(map(lambda x: title.similarity(nlp(x)), body))

In [75]:
avg_sim(train_X[0]['targetTitle'], train_X[0]['targetParagraphs'])

0.81674047558606366

**19. Count POS pattern this/these NN**

In [12]:
def pos_thnn(title):
    pos_list = pos_tag(word_tokenize(title))
    return sum(map(lambda x: (x[0][0].lower() == 'this' or x[0][0].lower() == 'these') and x[1][1] == 'NN', zip(pos_list[:-1], pos_list[1:])))

In [13]:
pos_thnn('You wont believe this cat did')

1

**20. Count POS pattern PRP (???)**

**21. Number of PRP**

In [20]:
prp_count = lambda t: sum(map(lambda x: x[1] == 'PRP', pos_tag(word_tokenize(t))))

In [22]:
prp_count(train_X[0]['targetTitle'])

0

**22. Number of VBZ**

In [23]:
vbz_count = lambda t: sum(map(lambda x: x[1] == 'VBZ', pos_tag(word_tokenize(t))))

In [24]:
vbz_count(train_X[0]['targetTitle'])

1

**23. POS 3-Gram NNP NNP VBZ**

In [45]:
# General 3-Gram POS Structure
# General POS_2GRAM POS Identifier Function
def pos_3gram(title, p1, p2, p3):
    pos_list = pos_tag(word_tokenize(title))
    return sum(map(lambda x: x[0][0][1] == 'NNP' and x[0][1][1] == 'NNP' and x[1][1] == 'VBZ', zip(zip(pos_list[:-1], pos_list[1:]), pos_list[2:])))

In [48]:
pos_3gram(train_X[1]['targetTitle'], 'NNP', 'NNP', 'VBZ')

0

**24. POS 2-gram NN IN**

In [51]:
pos_2gram(train_X[2]['targetTitle'], 'NN', 'IN')

0

**25. POS 3-gram NN IN NNP**

In [53]:
pos_3gram(train_X[2]['targetTitle'], 'NN', 'IN', 'NNP')

0

**26. Ratio of stop words in post text**

In [59]:
sw_ratio = lambda t: sum(map(lambda x: x.lower() in stopwords.words('english'), word_tokenize(t))) / float(len(word_tokenize(t)))

In [61]:
sw_ratio(' '.join(train_X[2]['postText']))

0.13333333333333333

**27. POS 2-Gram NNP .**

In [63]:
pos_2gram(train_X[2]['targetTitle'], 'NNP', '.')

0

**28. POS 2-Gram PRP VBP**

In [64]:
pos_2gram(train_X[2]['targetTitle'], 'PRP', 'VBP')

0

**29. Count POS pattern WP** ??

**30. Number of WP**

In [65]:
wp_count = lambda t: sum(map(lambda x: x[1] == 'WP', pos_tag(word_tokenize(t))))

In [66]:
wp_count(train_X[0]['targetTitle'])

0

**31. Count POS pattern DT**

**32. Number of DT**

In [67]:
dt_count = lambda t: sum(map(lambda x: x[1] == 'DT', pos_tag(word_tokenize(t))))

In [68]:
dt_count(train_X[0]['targetTitle'])

1

**33. POS 2-Gram NNP IN**

In [70]:
pos_2gram(train_X[0]['targetTitle'], 'NNP', 'IN')

0

**34. POS 3-gram IN NNP NNP**

In [71]:
pos_3gram(train_X[0]['targetTitle'], 'IN', 'NNP', 'NNP')

0

**35. Number of POS**

In [73]:
pos_count = lambda t: sum(map(lambda x: x[1] == 'POS', pos_tag(word_tokenize(t))))

In [74]:
pos_count(train_X[0]['targetTitle'])

1

**36. POS 2-Gram IN NN**

In [75]:
pos_2gram(train_X[0]['targetTitle'], 'IN', 'NN')

0

**37. Match Between Keyword and Post**

In [100]:
def kw_post_match(kw, post):
    return len((set(word_tokenize(kw.lower())) - set(word_tokenize(' '.join(post).lower()))) - set(stopwords.words('english')) - set(list(string.punctuation)))

In [101]:
kw_post_match(train_X[0]['targetKeywords'], train_X[0]['postText'])

5

**38. Number of ,**

In [106]:
comma_count = lambda t: len(filter(lambda x: x == ',', word_tokenize(t)))

In [109]:
comma_count(train_X[2]['targetTitle'])

0

**39. POS 2-Gram NNP NNS**

In [111]:
pos_2gram(train_X[2]['targetTitle'], 'NNP', 'NNS')

0

**40. POS 2-Gram IN JJ**

In [112]:
pos_2gram(train_X[0]['targetTitle'], 'IN', 'JJ')

0

**41. POS 2-Gram NNP POS**

In [113]:
pos_2gram(train_X[0]['targetTitle'], 'NNP', 'POS')

0

**42. Number of WDT**

In [115]:
wdt_count = lambda t: sum(map(lambda x: x[1] == 'WDT', pos_tag(word_tokenize(t))))

In [116]:
wdt_count(train_X[0]['targetTitle'])

0

**43. Count POS pattern WDT** ??

**44. POS 2-gram NN NN**

In [120]:
pos_2gram(train_X[2]['targetTitle'], 'NN', 'NN')

0

**45. POS 2-gram NN NNP**

In [119]:
pos_2gram(train_X[2]['targetTitle'], 'NN', 'NNP')

0

**46. POS 2-gram NNP VBD**

In [121]:
pos_2gram(train_X[0]['targetTitle'], 'NNP', 'VBD')

0

**47. Similarity Between Post and Target Paragraphs**

**48. Count POS pattern RB** ??

**49. Number of RB**

In [129]:
rb_count = lambda t: sum(map(lambda x: x[1] == 'RB', pos_tag(word_tokenize(t))))

In [130]:
rb_count(train_X[2]['targetTitle'])

0

**50. POS 3-gram NNP NNP NNP**

In [138]:
pos_3gram(train_X[2]['targetTitle'], 'NNP', 'NNP', 'NNP')

0

**51. POS 3-gram NNP NNP NN**

In [139]:
pos_3gram(train_X[2]['targetTitle'], 'NNP', 'NNP', 'NN')

0

**51. Number of RBS**

In [134]:
rbs_count = lambda t: sum(map(lambda x: x[1] == 'RBS', pos_tag(word_tokenize(t))))

In [135]:
rbs_count(train_X[2]['targetTitle'])

0

**52. Number of VBN**

In [140]:
vbn_count = lambda t: sum(map(lambda x: x[1] == 'VBN', pos_tag(word_tokenize(t))))

In [141]:
vbn_count(train_X[2]['targetTitle'])

0

**53. POS 2-gram VBN IN**

In [142]:
pos_2gram(train_X[0]['targetTitle'], 'VBN', 'IN')

0

**54. Whether exists NUMBER NP VB** ??

**55. POS 2-Gram JJ NNP**

In [153]:
pos_2gram(train_X[0]['targetTitle'], 'JJ', 'NNP')

0

**58. POS 3-Gram NNP NN NN**

In [155]:
pos_3gram(train_X[0]['targetTitle'], 'NNP', 'NN', 'NN')

0

**59. POS 2-Gram DT NN**

In [156]:
pos_2gram(train_X[0]['targetTitle'], 'DT', 'NN')

1

**60. Whether exists EX**

In [158]:
ex_exist = lambda t: int(sum(map(lambda x: x[1] == 'EX', pos_tag(word_tokenize(t)))) > 0)

In [159]:
ex_exist(train_X[0]['targetTitle'])

0

**61. RB Ratio in Target Paragraphs**

In [181]:
def pos_text_ratio(text, pos):
    return len(filter(lambda x: x[1] == pos, pos_tag(word_tokenize(' '.join(text))))) / float(len(word_tokenize(' '.join(text))))

In [182]:
pos_text_ratio(train_X[0]['targetParagraphs'], 'RB')

0.04055766793409379

**62. JJ Ratio in Target Paragraphs**

In [183]:
pos_text_ratio(train_X[0]['targetParagraphs'], 'JJ')

0.062103929024081114

**63. VBP Ratio in Target Paragraphs**

In [184]:
pos_text_ratio(train_X[0]['targetParagraphs'], 'VBP')

0.010139416983523447

**63. VBD Ratio in Target Paragraphs**

In [185]:
pos_text_ratio(train_X[0]['targetParagraphs'], 'VBD')

0.020278833967046894

**Ratio of Various POS in Target Paragraphs**

In [210]:
pos_list = ['CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB']
ratio = lambda t: map(lambda x: pos_text_ratio(t, x), pos_list)

In [212]:
len(ratio(train_X[0]['targetParagraphs']))

36

**Post Text n-gram Features**

In [216]:
def ngram_ptext(text, n):
    return sum(Counter(ngrams(word_tokenize(' '.join(text)), n)).values())

In [217]:
ngram_feat = lambda x: [ngram_ptext(x, i) for i in range(6)]

In [221]:
ngram_feat(train_X[0]['targetParagraphs'])

[789, 789, 788, 787, 786, 785]