# Baseline Model

In [1]:
from __future__ import print_function
import sys
import string
import itertools
import numpy as np
from multiprocessing import Pool
from nltk import word_tokenize, pos_tag
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from textstat.textstat import textstat as ts
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn import metrics as met

sys.path.append('..')
from text2num import text2num
from util.load_data import JSONData
from util.scores import ScoreReport

import warnings
warnings.filterwarnings("ignore")

reload(sys)
sys.setdefaultencoding('utf8')

**Constants & Parameters**

In [2]:
''' Constants and Parameters '''
DATA_ROOT = '../../Data/dataset/'       # Root Folder of where Dataset Resides
MODEL_ROOT = '../../Models/dataset/'    # Root Folder of where Model Resides
K_FOLD = 10
SHUFFLE_FOLDS = True
np.random.seed(9892)                    # Seed Parameter for PRNG

**Import and Load Data**

In [3]:
''' Import Data '''
# Load Dataset
data_load = JSONData(DATA_ROOT+'instances_train.jsonl', DATA_ROOT+'truth_train.jsonl', DATA_ROOT+'instances_test.jsonl')
train_X = data_load.load_train_X()
train_Y = data_load.load_train_Y()

### Data Preprocessing & Feature Engineering

In [6]:
def is_numeric(text):
    try: return type(text2num(text)) == type(0)
    except Exception as e: return False

def pos_2gram(title, p1, p2):
    pos_list = pos_tag(word_tokenize(title))
    return sum(map(lambda x: x[0][1] == p1 and x[1][1] == p2, zip(pos_list[:-1], pos_list[1:])))

def mean_wordlen(text):
    word_lens = map(lambda x: map(lambda y: len(y), word_tokenize(x)), text)
    return np.mean(list(itertools.chain.from_iterable(word_lens)))

def max_wordlen(text):
    word_lens = map(lambda x: map(lambda y: len(y), word_tokenize(x)), text)
    return np.max(list(itertools.chain.from_iterable(word_lens)))

nnp_num = lambda x: sum(map(lambda y: y[1] == 'NNP', pos_tag(word_tokenize(x))))
wlen_title = lambda text: len(filter(lambda x: x.isalpha(), word_tokenize(text)))
num_start = lambda x: x[0].isdigit() or is_numeric(x[0])
in_num = lambda t: sum(map(lambda x: x[1] == 'IN', pos_tag(word_tokenize(t))))
wrb_num = lambda t: sum(map(lambda x: x[1] == 'WRB', pos_tag(word_tokenize(t))))
nnp_num = lambda t: sum(map(lambda x: x[1] == 'NN', pos_tag(word_tokenize(t))))
wh_start = lambda t: word_tokenize(t)[0].lower() in ['who', 'what', 'why', 'where', 'when', 'how']
qm_exist = lambda t: sum(map(lambda x: str(x) == '?', word_tokenize(t))) > 0

def preprocess(x):
    print('PROCESSING ID: ' + str(x['id']))
    fvec = []
    fvec.append(nnp_num(x['targetTitle']))
    if len(x['targetParagraphs']) > 0:
        fvec.append(ts.automated_readability_index(' '.join(x['targetParagraphs'])))
        fvec.append(ts.avg_letter_per_word(' '.join(x['targetParagraphs'])))
        fvec.append(ts.avg_sentence_length(' '.join(x['targetParagraphs'])))
        fvec.append(ts.avg_sentence_per_word(' '.join(x['targetParagraphs'])))
        fvec.append(ts.avg_syllables_per_word(' '.join(x['targetParagraphs'])))
        fvec.append(ts.char_count(' '.join(x['targetParagraphs'])))
        fvec.append(ts.coleman_liau_index(' '.join(x['targetParagraphs'])))
        fvec.append(ts.dale_chall_readability_score(' '.join(x['targetParagraphs'])))
        fvec.append(ts.difficult_words(' '.join(x['targetParagraphs'])))
        fvec.append(ts.flesch_kincaid_grade(' '.join(x['targetParagraphs'])))
        fvec.append(ts.flesch_reading_ease(' '.join(x['targetParagraphs'])))
        fvec.append(ts.gunning_fog(' '.join(x['targetParagraphs'])))
        fvec.append(ts.lexicon_count(' '.join(x['targetParagraphs'])))
        fvec.append(ts.linsear_write_formula(' '.join(x['targetParagraphs'])))
        fvec.append(ts.polysyllabcount(' '.join(x['targetParagraphs'])))
        fvec.append(ts.sentence_count(' '.join(x['targetParagraphs'])))
        fvec.append(ts.smog_index(' '.join(x['targetParagraphs'])))
        fvec.append(ts.syllable_count(' '.join(x['targetParagraphs'])))
        fvec.append(mean_wordlen(x['targetParagraphs']))
    else:
        fvec += [0]*19
    if len(x['postText']) > 0:
        fvec.append(max_wordlen(x['postText']))
    else:
        fvec += [0]*1
    fvec.append(len(word_tokenize(x['targetTitle'])))
    fvec.append(wlen_title(x['targetTitle']))
    fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'NNP'))
    fvec.append(int(num_start(x['targetTitle'])))
    fvec.append(in_num(x['targetTitle']))
    fvec.append(pos_2gram(x['targetTitle'], 'NNP', 'VBZ'))
    fvec.append(pos_2gram(x['targetTitle'], 'IN', 'NNP'))
    fvec.append(wrb_num(x['targetTitle']))
    fvec.append(nnp_num(x['targetTitle']))
    fvec.append(int(wh_start(x['targetTitle'])))
    fvec.append(int(qm_exist(x['targetTitle'])))
    return fvec

In [5]:
# Finalize Feature and Target Vectors
p = Pool(32)
X = np.array(p.map(preprocess, train_X))
Y = np.array(map(lambda x: [0] if x['truthClass'] == 'no-clickbait' else [1], train_Y))
Y_ = np.array(map(lambda x: 0 if x['truthClass'] == 'no-clickbait' else 1, train_Y))
p.close()

ValueError: zero-size array to reduction operation maximum which has no identity

In [8]:
p.close()

### Train Model

In [21]:
# Alternative Score Reporting Function
def report_score(y_true, y_pred, y_prob):
    print("Mean squared error: " + str(met.mean_squared_error(y_true, y_pred)))
    print("R^2: " + str(met.r2_score(y_true, y_pred)))
    print()

In [21]:
# K-Fold and Score Tracking
kf = StratifiedKFold(n_splits=K_FOLD, shuffle=SHUFFLE_FOLDS)

print('Training Model...')
for i, (train_idx, test_idx) in enumerate(kf.split(X, Y_)):
    print('\n[K = ' + str(i+1) + ']')
    ''' SMOTE - Generate Synthetic Data '''
    # sm = SMOTE(kind='regular')
    # X_resampled = []
    # X_res, Y_res = sm.fit_sample(X[train_idx], Y[train_idx])

    # Train Model
    gnb = GaussianNB()
    gnb.fit(X[train_idx], Y[train_idx])

    # Generate Predictions & Confidence Estimates
    y_pred = gnb.predict(X[test_idx])
    y_prob = gnb.predict_proba(X[test_idx])
    y_prob = map(lambda x: x[1][x[0]], zip(y_pred, y_prob))
    
    # Append to Report
    # report.append_result(Y[test_idx].reshape(y_pred.shape), y_pred, y_prob)
    
    # Alternative Score Reporting
    report_score(Y[test_idx].reshape(y_pred.shape), y_pred, y_prob)

Training Model...

[K = 1]


ValueError: setting an array element with a sequence.