# Baseline Model

In [23]:
from __future__ import print_function
import sys
import string
import numpy as np
from nltk import word_tokenize
from nltk.tag import pos_tag_sents
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn import metrics as met

sys.path.append('..')
from text2num import text2num
from util.load_data import JSONData
from util.scores import ScoreReport

import warnings
warnings.filterwarnings("ignore")

**Constants & Parameters**

In [3]:
''' Constants and Parameters '''
DATA_ROOT = '../../Data/dataset/'       # Root Folder of where Dataset Resides
MODEL_ROOT = '../../Models/dataset/'    # Root Folder of where Model Resides
K_FOLD = 10
SHUFFLE_FOLDS = True
np.random.seed(9892)                    # Seed Parameter for PRNG

**Import and Load Data**

In [4]:
''' Import Data '''
# Load Dataset
data_load = JSONData(DATA_ROOT+'instances_train.jsonl', DATA_ROOT+'truth_train.jsonl', DATA_ROOT+'instances_test.jsonl')
train_X = data_load.load_train_X()
train_Y = data_load.load_train_Y()

### Data Preprocessing & Feature Engineering

In [5]:
def is_numeric(text):
    try: return type(text2num(text)) == type(0)
    except Exception as e: return False

def preprocess(text):
    text = word_tokenize(text.lower())                                      # Tokenize & Normalize Text
    text = filter(lambda x: x not in string.punctuation, text)              # Remove Punctuation

    # Perform Feature Extraction
    word_count = len(text)                                                  # Total Word Count
    avg_word_len = sum(map(lambda x: len(x), text))/float(len(text))        # Average Word Length
    max_word_len = max(map(lambda x: len(x), text))                         # Longest Word Length
    is_number = text[0].isdigit() or is_numeric(text[0])                    # Check if starts with number (Either Numerically or Linguistically)
    start_ws = text[0] in ['who', 'what', 'why', 'where', 'when', 'how']    # Whether it starts with question word

    return [word_count, avg_word_len, max_word_len, int(is_number), int(start_ws)]

In [6]:
# Finalize Feature and Target Vectors
X = np.array(map(lambda x: preprocess(x['targetTitle']), train_X))
Y = np.array(map(lambda x: [0] if x['truthClass'] == 'no-clickbait' else [1], train_Y))
Y_ = np.array(map(lambda x: 0 if x['truthClass'] == 'no-clickbait' else 1, train_Y))

### Train Model

In [21]:
# Alternative Score Reporting Function
def report_score(y_true, y_pred, y_prob):
    print("Mean squared error: " + str(met.mean_squared_error(y_true, y_pred)))
    print("R^2: " + str(met.r2_score(y_true, y_pred)))
    print()

In [22]:
# K-Fold and Score Tracking
kf = StratifiedKFold(n_splits=K_FOLD, shuffle=SHUFFLE_FOLDS)

print('Training Model...')
for i, (train_idx, test_idx) in enumerate(kf.split(X, Y_)):
    print('\n[K = ' + str(i+1) + ']')
    ''' SMOTE - Generate Synthetic Data '''
    # sm = SMOTE(kind='regular')
    # X_resampled = []
    # X_res, Y_res = sm.fit_sample(X[train_idx], Y[train_idx])

    # Train Model
    gnb = GaussianNB()
    gnb.fit(X[train_idx], Y[train_idx])

    # Generate Predictions & Confidence Estimates
    y_pred = gnb.predict(X[test_idx])
    y_prob = gnb.predict_proba(X[test_idx])
    y_prob = map(lambda x: x[1][x[0]], zip(y_pred, y_prob))
    
    # Append to Report
    # report.append_result(Y[test_idx].reshape(y_pred.shape), y_pred, y_prob)
    
    # Alternative Score Reporting
    report_score(Y[test_idx].reshape(y_pred.shape), y_pred, y_prob)

Training Model...

[K = 1]
Mean squared error: 0.274587833997
R^2: -0.455138218066


[K = 2]
Mean squared error: 0.281978396816
R^2: -0.494303428904


[K = 3]
Mean squared error: 0.295054007959
R^2: -0.563595725003


[K = 4]
Mean squared error: 0.294084186576
R^2: -0.560198782927


[K = 5]
Mean squared error: 0.287827076223
R^2: -0.527003064141


[K = 6]
Mean squared error: 0.289533560865
R^2: -0.536056441992


[K = 7]
Mean squared error: 0.301478953356
R^2: -0.599430086946


[K = 8]
Mean squared error: 0.270193401593
R^2: -0.433451493018


[K = 9]
Mean squared error: 0.269208878771
R^2: -0.427689648893


[K = 10]
Mean squared error: 0.293682413204
R^2: -0.557479616974

