# Preprocess Dataset

Run OCR on the post media and apply light text preprocessing up front, then save the processed instances for later steps.

In [57]:
import pandas as pd
import numpy as np
import feature_extraction
import time
from os import path, mkdir

from nltk import download, word_tokenize, pos_tag, WordNetLemmatizer, ngrams
from nltk.data import find
from nltk.corpus import wordnet as wn, stopwords as sw

In [2]:
# Directories are given relative to the notebook's working directory.
# feat_path is not used by the cells shown in this notebook section.
data_path = "../data"
feat_path = "../features"

# Path of the Tesseract executable handed to the image helper.
# Python never expands "%VAR%" placeholders on its own, so resolve
# %localappdata% explicitly. On non-Windows platforms expandvars leaves the
# "%...%" form untouched, so the value is then identical to the raw string.
tesseract_path = path.expandvars(r"%localappdata%\Tesseract-OCR\tesseract")

In [3]:
# Name of the dataset directory (joined onto data_path) that this notebook reads from and writes to.
dataset = "clickbait17-train-170331"

In [45]:
# Read the JSON-lines instance file and key the rows by their "id" column.
instances_file = path.join(data_path, dataset, "instances.jsonl")
df = pd.read_json(instances_file, lines=True, encoding='utf8').set_index("id")

In [115]:
def _first_post_text(value):
    """Return the first entry of a postText list.

    Values that are not lists (for example a string left behind by an earlier
    run of this cell) are returned unchanged, which makes the cell safe to
    re-run. An empty list yields "" instead of raising IndexError.
    """
    if isinstance(value, list):
        return value[0] if value else ""
    return value


# Collapse each postText list to its first entry.
df['postText'] = df['postText'].apply(_first_post_text)

In [119]:
# Preview the first two rows of df.
df.head(2)

Unnamed: 0_level_0,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
608310377143799808,,Apple's iOS 9 'App thinning' feature will give...,Tue Jun 09 16:31:10 +0000 2015,['App thinning' will be supported on Apple's i...,'App thinning' will be supported on Apple's iO...,"Apple,gives,gigabytes,iOS,9,app,thinning,featu...",[Paying for a 64GB phone only to discover that...,Apple gives back gigabytes: iOS 9 'app thinnin...
609297109095972864,Rushing Out\n\nEmerging markets have suffered ...,RT @kenbrown12: Emerging market investors are ...,Fri Jun 12 09:52:05 +0000 2015,"[Stocks Fall as Investors Watch Central Banks,...",Global investors have yanked $9.3 billion from...,"emerging market,emerging markets,em flows,em i...","[Emerging markets are out of favor., Global in...",Emerging Markets Suffer Largest Outflow in Sev...


## OCR images

In [52]:
# Image helper constructed with the dataset directory and the Tesseract path.
# NOTE(review): ImageHelper is project code not shown here; presumably it runs
# Tesseract OCR on the media files under that directory — confirm.
ih = feature_extraction.ImageHelper.ImageHelper(path.join(data_path, dataset), tesseract_path)

def get_texts(item):
    """Return ih.get_text(item) for a truthy item; hand falsy items back unchanged."""
    return ih.get_text(item) if item else item

In [53]:
# Replace every postMedia entry with the result of get_texts (falsy entries pass through unchanged).
df['postMedia'] = df['postMedia'].apply(get_texts)

In [103]:
# Normalise falsy postMedia entries (None, empty list, ...) to an empty string.
df['postMedia'] = df['postMedia'].apply(lambda media: media or "")

## Preprocess text for tokenization

In [117]:
# Single-pass character mapping used by preprocess():
#  - typographic quotes become their plain ASCII counterparts
#  - "@" and "#" are dropped (the NLTK tokenizer treats them as separate words)
_PREPROCESS_TABLE = str.maketrans({
    "\u2018": "'",
    "\u2019": "'",
    "\u201c": '"',
    "\u201d": '"',
    "@": None,
    "#": None,
})


def preprocess(sentence):
    """Normalise quotes and strip "@"/"#" from a string or (nested) list of strings.

    Parameters
    ----------
    sentence : str, list, or any other value
        Strings are cleaned; lists are processed element by element
        (recursively); every other value (None, NaN, numbers, ...) is
        returned unchanged so the function can be mapped over whole
        DataFrames containing mixed cell types.

    Returns
    -------
    The cleaned string, a new list of cleaned items, or the untouched input.
    """
    if isinstance(sentence, list):
        return [preprocess(x) for x in sentence]

    # Non-string cells have no text to clean; previously a truthy non-string
    # (e.g. a NaN float) crashed on .replace().
    if not isinstance(sentence, str):
        return sentence

    return sentence.translate(_PREPROCESS_TABLE)

In [123]:
# Run preprocess on every cell of the frame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when the installed pandas has it, otherwise fall back.
_elementwise = df.map if hasattr(df, "map") else df.applymap
df = _elementwise(preprocess)

In [125]:
# Preview the first two rows after preprocessing.
df.head(2)

Unnamed: 0_level_0,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
608310377143799808,,Apple's iOS 9 'App thinning' feature will give...,Tue Jun 09 16:31:10 +0000 2015,['App thinning' will be supported on Apple's i...,'App thinning' will be supported on Apple's iO...,"Apple,gives,gigabytes,iOS,9,app,thinning,featu...",[Paying for a 64GB phone only to discover that...,Apple gives back gigabytes: iOS 9 'app thinnin...
609297109095972864,Rushing Out\n\nEmerging markets have suffered ...,RT kenbrown12: Emerging market investors are d...,Fri Jun 12 09:52:05 +0000 2015,"[Stocks Fall as Investors Watch Central Banks,...",Global investors have yanked $9.3 billion from...,"emerging market,emerging markets,em flows,em i...","[Emerging markets are out of favor., Global in...",Emerging Markets Suffer Largest Outflow in Sev...


## Write to file

In [127]:
# Persist the processed instances as a pickle inside the dataset directory.
processed_file = path.join(data_path, dataset, 'instances_processed.pkl')
df.to_pickle(processed_file)