# Feature extraction

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import feature_extraction

## Load dataset

In [3]:
# Read the clickbait17 training instances (JSON-lines file) and index them by "id".
df = pd.read_json(
    "../data/clickbait17-train-170331/instances.jsonl",
    lines=True,
    encoding='utf8',
).set_index("id")
# Keep an untouched snapshot of the instances for later reference.
df_raw = df.copy()

# Read the matching truth annotations, indexed by "id" as well.
df_truth = pd.read_json(
    "../data/clickbait17-train-170331/truth.jsonl",
    lines=True,
    encoding='utf8',
).set_index("id")
# Untouched snapshot of the truth annotations.
df_truth_raw = df_truth.copy()

In [4]:
# Attach the truthClass label to every instance; both frames are indexed by "id",
# so the join aligns rows on that index.
# The join starts from the untouched df_raw snapshot rather than from df itself so
# this cell can be re-run safely: joining df a second time would raise a
# column-overlap error because truthClass would already be present.
# Other truth columns, per the original note: truthJudgments, truthMean,
# truthMedian, truthMode.
df = df_raw.join(df_truth[["truthClass"]])

In [5]:
# Show the first two joined rows as a quick sanity check
# (evaluate len(df) instead to see the total row count).
df.head(n=2)

Unnamed: 0_level_0,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,truthClass
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
608310377143799808,[],[Apple's iOS 9 'App thinning' feature will giv...,Tue Jun 09 16:31:10 +0000 2015,['App thinning' will be supported on Apple's i...,'App thinning' will be supported on Apple's iO...,"Apple,gives,gigabytes,iOS,9,app,thinning,featu...",[Paying for a 64GB phone only to discover that...,Apple gives back gigabytes: iOS 9 'app thinnin...,no-clickbait
609297109095972864,[media/609297109095972864.jpg],[RT @kenbrown12: Emerging market investors are...,Fri Jun 12 09:52:05 +0000 2015,"[Stocks Fall as Investors Watch Central Banks,...",Global investors have yanked $9.3 billion from...,"emerging market,emerging markets,em flows,em i...","[Emerging markets are out of favor., Global in...",Emerging Markets Suffer Largest Outflow in Sev...,no-clickbait


## Extract features

In [6]:
# Build the project's feature extractor for the clickbait17 training directory.
# Forward slashes keep the path valid on both Windows and POSIX and match the
# separators used by the pd.read_json calls for the same directory.
# NOTE(review): the meaning of the second positional argument (None) is defined in
# feature_extraction.FeatureExtractor and is not visible here - confirm.
fe = feature_extraction.FeatureExtractor("../data/clickbait17-train-170331", None)
# Hand the joined instances + truthClass frame to the extractor.
fe.set_df(df)

In [7]:
# Run the extractor on the frame set via fe.set_df; it returns two values,
# unpacked here as (labels, features).
labels, features = fe.extract_features()

In [15]:
# Plain-text preview of the first two feature rows followed by the first two labels.
print(features[:2], labels[:2], sep="\n")

                    numChars_PostTitle  ratioChars_PostImagePostTitle  \
id                                                                      
608310377143799808                63.0                      -1.000000   
609297109095972864                85.0                       2.352941   

                    diffChars_PostTitleArticleKeywords  \
id                                                       
608310377143799808                                15.0   
609297109095972864                                35.0   

                    diffChars_PostTitlePostImage  \
id                                                 
608310377143799808                          -1.0   
609297109095972864                         115.0   

                    ratioWords_PostImagePostTitle  numWords_PostTitle  \
id                                                                      
608310377143799808                           -1.0                13.0   
609297109095972864                            

## Save the features and labels

In [14]:
import pickle

# Persist the extracted feature table as a pandas pickle.
features.to_pickle("../features/1_paper_features.pkl")

# Persist the labels via the object's own dump(); the context manager guarantees
# the file handle is flushed and closed even if dump() raises.
with open('../features/1_paper_labels.pkl', 'wb') as labels_file:
    labels.dump(labels_file)

In [19]:
# Reload the saved artifacts and print the first two entries of each so they can
# be compared with the preview printed before saving.
import numpy

f_load = pd.read_pickle("../features/1_paper_features.pkl")
# The labels file was written with dump(), i.e. it is a pickle rather than an .npy
# file, so numpy.load must be told to accept pickled data (recent NumPy versions
# refuse by default). Only load pickles from files you created yourself:
# unpickling untrusted data can execute arbitrary code.
l_load = numpy.load("../features/1_paper_labels.pkl", allow_pickle=True)

print(f_load[:2])
print(l_load[:2])

                    numChars_PostTitle  ratioChars_PostImagePostTitle  \
id                                                                      
608310377143799808                63.0                      -1.000000   
609297109095972864                85.0                       2.352941   

                    diffChars_PostTitleArticleKeywords  \
id                                                       
608310377143799808                                15.0   
609297109095972864                                35.0   

                    diffChars_PostTitlePostImage  \
id                                                 
608310377143799808                          -1.0   
609297109095972864                         115.0   

                    ratioWords_PostImagePostTitle  numWords_PostTitle  \
id                                                                      
608310377143799808                           -1.0                13.0   
609297109095972864                            