# Feature extraction

In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the local project modules are picked up live
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import feature_extraction

## Load dataset

In [3]:
# Read the clickbait17 training instances (one JSON object per line) and key them by "id".
df = pd.read_json(
    "../data/clickbait17-train-170331/instances.jsonl",
    lines=True,
    encoding='utf8',
).set_index("id")
# Untouched snapshot of the instances, kept separate from the working frame.
df_raw = df.copy()

# Read the matching truth annotations, keyed by the same "id".
df_truth = pd.read_json(
    "../data/clickbait17-train-170331/truth.jsonl",
    lines=True,
    encoding='utf8',
).set_index("id")
# Untouched snapshot of the truth annotations.
df_truth_raw = df_truth.copy()

In [4]:
# Attach the ground-truth class label to every instance (joined on the "id" index).
# The join starts from the untouched snapshot `df_raw` rather than from `df` itself,
# so this cell is idempotent: joining onto an already-joined `df` would raise a
# ValueError because the truthClass column would overlap.
# Other columns available in df_truth: truthJudgments, truthMean, truthMedian, truthMode
df = df_raw.join(df_truth[["truthClass"]])

In [37]:
# Sanity check: show the first two rows of the joined frame (instance fields plus truthClass).
df.head(2)
# len(df)

Unnamed: 0_level_0,postMedia,postText,postTimestamp,targetCaptions,targetDescription,targetKeywords,targetParagraphs,targetTitle,truthClass
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
608310377143799808,[],[Apple's iOS 9 'App thinning' feature will giv...,Tue Jun 09 16:31:10 +0000 2015,['App thinning' will be supported on Apple's i...,'App thinning' will be supported on Apple's iO...,"Apple,gives,gigabytes,iOS,9,app,thinning,featu...",[Paying for a 64GB phone only to discover that...,Apple gives back gigabytes: iOS 9 'app thinnin...,no-clickbait
609297109095972864,[media/609297109095972864.jpg],[RT @kenbrown12: Emerging market investors are...,Fri Jun 12 09:52:05 +0000 2015,"[Stocks Fall as Investors Watch Central Banks,...",Global investors have yanked $9.3 billion from...,"emerging market,emerging markets,em flows,em i...","[Emerging markets are out of favor., Global in...",Emerging Markets Suffer Largest Outflow in Sev...,no-clickbait


## Extract features

In [6]:
# Create the feature extractor for the clickbait17 training directory.
# The path uses forward slashes (like the read_json paths used when loading the
# dataset) so it resolves on POSIX systems as well as on Windows; the previous
# backslash-separated raw string only worked on Windows.
# NOTE(review): the meaning of the second positional argument (None) is defined
# in feature_extraction.FeatureExtractor — confirm there.
fe = feature_extraction.FeatureExtractor("../data/clickbait17-train-170331", None)
# Hand the already-loaded, truth-joined DataFrame to the extractor.
fe.set_df(df)

In [7]:
# Run the extractor on the DataFrame registered through fe.set_df; the call
# returns a pair that is unpacked as (labels, features), in that order.
labels, features = fe.extract_features()

In [50]:
# Preview the first two feature rows and their labels.
# display() uses the notebook's rich representation, so the wide feature table is
# rendered as one table instead of the line-wrapped plain text that print() produces.
display(features[:2], labels[:2])

                    numChars_PostTitle  ratioChars_PostImagePostTitle  \
id                                                                      
608310377143799808                63.0                      -1.000000   
609297109095972864                85.0                       2.352941   

                    diffChars_PostTitleArticleKeywords  \
id                                                       
608310377143799808                                15.0   
609297109095972864                                35.0   

                    diffChars_PostTitlePostImage  \
id                                                 
608310377143799808                          -1.0   
609297109095972864                         115.0   

                    ratioWords_PostImagePostTitle  numWords_PostTitle  \
id                                                                      
608310377143799808                           -1.0                13.0   
609297109095972864                            

## Classification

In [11]:
import classification
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the random forest (and therefore the reported metrics) can be
# reproduced across kernel restarts.
RANDOM_SEED = 42

# Define the classifiers and the grid that we want to optimize on.
# Note: either define a 'grid' and call classi.optimize(), or supply 'optimized_param'.
classifiers = [
    {
        'name': 'RandomForest',
        'clf': RandomForestClassifier(random_state=RANDOM_SEED),
        # Search grid, disabled because fixed parameters are supplied below:
#         'grid': {
#             'n_estimators': [100, 1000],
#             'max_depth': [2, 3]
#         },
        'optimized_param': {
            'max_depth': 3,
            'n_estimators': 1000,
        },
    },
]

# Bundle the feature table, the labels and the classifier definitions for the
# scaling, feature-ranking and evaluation steps that follow.
classi = classification.Classifiers(features, labels, classifiers)

In [47]:
# Compute the three scaled variants up front, in the same order as before.
standard_scaled = classi.standard_scaling()
minmax_scaled = classi.minmax_scaling()
robust_scaled = classi.robust_scaling()

# Baseline: information gain with the classifier object's default (unscaled) input.
print("No scaling")
classi.information_gain()

# Same ranking for the standard- and min-max-scaled variants.
for heading, scaled in (
    ("Standard scaling", standard_scaled),
    ("MinMax scaling", minmax_scaled),
):
    print(heading)
    classi.information_gain(scaled)

# The robust-scaled ranking stays as the cell's final bare expression so that its
# return value is still shown as the cell output.
print("Robust scaling")
classi.information_gain(robust_scaled)
# TODO: chi2 requires non-negative feature values -> we have -1 in the set,
# so the classi.chi2_stats(...) counterpart of each call above stays disabled.

No scaling
Information gain of whole dataset
                               Feature Name  Info Gain
1     ratioChars_ArticleParagraphsPostTitle   0.040017
2                  numFormalWords_PostTitle   0.024087
3           ratioChars_ArticleDescPostTitle   0.021760
4                        numWords_PostTitle   0.017788
5   ratioChars_ArticleParagraphsArticleDesc   0.014178
6          ratioChars_ArticleTitlePostTitle   0.010832
7                        numChars_PostTitle   0.008457
8        diffWords_PostTitleArticleKeywords   0.005295
9        diffChars_PostTitleArticleKeywords   0.004994
10         ratioWords_ArticleTitlePostTitle   0.004707
11             diffChars_PostTitlePostImage   0.003067
12                numQuestionmarksPostTitle   0.001995
13            ratioWords_PostImagePostTitle   0.000725
14            ratioChars_PostImagePostTitle   0.000000
15          ratioWords_ArticleDescPostTitle   0.000000
Standard scaling
Information gain of whole dataset
                        

Unnamed: 0,Feature Name,Info Gain
1,ratioChars_ArticleParagraphsPostTitle,0.040017
2,numFormalWords_PostTitle,0.024087
3,ratioChars_ArticleDescPostTitle,0.02176
4,numWords_PostTitle,0.017788
5,ratioChars_ArticleParagraphsArticleDesc,0.014178
6,ratioChars_ArticleTitlePostTitle,0.010832
7,numChars_PostTitle,0.008457
8,diffWords_PostTitleArticleKeywords,0.005295
9,diffChars_PostTitleArticleKeywords,0.004994
10,ratioWords_ArticleTitlePostTitle,0.004707


In [60]:
# Evaluate the configured classifier(s) and print the test report.
# The optimisation and cross-validation steps are left disabled, presumably
# because fixed parameters are supplied via 'optimized_param' — confirm before
# re-enabling.
# classi.optimize()
# classi.cross_val()
classi.test()

-- Performance on split: 80% train - 20% split --
              precision    recall  f1-score   support

no-clickbait       0.70      0.97      0.81       338
   clickbait       0.55      0.07      0.13       154

   micro avg       0.69      0.69      0.69       492
   macro avg       0.62      0.52      0.47       492
weighted avg       0.65      0.69      0.60       492

AUC on prediction: 0.5224006762468301
AUC on probabilities: 0.6524244985783447
Confusion matrix:
[[329   9]
 [143  11]]
-- Finished test reports --
