**Processing notebook** <br>
This notebook preprocesses textual data. It loads the data (from the clickbait-challenge site) and does the following: specifically:
- remove stop words <br>
- stem using a porter stemmer <br>
- remove HTML tags <br>
- trim white saces <br>
- replace numbers with special [N] token <br>
- tokenize  <br>


In [15]:
import time
import re
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize 
import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[0. 1. 1.]


In [0]:
#a simple function to concatenate a list of strings into one string
def combine_into_one_string(list_of_strings):
    x = ''
    for s in list_of_strings:
        x += s
    return x

#function for preprocessing according to techniques in papers read
def prepare_text(text):
    stop_words = set(stopwords.words('english')) 
    porter_stemmer = PorterStemmer() 

    HTML_tags = re.compile(r'<.+?>')
    NON_ALPHA_NUMERIC = re.compile(r'\W+')
    numbers = re.compile(r'\d+')
    whitespace = re.compile(r'\s+')
    
    preped_text = ''
    word_tokens = word_tokenize(text) 
    for token in word_tokens:
        
        token = token.lower()
        token = token.replace('–', '-')
        token = token.replace("'", ' ')
        token = HTML_tags.sub(' ', text)
        token = NON_ALPHA_NUMERIC.sub(' ', text)
        token = porter_stemmer.stem(token)
        if token not in stop_words:
            preped_text += token
        preped_text = numbers.sub(' [N]', preped_text)
        preped_text = whitespace.sub(' ', preped_text).strip()

    return preped_text

start_time = time.time()


In [0]:
start_time = time.time()

#load first raw data file
df_truth1 = pd.read_json('data/raw/truth1.jsonl', lines='True')
df_instances1 = pd.read_json('data/raw/instances1.jsonl', lines='True')
df_1 = df_instances1.merge(df_truth1, how='inner', on='id')

#load second raw data file
df_truth2 = pd.read_json('data/raw/truth2.jsonl', lines='True')
df_instances2 = pd.read_json('data/raw/instances2.jsonl', lines='True')
df_2 = df_instances2.merge(df_truth2, how='inner', on='id')

#concatenate
df_final = pd.concat([df_1, df_2])



#edit this array to process features
features= ['postText', 'targetKeywords'] 
for feature in features:
     feature_start_time = time.time()
     df_final[feature] = df_final[feature].transform(func=combine_into_one_string)
     df_final[feature] = df_final[feature].transform(func=prepare_text)
     print(f'Finished preprocessing {feature}. Total time for preprocessing {feature} was {time.time() - feature_start_time}s')
    
print(f'Total preprocess time: {time.time()-start_time}s')

print(df_final.head())

Finished preprocessing postText. Total time for preprocessing postText was 23.023833751678467s
Finished preprocessing targetKeywords. Total time for preprocessing targetKeywords was 73.52147960662842s
Total preprocess time: 97.62794876098633s
                   id                       postMedia  \
0  608310377143799808                              []   
1  609297109095972864  [media/609297109095972864.jpg]   
2  609504474621612032                              []   
3  609748367049105408                              []   
4  608688782821453824  [media/608688782821453825.jpg]   

                                            postText  \
0  apple s ios [N] app thinning feature will give...   
1  rt kenbrown [N] emerging market investors are ...   
2  u s soccer should start answering tough questi...   
3  how theme parks like disney world left the mid...   
4  could light bulbs hurt your health one company...   

                    postTimestamp  \
0  Tue Jun 09 16:31:10 +0000 2015   
1  

In [0]:
#output data
df_final.to_csv('data/prepped/prepped.csv', index=False)