# Motive: build a function that preprocesses text data with minimal information loss

In [1]:
import numpy as np 
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, strip_accents_unicode
import spacy
import en_core_web_sm
import pke
import re

from nltk.corpus import stopwords

from unidecode import unidecode

from tqdm import tqdm
tqdm.pandas()

In [2]:
# Read the CSV and keep only the heading and content columns.
data = pd.read_csv("../data/bbc_toi_yahoo_stats_feats.csv")[["heading", "content"]]
data.head()

Unnamed: 0,heading,content
0,domestic abuse: swansea man jailed for murderi...,a man has been jailed for life for battering h...
1,covid-19: how india failed to prevent a deadly...,"in early march, india's health minister harsh ..."
2,"northampton blaze 'cruel blow', say firm's owners",two friends who spent 10 years building up the...
3,einstein handwritten letter with equation fetc...,a letter written by albert einstein containing...
4,florida high school alters 80 'immodest' yearb...,a florida high school is facing a backlash for...


## Make a function to:
- find named entities
- find keyphrases
- tokenize
- lemmatize/stem
- vectorize

In [16]:
def preprocess(text):
    """Reduce a document to the distinct words of its YAKE keyphrases.

    The text is passed through pke's unsupervised YAKE extractor (candidates
    of up to 3 words, NLTK English stopwords as the stoplist). Punctuation is
    stripped from every returned keyphrase and the distinct words are joined
    with single spaces.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (e.g. None or the NaN pandas
        uses for missing cells) and blank strings are treated as empty.

    Returns
    -------
    str
        Space-separated unique keyphrase words, in the order they first
        appear in the extractor's output; "" when there is nothing to extract.
    """
    # Missing or blank input has no keyphrases; skip the extractor entirely.
    if not isinstance(text, str) or not text.strip():
        return ""

    stoplist = stopwords.words("english")

    # remove stopwords and extract keyphrases
    kp_extractor = pke.unsupervised.YAKE()
    kp_extractor.load_document(input=text, language="en", normalization=None)
    kp_extractor.candidate_selection(n=3, stoplist=stoplist)
    kp_extractor.candidate_weighting(
        window=2,
        stoplist=stoplist,
        use_stems=False
    )
    keyphrases = kp_extractor.get_n_best(
        n=100,
        threshold=0.7
    )

    # Only the first element of each returned entry (the phrase text) is used.
    cleaned = [
        re.sub("""[!,*)@#%(&$_?.^"']""", '', keyphrase[0])
        for keyphrase in keyphrases
    ]

    # split() without an argument drops the empty tokens left behind when a
    # phrase consisted only of stripped punctuation.
    tokens = " ".join(cleaned).split()

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is reproducible across kernel sessions (set ordering is not).
    return " ".join(dict.fromkeys(tokens))

In [21]:
# Load a fresh copy of the text columns so the raw `data` frame stays untouched.
data_for_prep = pd.read_csv("../data/bbc_toi_yahoo_stats_feats.csv")[["heading", "content"]]

# Run the keyphrase preprocessing over each text column in turn
# (one progress bar per column).
for column in ("heading", "content"):
    data_for_prep.loc[:, column] = data_for_prep.loc[:, column].progress_apply(preprocess)

# Blank line, then the preprocessed content of row 3 as a spot check.
print()
print(data_for_prep.loc[3, "content"])

  0%|          | 20/7900 [00:13<1:25:37,  1.53it/s]


KeyboardInterrupt: 

In [18]:
# Length of the preprocessed content of row 3.
print(len(data_for_prep.at[3, "content"]))

527


In [19]:
# Length of the raw (unprocessed) content of row 3.
print(len(data.at[3, "content"]))

1475


In [20]:
# Raw content of row 3.
print(data.at[3, "content"])

a letter written by albert einstein containing his e=mc² equation has sold at auction in the us for more than $1.2m (£850,000) - three times more than had been expected. experts say there are only three other known examples of the equation in the physicist's handwriting. the equation was first published in a scientific paper by einstein in 1905. it explains the interchangeability of energy and mass. the equation - energy equals mass times the speed of light squared - is a fundamental concept in modern physics. the man who made einstein world-famousnew einstein manuscripts unveiled this was the only example of the equation in a private collection and only became public recently, said boston-based rr auction, which sold the letter. "[it's] an important letter from both a holographic and a physics point of view," rr auction said in a statement, calling it "the most well-known equation ever set forth". the one-page letter in german is dated 26 october 1946 and addressed to polish-american 