# Text Summarization on the CNN / Daily Mail Dataset

In [1]:
import numpy as np

import tensorflow_datasets as tfds
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Record the TensorFlow version this notebook was executed with.
tf_version = tf.__version__
print("tf-version", tf_version)

tf-version 2.1.0


In [3]:
# Tunable settings for the input pipeline.
MAX_LENGTH = 400     # not used in the cells shown here; presumably a sequence-length cap -- TODO confirm
BUFFER_SIZE = 20000  # shuffle buffer size passed to Dataset.shuffle in get_dataset
BATCH_SIZE = 16      # batch size passed to Dataset.batch in get_dataset

## Get the dataset

In [4]:
# Load every available split of the CNN / Daily Mail dataset via TFDS;
# the result is indexed by split name below.
cnn_dailymail = tfds.load("cnn_dailymail")

In [5]:
# Pick out the two splits used in the rest of the notebook.
train, test = cnn_dailymail['train'], cnn_dailymail['test']

In [6]:
# Peek at a single raw training example: its feature keys, then both text fields.
for item in train.take(1):
    raw_article = item['article']
    raw_highlights = item['highlights']
    print(item.keys(), "\n")
    print(raw_article, "\n")
    print(raw_highlights)

dict_keys(['article', 'highlights']) 

tf.Tensor(b"By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 . The bishop of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A virus in late September and early October. The state Health Department has issued an advisory of exposure for anyone who attended five churches and took communion. Bishop John Folda (pictured) of the Fargo Catholic Diocese in North Dakota has exposed potentially hundreds of church members in Fargo, Grand Forks and Jamestown to the hepatitis A . State Immunization Program Manager Molly Howell says the risk is low, but officials feel it's important to alert people to the possible exposure. The diocese announced on Monday that Bishop John Folda is taking time off after being diagnosed with hepatitis A. The diocese says he contracted the infection through contaminated foo

## Clean Dataset
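- Remove the byline/timestamp prefix at the start of each article, e.g.
(By . Associated Press . PUBLISHED: . 14:11 EST, 25 October 2013 . | . UPDATED: . 15:36 EST, 25 October 2013 .)

- Then add a "start" and a "stop" token.

**TODO:** the cell below does not implement these two steps yet; it currently only lowercases the text and strips single quotes.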

In [10]:
def normalize_text(text):
    """Lowercase a TensorFlow string and strip single-quote quotation marks.

    An apostrophe is treated as a quotation mark, and removed, when it sits
    at the very start/end of the string or next to a character other than
    a-z / 0-9. Apostrophes enclosed by such characters on both sides
    (contractions such as "it's") are kept.

    Known limitation: a trailing possessive apostrophe ("the dogs' bones")
    looks like a closing quote and is removed as well. Non-ASCII letters are
    treated as boundaries by the character classes used here.

    Args:
        text: string tensor to normalize.

    Returns:
        The lowercased tensor with quotation-mark apostrophes removed.
    """
    text = tf.strings.lower(text)
    # Two boundary-anchored passes are used instead of one greedy "'(.*)'"
    # match: the greedy form pairs the first and last apostrophe on a line,
    # which strips apostrophes out of contractions and leaves inner
    # quotation marks untouched when a line holds several quoted spans.
    # Opening quotes: apostrophe at string start or after a non-alphanumeric.
    text = tf.strings.regex_replace(text, r"(^|[^a-z0-9])'", r"\1")
    # Closing quotes: apostrophe at string end or before a non-alphanumeric.
    text = tf.strings.regex_replace(text, r"'($|[^a-z0-9])", r"\1")
    return text


def map_func(features):
    """Turn one dataset example into a normalized (article, highlights) pair.

    Args:
        features: mapping with "article" and "highlights" entries, as
            yielded by the dataset splits.

    Returns:
        A 2-tuple: the article and the highlights, each passed through
        normalize_text.
    """
    return (
        normalize_text(features["article"]),
        normalize_text(features['highlights']),
    )



# Apply the text normalization to both splits, yielding (article, highlights) pairs.
train_ds, test_ds = train.map(map_func), test.map(map_func)

# Show one normalized training pair as a sanity check.
for article, highlights in train_ds.take(1):
    print(article, "\n")
    print(highlights)

tf.Tensor(b"by . associated press . published: . 14:11 est, 25 october 2013 . | . updated: . 15:36 est, 25 october 2013 . the bishop of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus in late september and early october. the state health department has issued an advisory of exposure for anyone who attended five churches and took communion. bishop john folda (pictured) of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a . state immunization program manager molly howell says the risk is low, but officials feel it's important to alert people to the possible exposure. the diocese announced on monday that bishop john folda is taking time off after being diagnosed with hepatitis a. the diocese says he contracted the infection through contaminated food while attending a conference for newl

In [11]:
def get_dataset(ds):
    """Wrap a dataset in the cache -> shuffle -> batch -> prefetch pipeline.

    Args:
        ds: dataset to wrap; it must provide cache/shuffle/batch/prefetch.

    Returns:
        The dataset cached, shuffled with a BUFFER_SIZE buffer, grouped into
        batches of BATCH_SIZE (an incomplete final batch is dropped) and
        prefetched with an autotuned buffer.
    """
    cached = ds.cache()
    shuffled = cached.shuffle(BUFFER_SIZE)
    batched = shuffled.batch(BATCH_SIZE, drop_remainder=True)
    return batched.prefetch(tf.data.experimental.AUTOTUNE)

# Build the batched pipelines from the raw splits rather than from the
# current value of train_ds / test_ds. Feeding the variable back into itself
# made this cell non-idempotent: running it a second time would shuffle and
# batch an already-batched dataset.
train_ds = get_dataset(train.map(map_func))
test_ds = get_dataset(test.map(map_func))

# Model
## Use a pretrained Model

https://towardsdatascience.com/simple-bert-using-tensorflow-2-0-132cb19e9b22

## Attention

$$ \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\left( \frac{Q K^{T}}{\sqrt{d_k}} \right) V $$

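The query $Q$ and key $K$ matrices are both derived from the input; their dot product scores how strongly each query word should focus on each key word. Dividing by $\sqrt{d_k}$, where $d_k$ is the key dimension, keeps these scores from growing too large. The softmax then turns the scores into probabilities, which determine how much each word in the value matrix $V$ contributes to the output.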


## References
- Vaswani, Ashish, et al. "Attention is all you need." Advances in neural information processing systems. 2017.