<a href="https://colab.research.google.com/github/timthedev07/text-summarization/blob/dev/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download the target dataset.

In [None]:
# Start from a clean slate: remove leftovers from previous runs.
!rm -rf ./content
!rm -rf ./data
!rm -rf ./sample_data
# Everything downloaded below lives in ./data.
!mkdir data
%cd data
# NOTE(review): this is a signed URL dated 2022-08-14 (X-Goog-Date) with a
# 259200-second (3-day) lifetime (X-Goog-Expires), so it has presumably expired;
# a fresh link is likely required before this cell can work again.
!wget 'https://storage.googleapis.com/kaggle-data-sets/18/2157/compressed/Reviews.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220814%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220814T105430Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=9c43ef3996dc11896c1694490ef83cc73a3a92c7210ad6df6cc67bcfc6f28c97e837a40dbb974115ae28cad39ee9e42fc6a3ec97c0d1f8134ca63a3258a12f19f1f447390ca305c08564089870d787a63807075324191163a478cd165a71182d236cd42722d65f8f69fc87f8949921811f73f6c64f9075df58909cc4b95cc56d02e763df766a1b0d585a61117bcf2b4fbe1c47da6a19440599b3fcfba621859e4529e30fa2967cfb328ef041f836647c33381e3781943baceb1ea0004fa9d95ed5af3c3dd34a6e6ca9139d92ced22d9807530453318253ea630c8a02f8a21857bb98d97839100a742a8494aaea7fd31f14af098db2a597d0fe45587463a3fda7' -O reviews.csv.zip
# Unpack, normalise the file name to lower case, and drop the archive.
!unzip -q reviews.csv.zip
!mv Reviews.csv reviews.csv
!rm -rf reviews.csv.zip
# Return to the directory the notebook started in.
%cd ../


Import the required libraries.

In [None]:
import os
import re           
import numpy as np  
import pandas as pd 
import string
# for preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer 
import tensorflow as tf
from nltk.corpus import stopwords 
import nltk
from bs4 import BeautifulSoup 
# machine learning
from tensorflow.keras.layers import InputLayer, Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional, Attention
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

Ensure that the corpus `stopwords` is downloaded.

In [None]:
# Download the NLTK "stopwords" corpus, which the data-cleaning cell reads
# through `stopwords.words('english')`.
nltk.download('stopwords')

Some constants.

In [None]:
# Directory that holds the dataset files and the name of the raw reviews CSV in it.
DATA_DIR, CSV_FILENAME = "data", "reviews.csv"

# Markers placed before and after every summary to flag where the sequence
# starts and ends.
SUMMARY_TOKEN_START, SUMMARY_TOKEN_END = "_A_", "_B_"

Read the dataset.

In [None]:
pathJoin = os.path.join


def readDataset():
    """
    Load the raw reviews CSV from `DATA_DIR`.

    Only the "Summary" and "Text" columns are read. Rows whose "Text" repeats
    an earlier row are dropped, followed by rows with a missing value in
    either column.
    """
    csvPath = pathJoin(DATA_DIR, CSV_FILENAME)
    reviews = pd.read_csv(csvPath, usecols=["Summary", "Text"])
    uniqueReviews = reviews.drop_duplicates(subset=['Text'])
    return uniqueReviews.dropna(axis=0)

A function that wraps each of our $y$ values (the summaries) in special tokens marking the start and end of the sequence.

In [None]:
def embedToken(summary: str):
    """Return `summary` with the start marker before it and the end marker after it, space-separated."""
    return "{} {} {}".format(SUMMARY_TOKEN_START, summary, SUMMARY_TOKEN_END)

Data cleaning.

In [None]:
# English stop words from NLTK with "not" taken out, so "not" is never
# stripped from the texts (presumably to keep negations intact).
stop_words = set(stopwords.words('english')) - {"not"}

def decontracted(phrase: str):
    """
    Expand common English contractions in `phrase` and return the result.

    See https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
    """
    # Applied top to bottom. Order matters: the irregular "won't"/"can't" must
    # be rewritten before the generic "n't" rule, and "n't" before the bare "'t".
    rules = (
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    expanded = phrase
    for pattern, replacement in rules:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

_stopWordPattern = None  # compiled on first use by _getStopWordPattern()


def _getStopWordPattern():
    """Return one compiled regex matching any whole stop word, or None when `stop_words` is empty."""
    global _stopWordPattern
    if _stopWordPattern is None and stop_words:
        # Longest alternatives first so a multi-character stop word wins over
        # a shorter one that is its prefix.
        alternatives = "|".join(
            re.escape(word)
            for word in sorted(stop_words, key=lambda w: (-len(w), w))
        )
        # Lookarounds assert "not part of a longer word" without consuming the
        # neighbouring characters, so stop words at the very start/end of the
        # text and runs of adjacent stop words are all matched.
        _stopWordPattern = re.compile(
            f"(?<![A-Za-z0-9_])(?:{alternatives})(?![A-Za-z0-9_])"
        )
    return _stopWordPattern


def textClean(text: str):
    """
    Normalise one raw review text or summary.

    Steps, in order: lower-case and expand contractions, strip HTML markup,
    blank out numbers and @mentions, delete parenthesised spans, remove stop
    words, delete punctuation, and collapse whitespace.

    Stop words are removed with a single pre-compiled pattern. Unlike a
    pattern that requires a separator character on both sides of the word,
    this also removes stop words at the beginning or end of the text and
    stop words that directly follow another stop word.

    Returns the cleaned text with words separated by single spaces (possibly
    an empty string).
    """
    text = decontracted(text.lower())
    text = BeautifulSoup(text, "html.parser").text

    text = re.sub(r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?", " ", text)
    text = re.sub(r"@([A-Za-z0-9_]+)", " ", text)
    text = re.sub(r"\([^)]*\)", "", text)

    stopWordPattern = _getStopWordPattern()
    if stopWordPattern is not None:
        text = stopWordPattern.sub(" ", text)

    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)

    # The substitutions above leave runs of spaces behind; normalise them.
    return " ".join(text.split())

def getCleanedDataset(dataset: pd.DataFrame, savedCSVPath = None) -> pd.DataFrame:
    """
    Clean the raw reviews, or load an already-cleaned copy from disk.

    `dataset` must contain these two columns:
        - Summary
        - Text

    `savedCSVPath`, when given, is the path to a CSV file holding the result
    of a previous run of this step; it is read and returned as-is and
    `dataset` is ignored (it may be None).

    The returned data frame stores the cleaned texts in these columns:
        - summaryCleaned (wrapped in the start/end markers)
        - textCleaned

    Rows whose cleaned summary or cleaned text contains no letter at all
    (including rows that became empty) are dropped.
    """
    if savedCSVPath:
        return pd.read_csv(savedCSVPath)

    cleaned = pd.DataFrame(
        {
            "summaryCleaned": [textClean(i) for i in dataset["Summary"]],
            "textCleaned": [textClean(i) for i in dataset["Text"]],
        },
        dtype=object,
    )

    # Filter BEFORE adding the start/end markers: the markers themselves
    # contain letters, so a check made afterwards can never reject a summary.
    hasLetter = r"[a-zA-Z]"
    keep = (
        cleaned["summaryCleaned"].str.contains(hasLetter, regex=True)
        & cleaned["textCleaned"].str.contains(hasLetter, regex=True)
    )
    cleaned = cleaned[keep].copy()

    cleaned["summaryCleaned"] = cleaned["summaryCleaned"].map(embedToken)
    return cleaned

getCleanedDataset(readDataset().head())


Load all the data

In [None]:
# Alternative to the Drive download below: clean the raw dataset from scratch.
# allData = getCleanedDataset(readDataset())
from google.colab import drive
# Make Google Drive available under /content/gdrive (Colab only).
drive.mount('/content/gdrive')

# Copy the archived, pre-cleaned CSV from Drive into ./data and unpack it there.
!cp "/content/gdrive/My Drive/datasets/text-summarization/cleaned.csv.zip" data
%cd data
# -o: overwrite without prompting, -j: ignore directories stored in the archive, -q: quiet.
!unzip -ojq cleaned.csv.zip
!rm -rf cleaned.csv.zip
%cd ..
# A saved CSV path is supplied, so no raw dataset is needed (hence None).
allData = getCleanedDataset(None, savedCSVPath = pathJoin(DATA_DIR, "cleaned.csv"))

Investigate the lengths of our input data.

In [None]:
import matplotlib.pyplot as plt

# Number of whitespace-separated words in every cleaned text and summary.
text_word_count = [len(text.split()) for text in allData['textCleaned']]
summary_word_count = [len(summary.split()) for summary in allData['summaryCleaned']]

# One histogram per column shows how the lengths are distributed.
length_df = pd.DataFrame({'text': text_word_count, 'summary': summary_word_count})
length_df.hist(bins = 30)
plt.show()

Export and compress the cleaned data to my drive.

**_Only run this step if there isn't a pre-processed csv file._**

In [None]:
allData.to_csv(pathJoin(DATA_DIR, "/content/gdrive/My Drive/datasets/text-summarization/cleaned.csv"), index=False)
!zip -r "/content/gdrive/My Drive/datasets/text-summarization/cleaned.csv.zip" "/content/gdrive/My Drive/datasets/text-summarization/cleaned.csv"
!rm -rf "/content/gdrive/My Drive/datasets/text-summarization/cleaned.csv"

NOTE: the pre-processed csv can also be found [here](https://drive.google.com/file/d/1-01QGJvxBE-i9MsA5-PUVXSW1VbF4zBP/view?usp=sharing).

In [None]:
# Prepare for padding: the fixed lengths chosen for review texts and for summaries.
textMaxLen, summaryMaxLen = 150, 15

Prepare training and testing data.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
trainX, testX, trainY, testY = train_test_split(
    allData["textCleaned"],
    allData["summaryCleaned"],
    test_size=0.2,
    random_state=42,
)

Model building.

In [None]:
class TextSummarization:
    def __init__(self, trainX, trainY, testX, testY):
        """
        Store the data splits and set up tokenizers and hyper-parameters.

        The x and y values should be already-cleaned texts given as arrays of
        plain strings; any further processing of the data is done inside this
        class.
        """
        # Data splits; `tokenize` later overwrites these attributes.
        self.trainX, self.trainY = trainX, trainY
        self.testX, self.testY = testX, testY

        # One tokenizer (and therefore one vocabulary) per side.
        # NOTE(review): Keras' default Tokenizer `filters` include "_" and it
        # lower-cases its input - confirm the summary start/end markers
        # survive tokenization in the intended form.
        self.textTokenizer = Tokenizer()
        self.summaryTokenizer = Tokenizer()

        # Lengths used when padding texts and summaries.
        self.TEXT_MAX_LEN = 150
        self.SUMMARY_MAX_LEN = 15

        # Vocabulary sizes; unknown until `tokenize` has run.
        self.textVocabSize = None
        self.summaryVocabSize = None

        # Sub-models; `initEncoder` fills in `encoder`, and `decoder` is
        # presumably filled in by `initDecoder`.
        self.encoder = None
        self.decoder = None

        # Size passed to both the embedding and the LSTM layers.
        self.latentDim = 500

    def tokenize(self):
        """
        Turn the four text splits into padded integer sequences.

        Fits the text tokenizer on all review texts and the summary tokenizer
        on all summaries, replaces `trainX`/`testX`/`trainY`/`testY` with
        sequences padded at the end to `TEXT_MAX_LEN` (texts) or
        `SUMMARY_MAX_LEN` (summaries), and records both vocabulary sizes.
        """
        # training the tokenizers
        # NOTE(review): the vocabularies are fitted on train AND test data;
        # confirm this is intended, since it exposes test-set words to the model.
        self.textTokenizer.fit_on_texts(list(self.trainX) + list(self.testX))
        self.summaryTokenizer.fit_on_texts(list(self.trainY) + list(self.testY))

        pad = tf.keras.utils.pad_sequences

        def encode(tokenizer, texts, maxLen):
            # tokenize, then pad to a uniform shape
            return pad(tokenizer.texts_to_sequences(texts), maxlen=maxLen, padding="post")

        # Each split is encoded with the tokenizer of its own side. Routing
        # all four through one helper avoids assigning a result to the wrong
        # attribute (previously the summary sequences overwrote `trainX`).
        self.trainX = encode(self.textTokenizer, self.trainX, self.TEXT_MAX_LEN)
        self.testX = encode(self.textTokenizer, self.testX, self.TEXT_MAX_LEN)

        self.trainY = encode(self.summaryTokenizer, self.trainY, self.SUMMARY_MAX_LEN)
        self.testY = encode(self.summaryTokenizer, self.testY, self.SUMMARY_MAX_LEN)

        # +1 because index 0 is used for padding and never assigned to a word.
        self.textVocabSize = len(self.textTokenizer.word_index) + 1
        self.summaryVocabSize = len(self.summaryTokenizer.word_index) + 1

    def initEncoder(self):
        """
        Build the encoder and store it in `self.encoder`.

        The encoder is an embedding layer followed by three stacked LSTM
        layers. Must be called after `tokenize`, which sets `textVocabSize`.

        It is built with the functional API because the LSTM layers use
        `return_state=True` and therefore produce several outputs, which a
        `Sequential` model cannot chain. The resulting model outputs the last
        LSTM's full output sequence together with its final hidden and cell
        states.
        """
        encoderInput = Input(shape=(self.TEXT_MAX_LEN,))
        sequence = Embedding(self.textVocabSize, self.latentDim, trainable=True)(encoderInput)

        # Only the sequence output feeds the next layer; after the loop,
        # stateH/stateC hold the final states of the last LSTM.
        for _ in range(3):
            sequence, stateH, stateC = LSTM(
                self.latentDim, return_sequences=True, return_state=True
            )(sequence)

        self.encoder = Model(inputs=encoderInput, outputs=[sequence, stateH, stateC])

    def initDecoder(self):
        """
        Must be called after `initEncoder`
        """
        decoderInput = Input
        # LSTMOutput = tf.keras.Sequential([
        #     Embedding(self.summaryVocabSize, self.latentDim, trainable = True),
        #     LSTM(self.latentDim, return_sequences = True, return_state = True)
        # ])

        # attentionOutput, _ = Attention()()

        # Concatenated = Concatenate(axis = -1)([LSTMOutput, ])
