<a href="https://colab.research.google.com/github/timthedev07/text-summarization/blob/dev/model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download the dataset.

In [None]:
# WARNING: this deletes everything in the current working directory before
# the download starts. Only run it from a scratch directory.
!rm -rf ./*
# Create the data directory and work inside it for the download.
!mkdir data
%cd data
# NOTE(review): this is a signed URL carrying X-Goog-Date=20220814T105430Z and
# X-Goog-Expires=259200 (seconds), so it has presumably expired; replace it
# with a fresh download link if wget fails.
!wget 'https://storage.googleapis.com/kaggle-data-sets/18/2157/compressed/Reviews.csv.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220814%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220814T105430Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=9c43ef3996dc11896c1694490ef83cc73a3a92c7210ad6df6cc67bcfc6f28c97e837a40dbb974115ae28cad39ee9e42fc6a3ec97c0d1f8134ca63a3258a12f19f1f447390ca305c08564089870d787a63807075324191163a478cd165a71182d236cd42722d65f8f69fc87f8949921811f73f6c64f9075df58909cc4b95cc56d02e763df766a1b0d585a61117bcf2b4fbe1c47da6a19440599b3fcfba621859e4529e30fa2967cfb328ef041f836647c33381e3781943baceb1ea0004fa9d95ed5af3c3dd34a6e6ca9139d92ced22d9807530453318253ea630c8a02f8a21857bb98d97839100a742a8494aaea7fd31f14af098db2a597d0fe45587463a3fda7' -O reviews.csv.zip
# Unpack quietly and rename the extracted file to a lowercase name.
!unzip -q reviews.csv.zip
!mv Reviews.csv reviews.csv
# Return to the parent directory.
%cd ../


Import the required libraries.

In [None]:
import os
import re           
import numpy as np  
import pandas as pd 
import string
# for preprocessing
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords 
import nltk
from bs4 import BeautifulSoup 
# tf
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

Ensure that the corpus `stopwords` is downloaded.

In [None]:
# Fetch the NLTK 'stopwords' corpus so that stopwords.words(...) can be used.
nltk.download('stopwords')

Some constants.

In [None]:
# Directory that holds the dataset file.
DATA_DIR = "data"
# File name of the reviews CSV inside DATA_DIR.
CSV_FILENAME = "reviews.csv"

Read the dataset.

In [None]:
pathJoin = os.path.join

def readDataset():
  """
  Load the reviews CSV from DATA_DIR/CSV_FILENAME.

  Only the "Summary" and "Text" columns are read. Rows whose "Text" repeats
  an earlier row are dropped, and so is any row with a missing value.
  The original row index is kept (it is not reset).
  """
  csvPath = pathJoin(DATA_DIR, CSV_FILENAME)
  frame = pd.read_csv(csvPath, usecols=["Summary", "Text"])
  # De-duplicate on the review text first, then discard incomplete rows.
  return frame.drop_duplicates(subset=['Text']).dropna(axis=0)

Function for wrapping our $y$ values (the summaries) in special tokens that mark the start and the end of a sequence.

In [None]:
def embedToken(summary: str):
  """
  Wrap a summary in the start ("_A_") and end ("_B_") marker tokens,
  each separated from the summary by a single space.
  """
  startToken, endToken = "_A_", "_B_"
  return " ".join((startToken, summary, endToken))

Data cleaning.

In [None]:
# English stop words used by the cleaning step. "not" is taken out of the
# set so that the cleaning step leaves negations in the text.
stop_words = set(stopwords.words('english'))
stop_words.discard("not")

def decontracted(phrase: str):
  """
  Expand common English contractions in `phrase` and return the result.

  https://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
  """
  # Applied strictly in this order: the two irregular forms must be handled
  # before the generic "n't" rule, and "n't" before the bare "'t" rule.
  rules = (
    (r"won\'t", "will not"),
    (r"can\'t", "can not"),
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'t", " not"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
  )
  for pattern, expansion in rules:
    phrase = re.sub(pattern, expansion, phrase)
  return phrase

# Compiled stop-word patterns, keyed by the exact set of words they were built
# from, so a later change to `stop_words` is picked up automatically.
_stopWordPatterns = {}

def _stopWordPattern(words):
  """
  Return a compiled regex matching a run of one or more stop words together
  with the non-word characters around them, or None when `words` is empty.
  """
  key = frozenset(words)
  if not key:
    return None
  if key not in _stopWordPatterns:
    # Longest words first so that e.g. "then" is preferred over "the".
    alternation = "|".join(
      re.escape(w) for w in sorted(key, key=lambda w: (-len(w), w))
    )
    # \b (ASCII word characters only, i.e. [A-Za-z0-9_]) also matches at the
    # very start and end of the text, unlike a mandatory separator character.
    _stopWordPatterns[key] = re.compile(
      rf"[^A-Za-z0-9_]*\b(?:(?:{alternation})\b[^A-Za-z0-9_]*)+",
      re.ASCII,
    )
  return _stopWordPatterns[key]

def textClean(text: str):
  """
  Normalise a piece of review text and return the cleaned string.

  Steps, in order:
    1. lowercase and expand contractions (`decontracted`)
    2. strip HTML markup, keeping only the text content
    3. replace numbers and @mentions with a space, drop parenthesised text
    4. remove every word in `stop_words` (each run of stop words and the
       non-word characters around it collapses to a single space)
    5. delete all remaining punctuation characters

  Stop words are matched as whole words, so they are also removed when they
  are the first or last word of the text or when the same word is repeated
  back to back. All stop words are removed in a single regex pass.
  """
  text = decontracted(text.lower())
  text = BeautifulSoup(text, "html.parser").text

  # Numbers (integers, decimals, exponent notation) become a separator.
  text = re.sub(r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?", " ", text)
  # @mentions become a separator.
  text = re.sub(r"@([A-Za-z0-9_]+)", " ", text)
  # Parenthesised asides are removed entirely.
  text = re.sub(r"\([^)]*\)", "", text)

  stopPattern = _stopWordPattern(stop_words)
  if stopPattern is not None:
    text = stopPattern.sub(" ", text)

  text = re.sub(
    f"[{re.escape(string.punctuation)}]", "", text
  )

  return text

def getCleanedDataset(dataset: pd.DataFrame) -> pd.DataFrame:
  """
  Takes a dataframe containing these two required columns:
    - Summary
    - Text

  The output data frame contains the cleaned texts stored in the following
  columns:
    - summaryCleaned (cleaned summary wrapped in the start/end tokens)
    - textCleaned

  Rows whose cleaned summary or cleaned text contains no alphabetic
  character at all (including rows that became empty) are dropped. The
  surviving rows keep their positional index from the input order.
  """
  summaries = [textClean(i) for i in dataset["Summary"]]
  texts = [textClean(i) for i in dataset["Text"]]

  # Decide which rows to keep BEFORE the start/end tokens are added: the
  # tokens themselves contain letters, so checking afterwards would never
  # reject a summary.
  hasLetter = re.compile(r"[a-zA-Z]").search
  keep = np.array(
    [bool(hasLetter(s) and hasLetter(t)) for s, t in zip(summaries, texts)],
    dtype=bool,
  )

  cleaned = pd.DataFrame({
    "summaryCleaned": [embedToken(s) for s in summaries],
    "textCleaned": texts,
  })

  # Boolean-mask selection returns a new frame, avoiding chained in-place
  # modification of a column selection.
  return cleaned[keep]

# Preview the cleaning pipeline on the first few rows of the dataset.
getCleanedDataset(readDataset().head())
