<a href="https://colab.research.google.com/github/ubclaunchpad/Essentially/blob/abstractive-summary/summary_be/ml_notebook/abstractiveSummary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



# Abstractive Summary Model
The notebook covers the following steps:

- Importing pre-trained GloVe embeddings and using them to convert the tokenized words into embedding vectors
- Preprocessing the data by tokenizing the texts and keywords and converting them into numerical representations



In [1]:
# Mount Google Drive at /content/drive (Colab-only; relies on the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

# Make the "DataML/AbstractiveSummary" folder in Google Drive the working directory,
# so relative paths used later in the notebook resolve against it
import os
os.chdir("/content/drive/My Drive/DataML/AbstractiveSummary")

Mounted at /content/drive


In [2]:
# Installing required dependencies (-q keeps pip's output quiet)
# NOTE(review): no versions are pinned, so every fresh runtime installs whatever
# release is current — consider pinning for reproducibility
!pip install tensorflow -q
!pip install keras -q
!pip install nltk -q

In [3]:
# Load pre-trained GloVe embeddings and use them to convert tokenized words into embedding vectors
import numpy as np

def load_embeddings(embeddings_file):
    """Load word vectors from a GloVe-style text file.

    Every non-blank line is expected to hold a token followed by the
    whitespace-separated components of its vector.

    Args:
        embeddings_file: Path to the embeddings text file.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` NumPy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    # Decode explicitly as UTF-8 so loading does not depend on the platform's
    # default text encoding (which is not UTF-8 everywhere).
    with open(embeddings_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse
                continue
            word = values[0]
            embeddings[word] = np.array(values[1:], dtype='float32')
    return embeddings

def get_word_embeddings(tokenized_texts, embeddings, embedding_dim=None):
    """Map tokenized texts to lists of embedding vectors.

    Lookup order for each token: exact match, then (for string tokens) the
    lower-cased form, so capitalised tokens still hit an uncased vocabulary.
    Tokens found by neither lookup get a fresh random vector.

    Args:
        tokenized_texts: Iterable of token sequences (one sequence per text).
        embeddings: dict mapping words to embedding vectors.
        embedding_dim: Length of the random vector used for out-of-vocabulary
            words. When ``None`` it is inferred from the first vector in
            ``embeddings``; if ``embeddings`` is empty it falls back to 100.

    Returns:
        A list with one entry per text; each entry is a list holding one
        vector per token, in token order.
    """
    if embedding_dim is None:
        # Size OOV vectors like the real ones instead of assuming 100 dimensions
        sample_vector = next(iter(embeddings.values()), None)
        embedding_dim = len(sample_vector) if sample_vector is not None else 100

    word_embeddings = []
    for text in tokenized_texts:
        text_embeddings = []
        for word in text:
            if word in embeddings:
                text_embeddings.append(embeddings[word])
            elif isinstance(word, str) and word.lower() in embeddings:
                # Case-insensitive fallback for uncased vocabularies
                text_embeddings.append(embeddings[word.lower()])
            else:
                # Use a random embedding vector for out-of-vocabulary words
                text_embeddings.append(np.random.rand(embedding_dim))
        word_embeddings.append(text_embeddings)
    return word_embeddings

In [4]:
# Prepare the data: Preprocess the data by tokenizing the texts 
# and converting the keywords and texts into numerical representations such as word embeddings.
import nltk
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the 'punkt_tab' resource in word_tokenize instead of
# the pickled 'punkt' models; fetch it too so tokenization works with an unpinned install.
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

def preprocess_data(texts, keywords, embeddings_file='glove.6B.100d.txt', embeddings=None):
    """Tokenize texts and keywords and convert the tokens to embedding vectors.

    Args:
        texts: Iterable of raw text strings.
        keywords: Iterable of raw keyword strings.
        embeddings_file: Path of the embeddings file to load when
            ``embeddings`` is not supplied. Defaults to the file the notebook
            originally hard-coded.
        embeddings: Optional already-loaded word -> vector dict. Pass it to
            avoid re-reading the embeddings file on every call.

    Returns:
        Tuple ``(word_embeddings, keyword_embeddings)``: for texts and keywords
        respectively, a list with one list of vectors per input string.
    """
    # Tokenize the texts and keywords
    tokenized_texts = [word_tokenize(text) for text in texts]
    tokenized_keywords = [word_tokenize(keyword) for keyword in keywords]

    # Parsing the embeddings file is the expensive step, so only do it when the
    # caller has not handed in a preloaded dictionary
    if embeddings is None:
        embeddings = load_embeddings(embeddings_file)

    # Convert the tokenized data into numerical representations using word embeddings
    word_embeddings = get_word_embeddings(tokenized_texts, embeddings)
    keyword_embeddings = get_word_embeddings(tokenized_keywords, embeddings)

    return word_embeddings, keyword_embeddings

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
