In [1]:
import math
import dash
from dash import dcc as dcc
from dash import html as html
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from dash.dependencies import Input, Output, State
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import T5Tokenizer, T5ForConditionalGeneration

import nltk
import re
import math
import contractions
import string
import networkx as nx
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load every parquet file found in the ./named_entities/ folder into a single
# pandas DataFrame, using the pyarrow engine.
# Later cells read the 'id', 'article' and 'named_entities' columns of this frame.
named_entities = pd.read_parquet('./named_entities/', engine='pyarrow')

In [3]:
# Expand the list-like 'named_entities' column so that every entity gets its own
# row, then rename the column to 'entity' for the cells that follow.
explode_df = named_entities.explode("named_entities").rename(columns={"named_entities": "entity"})

In [4]:
# Keep only the id and entity columns.
# .copy() makes entity_df an independent frame: later cells add a column to it,
# and assigning into a slice of explode_df triggers pandas' SettingWithCopyWarning.
entity_df = explode_df[['id', 'entity']].copy()

In [5]:
# Take position 1 of each 'entity' value as the entity's name.
# NOTE(review): presumably each entity looks like (label, name) - confirm against the parquet schema.
entity_df["entity_name"] = entity_df["entity"].str[1]

In [6]:
# Build the list of unique, lower-cased entity names used for search suggestions.
# Missing names (NaN) are dropped first; otherwise they end up in the list and are
# later stringified into a bogus "nan" suggestion.
unique_entities = entity_df['entity_name'].str.lower().dropna().unique().tolist()

In [7]:
# a function that takes an input string and returns a list of ids whose entity name equals the input string
def get_ids(input_string):
    """Return the unique article ids whose entity name equals ``input_string``.

    The comparison is case-insensitive and exact (not a substring match).
    Rows with a missing entity name never match, so the shared ``entity_df``
    frame is read but not modified.

    Args:
        input_string: Entity name to look up.

    Returns:
        list: Unique ids in order of first appearance; empty when
        ``input_string`` is empty/None or nothing matches.
    """
    if not input_string:
        return []

    # NaN == "text" evaluates to False, so no fillna() is required here.
    lowered_names = entity_df['entity_name'].str.lower()
    is_match = lowered_names == input_string.lower()
    return entity_df.loc[is_match, 'id'].unique().tolist()


In [8]:
# Look up the article texts that belong to a list of ids in the named_entities dataframe
def get_articles(ids):
    """Return the unique 'article' values of the rows whose 'id' is in ``ids``."""
    id_mask = named_entities['id'].isin(ids)
    matching_articles = named_entities[id_mask]['article']
    return matching_articles.unique().tolist()

In [9]:
# Load the pretrained "facebook/bart-large-cnn" tokenizer and model once at
# module level; get_summary() below uses these two globals.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

In [10]:
def get_summary(article):
    """Summarise one article with the module-level BART tokenizer and model.

    The generation length limit is a tenth of the article's character count,
    kept within the range 56..1024.
    """
    # Clamp ceil(len / 10) into [56, 1024].
    length_limit = min(max(math.ceil(len(article) / 10), 56), 1024)

    encoded = tokenizer(
        article,
        truncation=True,
        padding="longest",
        max_length=1024,
        return_tensors="pt",
    )
    generated = model.generate(
        encoded["input_ids"],
        num_beams=4,
        max_length=length_limit,
        early_stopping=True,
    )
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

In [11]:
# Summarise every article in a list, keeping the input order
def get_summaries(articles):
    """Return a list with one get_summary() result per entry of ``articles``."""
    return [get_summary(text) for text in articles]

In [13]:
# Download every NLTK resource this notebook relies on, not just wordnet:
# stopwords.words(), sent_tokenize()/word_tokenize() and nltk.pos_tag() all raise
# LookupError on a fresh environment when their data is missing.
# Both the classic and the newer resource names are requested; nltk.download()
# only reports (does not raise) when an id is unknown to the installed version.
for _nltk_resource in (
    'wordnet',
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(_nltk_resource)

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_tf_idf_text(text):
    """Normalise raw text into a space-separated string of content tokens.

    Steps, in order: expand contractions, lowercase, strip punctuation,
    tokenise, drop English stop words.

    Contractions are expanded first, while the apostrophes are still present.
    Doing it after punctuation removal turns "we're" into "were" and "he'll"
    into "hell", which can no longer be expanded reliably, and doing it after
    stop-word removal re-introduces stop words such as "not".

    Args:
        text: Raw input text.

    Returns:
        str: The cleaned tokens joined by single spaces ("" for empty input).
    """
    if not text:
        return ""

    text = contractions.fix(text)
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Also removes non-ASCII punctuation that string.punctuation does not cover.
    text = re.sub(r"[^\w\s]", "", text)
    tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(tokens)

def tf_score(word, sentence):
    """Return the term frequency of ``word`` in ``sentence``.

    The sentence is split on whitespace; the score is the number of tokens
    equal to ``word`` divided by the total number of tokens.

    Args:
        word: Token to count (exact, case-sensitive match).
        sentence: Whitespace-separated text.

    Returns:
        float: Frequency in [0, 1]; 0.0 when the sentence has no tokens
        (previously this raised ZeroDivisionError).
    """
    tokens = sentence.split()
    if not tokens:
        return 0.0
    return tokens.count(word) / len(tokens)

def idf_score(no_of_sentences, word, sentences):
    """Return log10(no_of_sentences / (df + 1)) for ``word``.

    ``df`` is the number of entries of ``sentences`` that contain ``word``
    according to Python's ``in`` operator (a substring test for strings).
    """
    containing = [sentence for sentence in sentences if word in sentence]
    ratio = no_of_sentences / (len(containing) + 1)
    return math.log10(ratio)

def get_tf_idf_summary(custom_text, percentage):
    """Extractive summary: keep the sentences with the highest TF-IDF score.

    Each sentence is scored as the sum of tf * idf over its nouns and verbs
    (POS tags starting with NN / VB) that are not stop words and are longer
    than one character. The top ``percentage`` percent of sentences (rounded
    up) are returned in their original document order.

    Fix over the previous version: words were lowercased and lemmatised but
    then counted against the raw sentence, so any capitalised or inflected
    word received tf = 0, and idf missed capitalised occurrences. Both sides
    of the comparison are now normalised the same way.

    Args:
        custom_text: Text to summarise.
        percentage: Share of sentences to keep, in percent.

    Returns:
        str: The selected sentences joined by single spaces ("" for empty input).
    """
    if not custom_text or not custom_text.strip():
        return ""

    sentences = sent_tokenize(custom_text)
    total_sentences = len(sentences)
    no_of_sentences = math.ceil(total_sentences * (percentage / 100))

    # idf is looked up against lower-cased sentences because lemmas are lower-case.
    lowered_sentences = [sentence.lower() for sentence in sentences]
    idf_cache = {}  # idf depends only on the lemma, so compute it once per lemma

    sentences_tf_idf = {}
    for i, sentence in enumerate(sentences, 1):
        sentence = re.sub(r'\d+', '', sentence)
        tokens = sentence.split()
        if not tokens:
            sentences_tf_idf[i] = 0
            continue

        # Normalised view of the sentence, aligned token-by-token with `tokens`.
        lemmas = [lemmatizer.lemmatize(token.lower()) for token in tokens]
        lemma_sentence = ' '.join(lemmas)

        sentence_tf_idf = 0
        for (word, pos_tag), lemma in zip(nltk.pos_tag(tokens), lemmas):
            if word.lower() not in stop_words and len(word) > 1 and pos_tag.startswith(('NN', 'VB')):
                if lemma not in idf_cache:
                    idf_cache[lemma] = idf_score(total_sentences, lemma, lowered_sentences)
                sentence_tf_idf += tf_score(lemma, lemma_sentence) * idf_cache[lemma]
        sentences_tf_idf[i] = sentence_tf_idf

    ranked = sorted(sentences_tf_idf.items(), key=lambda x: x[1], reverse=True)
    selected = {sentence_no for sentence_no, _ in ranked[:no_of_sentences]}

    # Emit the chosen sentences in document order so the summary reads naturally.
    return " ".join(sentence for i, sentence in enumerate(sentences, 1) if i in selected)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vijay\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
def preprocess_text_rank_lsa_text(text):
    """Split text into sentences and build a cleaned token list for each one.

    Fixes over the previous version:
    * Sentences are split from the original text, not a lower-cased copy, and
      the returned sentences keep their original casing, so summaries built
      from them are no longer all lower-case.
    * Contractions are expanded before punctuation is stripped, while the
      apostrophes needed to recognise them are still present.

    Args:
        text: Raw input text.

    Returns:
        tuple: ``(preprocessed_sentences, sentences)`` where
        ``preprocessed_sentences[i]`` is the list of lower-cased,
        punctuation-free, non-stop-word tokens of ``sentences[i]``.
    """
    if not text:
        return [], []

    sentences = sent_tokenize(text)
    english_stop_words = set(stopwords.words('english'))
    punctuation_table = str.maketrans("", "", string.punctuation)

    preprocessed_sentences = []
    for sentence in sentences:
        cleaned = contractions.fix(sentence).lower()
        cleaned = cleaned.translate(punctuation_table)
        # Also removes non-ASCII punctuation that string.punctuation does not cover.
        cleaned = re.sub(r"[^\w\s]", "", cleaned)
        tokens = [token for token in word_tokenize(cleaned) if token not in english_stop_words]
        preprocessed_sentences.append(tokens)
    return preprocessed_sentences, sentences

In [15]:
def build_similarity_matrix(sentences):
    """Return the pairwise cosine-similarity matrix of tokenised sentences.

    Each token list is joined back into a string, vectorised with a fresh
    TfidfVectorizer, and compared against every other sentence.
    """
    documents = [' '.join(tokens) for tokens in sentences]
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    return cosine_similarity(tfidf_matrix)

def get_text_rank_summary(custom_text, percentage):
    """Extractive summary using TextRank (PageRank over sentence similarity).

    Sentences are nodes of a graph weighted by TF-IDF cosine similarity; the
    top ``percentage`` percent by PageRank score (rounded up) are kept.

    Fixes over the previous version:
    * Selected sentences are joined with a space (they used to be glued
      together with no separator).
    * Selected sentences are returned in document order, consistent with
      get_tf_idf_summary.
    * Empty input returns "" and no longer raises inside the vectoriser.

    Args:
        custom_text: Text to summarise.
        percentage: Share of sentences to keep, in percent.

    Returns:
        str: The selected sentences joined by single spaces.
    """
    if not custom_text or not custom_text.strip():
        return ""

    processed_article, sentence_tokens = preprocess_text_rank_lsa_text(custom_text)
    if not sentence_tokens:
        return ""

    top_n = math.ceil(len(sentence_tokens) * (percentage / 100))
    similarity_matrix = build_similarity_matrix(processed_article)
    graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(graph)  # maps sentence index -> score

    ranked_indices = sorted(range(len(sentence_tokens)), key=lambda i: scores[i], reverse=True)
    chosen_indices = sorted(ranked_indices[:top_n])
    return ' '.join(sentence_tokens[i] for i in chosen_indices)

In [16]:
def get_lsa_summary(custom_text, percentage):
    """Extractive summary using latent semantic analysis (truncated SVD).

    Sentences are vectorised with TF-IDF, projected onto the leading SVD
    components, and scored by the length of their projection. The top
    ``percentage`` percent of sentences (rounded up, at least one) are kept.

    Fixes over the previous version:
    * ``n_components`` is capped at the vocabulary size; TruncatedSVD raises
      ValueError when asked for more components than features.
    * Sentences are scored by the Euclidean norm of their projection. The
      previous signed sum let positive and negative coordinates cancel, and
      the sign of an SVD component carries no salience information.
    * ``random_state`` is fixed so repeated runs select the same sentences.
    * Selected sentences are joined with a space and returned in document
      order, consistent with get_tf_idf_summary.
    * Empty input returns "".

    Args:
        custom_text: Text to summarise.
        percentage: Share of sentences to keep, in percent.

    Returns:
        str: The selected sentences joined by single spaces.
    """
    if not custom_text or not custom_text.strip():
        return ""

    processed_article, sentence_tokens = preprocess_text_rank_lsa_text(custom_text)
    if not sentence_tokens:
        return ""

    num_sentences = max(1, math.ceil(len(sentence_tokens) * (percentage / 100)))

    sentence_vectors = [' '.join(tokens) for tokens in processed_article]
    tfidf_matrix = TfidfVectorizer().fit_transform(sentence_vectors)

    n_features = tfidf_matrix.shape[1]
    n_components = max(1, min(num_sentences, n_features))
    lsa_model = TruncatedSVD(n_components=n_components, random_state=0)
    lsa_matrix = lsa_model.fit_transform(tfidf_matrix)  # rows: sentences, cols: topics

    sentence_scores = np.linalg.norm(lsa_matrix, axis=1)
    ranked_indices = sorted(range(len(sentence_tokens)), key=lambda i: sentence_scores[i], reverse=True)
    chosen_indices = sorted(ranked_indices[:num_sentences])
    return ' '.join(sentence_tokens[i] for i in chosen_indices)


In [17]:
# Holds the (tokenizer, model) pair after the first load so that later calls reuse it.
_T5_COMPONENTS = {}


def _load_t5_components(model_name="t5-base"):
    """Return the cached (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _T5_COMPONENTS:
        _T5_COMPONENTS[model_name] = (
            T5Tokenizer.from_pretrained(model_name),
            T5ForConditionalGeneration.from_pretrained(model_name),
        )
    return _T5_COMPONENTS[model_name]


def get_t5_summary(custom_text, percentage):
    """Abstractive summary of ``custom_text`` with the t5-base model.

    The text is tokenised, cut into windows that fit the 512-token input
    limit, each window is summarised, and the partial summaries are joined.

    Fixes over the previous version:
    * The tokenizer and model are loaded once and cached, not on every call.
    * Inputs carry the "summarize: " task prefix that T5 uses to select the
      summarisation task.
    * Lengths are measured in tokens. Character counts were previously used
      as token limits, and text was cut every 512 characters, splitting words.
    * ``max_length`` passed to generate() is always an int (it used to be a
      float after the division by the number of parts).
    * Empty input returns "".

    Args:
        custom_text: Text to summarise.
        percentage: Target summary size as a percentage of the input length
            (in tokens). The per-window limit is kept within 56..512.

    Returns:
        str: The summaries of all windows joined by single spaces.
    """
    if not custom_text or not custom_text.strip():
        return ""

    tokenizer, model = _load_t5_components()

    task_prefix = "summarize: "
    max_input_tokens = 512
    prefix_tokens = len(tokenizer.encode(task_prefix, add_special_tokens=False))
    # Leave room for the task prefix and the end-of-sequence token.
    window = max_input_tokens - prefix_tokens - 1

    token_ids = tokenizer.encode(custom_text, add_special_tokens=False)
    if not token_ids:
        return ""
    chunks = [token_ids[start:start + window] for start in range(0, len(token_ids), window)]

    summary_budget = math.ceil(len(token_ids) * (percentage / 100))
    max_length = max(56, min(512, math.ceil(summary_budget / len(chunks))))

    summaries = []
    for chunk in chunks:
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        # truncation=True guards against the re-encoded text being slightly longer.
        input_ids = tokenizer.encode(
            task_prefix + chunk_text,
            truncation=True,
            max_length=max_input_tokens,
            return_tensors="pt",
        )
        summary_ids = model.generate(input_ids, max_length=max_length)
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return " ".join(summaries)

In [18]:
def get_bart_summary(custom_text, percentage):
    """Abstractive summary of ``custom_text`` with facebook/bart-large-cnn.

    The text is tokenised, cut into windows that fit the 1024-token input
    limit, each window is summarised with beam search, and the partial
    summaries are joined.

    Fixes over the previous version:
    * Reuses the module-level ``tokenizer`` / ``model`` (the same
      facebook/bart-large-cnn checkpoint) and no longer reloads it per call.
    * Lengths are measured in tokens. Character counts were previously used
      as token limits, and text was cut every 1024 characters, splitting words.
    * ``max_length`` passed to generate() is always an int (it used to be a
      float after the division by the number of parts).
    * Empty input returns "".

    Args:
        custom_text: Text to summarise.
        percentage: Target summary size as a percentage of the input length
            (in tokens). The per-window limit is kept within 56..1024.

    Returns:
        str: The summaries of all windows joined by single spaces.
    """
    if not custom_text or not custom_text.strip():
        return ""

    max_input_tokens = 1024
    # Leave room for the start- and end-of-sequence tokens added on encoding.
    window = max_input_tokens - 2

    token_ids = tokenizer.encode(custom_text, add_special_tokens=False)
    if not token_ids:
        return ""
    chunks = [token_ids[start:start + window] for start in range(0, len(token_ids), window)]

    summary_budget = math.ceil(len(token_ids) * (percentage / 100))
    max_length = max(56, min(1024, math.ceil(summary_budget / len(chunks))))

    summaries = []
    for chunk in chunks:
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        # truncation=True guards against the re-encoded text being slightly longer.
        inputs = tokenizer(
            chunk_text,
            truncation=True,
            padding="longest",
            max_length=max_input_tokens,
            return_tensors="pt",
        )
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=4,
            max_length=max_length,
            early_stopping=True,
        )
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return " ".join(summaries)

In [29]:
# The suggestion RadioItems are created by a callback and are therefore absent
# from the initial layout; callbacks that use them as Input need
# suppress_callback_exceptions on the app itself (it is not a run_server option).
app = dash.Dash(__name__, suppress_callback_exceptions=True)

# Build the summarised article text for the selected suggestion
def get_article_content(suggestion_value, visible_paragraphs):
    """Return ``(content, ids)`` for the chosen entity, or ``(None, None)`` if none is chosen.

    ``ids`` are all ids matching the entity; ``content`` holds the summaries of
    the articles for the first ``visible_paragraphs`` ids, separated by blank lines.
    """
    if not suggestion_value:
        return None, None

    ids = get_ids(suggestion_value)
    leading_ids = ids[:visible_paragraphs]
    summaries = get_summaries(get_articles(leading_ids))
    return '\n\n'.join(summaries), ids

def get_updated_article_content(ids, visible_paragraphs):
    """Return the summaries for the three ids ending at ``visible_paragraphs``.

    The summaries are separated by blank lines. Returns None when ``ids`` is
    empty or None.
    """
    if not ids:
        return None

    window_start = visible_paragraphs - 3
    batch_ids = ids[window_start:visible_paragraphs]
    batch_summaries = get_summaries(get_articles(batch_ids))
    return '\n\n'.join(batch_summaries)

# Break article text into paragraphs at blank lines
def split_into_paragraphs(article_content):
    """Return the pieces of ``article_content`` separated by a double newline."""
    return article_content.split('\n\n')


# Number of paragraphs to show initially.
# Also used as the initial value of the 'visible-paragraphs' store and as the
# step added by load_more_content() on each "Load More" click.
initial_paragraphs = 3

# Page layout. Top to bottom: title/author header, the entity search section
# (text input, suggestion list, summarised articles with a "Load More" button),
# and the custom-text section (model dropdown, percentage slider, text area,
# "Get Summary" button and summary output).
app.layout = html.Div([
    # ---- Header ----
    html.H1(children='Event based multi-document text summarisation of news articles',
            style={'textAlign': 'center', 'color': '#000205'}),
    html.Div(children=[
        html.Div(children='''Vijay Jawali''', style={'textAlign': 'left', 'color': '#000205'}),
        html.Div(children='''Project - Data Science MSc [06 32255]''', style={'textAlign': 'center', 'color': '#000205'}),
        html.Div(children='''2437649''', style={'textAlign': 'right', 'color': '#000205'})
    ], style={'display': 'flex', 'justifyContent': 'space-between'}),
    html.Br(),
    html.Hr(style={'border-top': '1px solid #000000', 'width': '100%'}),
    html.Br(),


    # ---- Entity search section ----
    html.H3(children='Select an entity to summarise the news events using BART model',
            style={'textAlign': 'center', 'color': '#000205'}),
    html.H4(children='Search Entity : ',
            style={'color': '#000205'}),

    html.Div([
        # Free-text search box; its value drives update_suggestions().
        dcc.Input(
            id='my-input',
            type='text',
            value='',
            placeholder='Type here...',
            autoComplete='off',
            style={'float': 'left', 'width': '20%', 'margin-right': '20px'}  # Adjust width and margins
        ),
        html.Br(),
        html.Br(),
        
        # Container that update_suggestions() fills with the RadioItems list.
        dcc.Loading(
            id="loading-suggestions-container",
            type="circle",
            children=[html.Div(id='suggestions-container', style={'float': 'left', 'width': '20%'})],
            style={'textAlign': 'left', 'width': '20%'}
        ),
    ]),

    # Shows the "Entity Selected is: ..." heading written by update_output().
    html.Div(id='entity-selected', style={'text-align': 'center', 'margin': '0 auto'}),

    html.Br(),

    html.Div([
        # dcc.Markdown(
        #     id='article-content',
        #     children='',
        #     style={'display': 'none'},
        # ),
        # Hidden carrier for the initial summaries; never displayed itself
        # (display: none) but read as State by load_more_content().
        dcc.Loading(
            id="loading-article-content",
            type="default",
            children=[
                dcc.Markdown(
                    id='article-content',
                    children='',
                    style={'display': 'none'},
            )]
        ),
        # The summaries actually shown to the user; written by load_more_content().
        dcc.Markdown(
            id='article-content-updated', 
            children='',
        ),
        html.Br(),
        html.Br(),
        # Hidden output of load_more_content(), wrapped in dcc.Loading.
        dcc.Loading(
            id="loading-article-content-updated",
            type="default",
            children=[
                dcc.Markdown(
                    id='article-content-updated-loading',
                    children='',
                    style={'display': 'none'},
                )]
        ),
        html.Br(),
        # Hidden until update_article_content() has content to show.
        html.Button('Load More', id='load-more-button', n_clicks=0, style={'margin': '10px', 'display': 'none'}),
        
        # Client-side stores: paragraphs displayed so far, how many ids have
        # been consumed, and the ids of the selected entity.
        dcc.Store(id='hidden-paragraphs', data=[]),
        dcc.Store(id='visible-paragraphs', data=initial_paragraphs),
        dcc.Store(id='article-ids', data=None),
    ], style={'float': 'right', 'width': '70%'}),
    html.Br(),
    html.Br(),
    html.Br(),
    html.Br(),
    html.Hr(style={'border-top': '1px solid #000000', 'width': '100%', 'margin-top': '20px', 'margin-bottom': '20px'}),
    html.Br(),
    html.Br(),
    
    # ---- Custom text section ----
    html.H3(children='Summarise your own text document using your own model choice',
            style={'textAlign': 'center', 'color': '#000205'}),
    html.H5(children='Select model',
            style={'textAlign': 'center', 'color': '#000205'}),
    # Option values are the keys update_summary() dispatches on.
    dcc.Dropdown(
        id='model-selection-dropdown',
        options=[
            {'label': 'term frequency inverse document frequency', 'value': 'term frequency inverse document frequency'},
            {'label': 'text rank', 'value': 'text rank'},
            {'label': 'latent semantic analysis', 'value': 'latent semantic analysis'},
            {'label': 't5', 'value': 't5'},
            {'label': 'bart', 'value': 'bart'},
        ],
        value='',  
        style={'width': '50%', 'margin': '0 auto'}, 
    ),
    html.H5(children='Select percentage of text to retain in the summary',
            style={'textAlign': 'center', 'color': '#000205'}),
    # Percentage passed to the summarisers; starts unset (None).
    dcc.Slider(
        id='summary-percentage-slider',
        min=10, 
        max=90,  
        step=1,  
        value=None,  
        marks={i: f"{i}%" for i in range(10, 91, 10)},  
        included=False,  
        tooltip={'placement': 'bottom'}  
    ),
    # Echo of the current model/percentage choice; revealed by show_custom_selection().
    html.H4(id='custom-summary', style={'textAlign': 'center', 'color': '#000205', 'display': 'none'}),
    
    
    html.H4(children='Enter your text : ',
            style={'color': '#000205'}),
    
    dcc.Textarea(
        id='custom-text-input',
        placeholder='Type here...',
        style={'width': '100%', 'height': '300px'}
    ),
    # Character counter written by update_text_length_info().
    html.Div(id='custom-text-length-info', style={'margin-top': '10px'}),
    
    html.Br(),
    html.Button('Get Summary', id='get-summary-button', n_clicks=0, style={'margin': '10px'}),
    html.Br(),
    html.H4(children='Summary : ',
            style={'color': '#000205'}),
    html.Br(),
    # Output of update_summary(); 'pre-line' keeps line breaks in the summary text.
    dcc.Loading(
        id="loading-summary-output",
        type="circle",
        children=[html.Div(id="summary-output", style={'whiteSpace': 'pre-line'})]
    ),
    html.Br(),
    html.Hr(style={'border-top': '1px solid #000000', 'width': '100%', 'margin-top': '20px', 'margin-bottom': '20px'}),
    html.Br(),
    html.Br(),
    html.Br(),
    html.Br(),
    
])


@app.callback(
    Output('suggestions-container', 'children'),
    [Input('my-input', 'value')],
    [State('my-input', 'id')]
)
def update_suggestions(value, input_id):
    """Render the entity suggestions whose name starts with the typed text.

    Matching is a case-insensitive prefix match against ``unique_entities``.
    Non-string entries (for example NaN left by missing entity names) are
    skipped; previously they were stringified and offered as a "nan" entry.

    Args:
        value: Current content of the search box.
        input_id: Id of the search box (supplied by the State, not used).

    Returns:
        A ``dcc.RadioItems`` with one option per match, wrapped in a
        scrollable ``html.Div`` when there are more than 10 matches, or
        None when the box is empty or nothing matches.
    """
    if not value:
        return None

    prefix = value.lower()
    filtered_suggestions = [
        s for s in unique_entities
        if isinstance(s, str) and s.lower().startswith(prefix)
    ]
    if not filtered_suggestions:
        return None

    radio_items = dcc.RadioItems(
        id={'type': 'suggestion', 'index': 'ALL'},
        options=[{'label': s, 'value': s} for s in filtered_suggestions],
        labelStyle={'display': 'block', 'margin-bottom': '5px'},
        value=''
    )

    if len(filtered_suggestions) > 10:
        # Long lists get a fixed-height scrollable wrapper.
        return html.Div(
            radio_items,
            style={'max-height': '500px', 'overflow': 'scroll'}
        )
    return radio_items


@app.callback(
    Output('entity-selected', 'children'),
    [Input({'type': 'suggestion', 'index': 'ALL'}, 'value')]
)
def update_output(suggestion_value):
    """Return a centred heading naming the selected entity, or None when nothing is selected."""
    if not suggestion_value:
        return None

    heading_text = 'Entity Selected is: ' + str(suggestion_value)
    heading_style = {'textAlign': 'center', 'color': '#000205'}
    return html.H3(heading_text, style=heading_style)

@app.callback(
    Output('article-content', 'children'),
    Output('article-ids', 'data'),
    Output('load-more-button', 'style'),
    [Input({'type': 'suggestion', 'index': 'ALL'}, 'value')],
    [State('visible-paragraphs', 'data')]
)
def update_article_content(suggestion_value, visible_paragraphs):
    """Summarise the first batch of articles for a newly selected entity.

    A new selection always starts with ``initial_paragraphs`` summaries. The
    'visible-paragraphs' store still holds the count reached with the previous
    entity's "Load More" clicks; using it here made every new selection
    summarise that many articles up front.

    Args:
        suggestion_value: Entity chosen in the suggestion list.
        visible_paragraphs: Current 'visible-paragraphs' store value (supplied
            by the State; intentionally not used, see above).

    Returns:
        tuple: ``(content, ids, button_style)``. ``content`` and ``ids`` are
        None and the button is hidden when there is nothing to show.
    """
    nothing_to_show = (None, None, {'display': 'none'})
    if not suggestion_value:
        return nothing_to_show

    article_content, ids = get_article_content(suggestion_value, initial_paragraphs)
    if not article_content:
        return nothing_to_show
    return article_content, ids, {'display': 'block'}

@app.callback(
    [Output('hidden-paragraphs', 'data'),
     Output('visible-paragraphs', 'data'),
     Output('article-content-updated', 'children'), Output('article-content-updated-loading', 'children')],
    [Input('load-more-button', 'n_clicks'),
    Input('article-ids', 'data')],
    [State('hidden-paragraphs', 'data'),
     State('visible-paragraphs', 'data'),
     State('article-content', 'children')]
)
def load_more_content(n_clicks, ids, hidden_paragraphs, visible_paragraphs, article_content):
    """Maintain the list of displayed summaries.

    The callback fires for two different reasons, told apart through
    ``dash.callback_context``:

    * 'article-ids' changed (a new entity was selected): the display is reset
      to the first ``initial_paragraphs`` paragraphs of ``article_content``.
    * "Load More" was clicked: the next ``initial_paragraphs`` summaries are
      fetched and appended.

    Fixes over the previous version, which only looked at ``n_clicks``:
    * Once the button had been clicked, selecting another entity appended
      that entity's paragraphs to the previous entity's and skipped ahead in
      its id list.
    * When no further articles were available, the last paragraphs of the
      initial content were appended again on every click.

    Args:
        n_clicks: Click count of the "Load More" button.
        ids: Ids of the selected entity.
        hidden_paragraphs: Paragraphs displayed so far.
        visible_paragraphs: Number of ids consumed so far.
        article_content: Initial summaries produced for the selected entity.

    Returns:
        tuple: ``(paragraphs, visible_count, markdown_text, None)``; the last
        element only feeds the hidden output wrapped in dcc.Loading.
    """
    if article_content is None:
        return dash.no_update, dash.no_update, dash.no_update, dash.no_update

    triggered_props = [trigger['prop_id'] for trigger in dash.callback_context.triggered]
    load_more_clicked = any(prop.startswith('load-more-button.') for prop in triggered_props)

    if not load_more_clicked:
        # New selection (or initial page load): start over from the first batch.
        paragraphs = [p for p in split_into_paragraphs(article_content) if p][:initial_paragraphs]
        return paragraphs, initial_paragraphs, '\n\n'.join(paragraphs), None

    next_visible = (visible_paragraphs or initial_paragraphs) + initial_paragraphs
    updated_article_content = get_updated_article_content(ids, next_visible)
    if not updated_article_content:
        # Nothing left to load: keep the display and the counter unchanged.
        return dash.no_update, dash.no_update, dash.no_update, None

    shown_paragraphs = (hidden_paragraphs or []) + split_into_paragraphs(updated_article_content)
    return shown_paragraphs, next_visible, '\n\n'.join(shown_paragraphs), None




@app.callback(
    Output('custom-summary', 'children'),
    Output('custom-summary', 'style'),
    [Input('model-selection-dropdown', 'value'), 
     Input('summary-percentage-slider', 'value')],
)
def show_custom_selection(model, percentage):
    """Echo the chosen model and percentage once both have been selected.

    The dropdown starts with the value '' (not None), so the model is checked
    for truthiness; the previous ``is None`` test let the empty initial value
    through and displayed "Selected model : , ...".

    Returns:
        tuple: ``(text, style)`` for the 'custom-summary' heading, or
        ``dash.no_update`` for both while a choice is still missing.
    """
    if percentage is None or not model:
        return dash.no_update, dash.no_update

    selection_text = 'Selected model :' + str(model) + ' , ' + 'Selected percentage :' + str(percentage)
    return selection_text, {'display': 'block', 'textAlign': 'center', 'color': '#000205'}
 
@app.callback(
    Output('custom-text-length-info', 'children'),
    [Input('custom-text-input', 'value')]
)
def update_text_length_info(text):
    """Show the character count of the text area as "(length/5000)", in red once over 5000."""
    max_length = 5000
    length = len(text) if text else 0

    counter_text = f' ({length}/{max_length})'
    counter_style = {}
    if length > max_length:
        counter_style = {'color': 'red'}

    label = html.Span('Text Length:', style={'font-weight': 'bold'})
    counter = html.Span(counter_text, style=counter_style)
    return html.Div([label, counter])
    
    
@app.callback(
    Output('summary-output', 'children'),
    [Input('get-summary-button', 'n_clicks')],
    [State('model-selection-dropdown', 'value'),
     State('summary-percentage-slider', 'value'),
     State('custom-text-input', 'value')]
)
def update_summary(n_clicks, model, percentage, custom_text):
    """Run the selected summariser on the user's text when "Get Summary" is clicked.

    Returns a red warning for text longer than 5000 characters, a prompt when
    an input is missing, the summary for a known model, and '' otherwise.
    """
    # The length limit is checked first, independently of the click count.
    if custom_text and len(custom_text) > 5000:
        too_long = html.Span("Please provide text with less than 5000 characters.", style={'color': 'red'})
        return html.Div([too_long])

    if not n_clicks > 0:
        return ''

    if not (custom_text and model and percentage):
        return "Please provide all necessary inputs to generate the summary."

    # Dropdown value -> summariser function.
    summarisers = {
        'term frequency inverse document frequency': get_tf_idf_summary,
        'text rank': get_text_rank_summary,
        'latent semantic analysis': get_lsa_summary,
        't5': get_t5_summary,
        'bart': get_bart_summary,
    }
    summarise = summarisers.get(model)
    if summarise is None:
        return ''
    return summarise(custom_text, percentage)

if __name__ == '__main__':
    # suppress_callback_exceptions is an application config option, not a
    # run_server() argument: passed to run_server() it is forwarded to Flask as
    # an unknown server option and never reaches Dash. It is required here
    # because the suggestion RadioItems only exist after a callback has
    # rendered them.
    app.config.suppress_callback_exceptions = True
    app.run_server(host='localhost', port=8080, debug=True)
