In [95]:
import pandas as pd
import os
from ast import literal_eval
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoModel, AutoTokenizer, AutoModelForSeq2SeqLM

from typing import List
import numpy as np
import pandas as pd

import torch
from sentence_transformers import SentenceTransformer, util

import re
import random

import nltk
# Fetch the NLTK resources used by this notebook. Each call is a no-op once the
# package is present locally (the captured cell output reports "already up-to-date").
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
# NOTE(review): 'punkt' is presumably what nltk.tokenize.sent_tokenize (used in
# get_summary) relies on -- confirm against the installed NLTK version.
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# NOTE(review): not referenced by any cell visible in this part of the notebook.
wordnet_lemmatizer = WordNetLemmatizer()

from nltk.stem import PorterStemmer  
# NOTE(review): not referenced by any cell visible in this part of the notebook.
porter_stemmer = PorterStemmer()

# Stop-word lookup set read by preprocess_entry. stopwords.words() is called
# without a language argument -- presumably this pools every language shipped
# with the corpus; TODO confirm that this is intended.
stop_words = set(stopwords.words())

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import networkx as nx
import community.community_louvain as community
from ast import literal_eval
import re
from collections import defaultdict, Counter

[nltk_data] Downloading package stopwords to /home/selim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/selim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/selim/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/selim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def flatten(t):
    """Return one flat list holding the items of every sub-iterable of ``t``, in order."""
    flat_items = []
    for sublist in t:
        flat_items.extend(sublist)
    return flat_items

# Folder holding the versioned frameworks dataset, relative to this notebook.
DATA_PATH = os.path.join('..', '..', '..', '..', "data", "frameworks_data", 'data_v0.7.1')

_dataset_csv = os.path.join(DATA_PATH, 'full_dataset_with_translations.csv')
full_df = pd.read_csv(_dataset_csv)

# `sectors` arrives as text; literal_eval turns each cell back into a Python
# object (presumably a list of sector names, given the `tag in x` test later on).
full_df['sectors'] = full_df['sectors'].apply(literal_eval)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [99]:
# Connective phrases for stitching summary sentences together.
# NOTE(review): none of the functions visible in this cell read these three lists.
between_sequence_links = [
    'Furthermore, ', 'Additionally, ', 'Moreover, ', 'Besides, ', 'Aside from that, ', 'Also, ', 'In addition to that, ',
    'On the other hand, ', 'On the other side, '
    ]
final_sent_link = ['Finally, ', 'Ultimately, ']
first_sent_link = ['Firstly, ', 'First of all, ', 'In the first place, ']

# Display name for each project id that the driver loop summarises.
project_id_to_name = {
    1621: 'DFS Libya',
    2225: 'IMMAP/DFS RDC',
    2311: 'IMMAP/DFS Colombia'
}

def preprocess_entry(entry):
    """
    Strip punctuation from ``entry`` and drop its stop words.

    Every character in ``string.punctuation`` is deleted, the result is split on
    whitespace, and words whose lowercase form is in the module-level
    ``stop_words`` set are removed. The surviving words are joined with single spaces.
    """
    import string

    punctuation_free = entry.translate(str.maketrans('', '', string.punctuation))

    kept_words = []
    for word in punctuation_free.split():
        if word.lower() in stop_words:
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Sentence encoders already instantiated in this kernel, keyed by model name.
# Re-running the cell resets the cache.
_SENTENCE_ENCODERS = {}


def get_similarity_matrix(texts, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """
    Return the pairwise cosine-similarity matrix of ``texts``.

    Parameters
    ----------
    texts : sequence of str
        Sentences to embed.
    model_name : str, optional
        SentenceTransformer checkpoint used to embed the sentences.

    Returns
    -------
    The ``len(texts) x len(texts)`` result of ``cosine_similarity`` applied to the
    embeddings, or an empty ``(0, 0)`` array when ``texts`` is empty.
    """
    # Nothing to compare: skip the encoder instead of handing it an empty batch.
    if len(texts) == 0:
        return np.zeros((0, 0))

    # Instantiating the encoder is by far the most expensive step and this
    # function is called once per cluster, so build it only once per kernel.
    encoder = _SENTENCE_ENCODERS.get(model_name)
    if encoder is None:
        encoder = SentenceTransformer(model_name)
        _SENTENCE_ENCODERS[model_name] = encoder

    embeddings = encoder.encode(texts)
    return cosine_similarity(embeddings, embeddings)

def build_graph(cosine_similarity_matrix):
    """
    Build an undirected weighted graph from a similarity matrix.

    One node is created per matrix row, and every pair ``i < j`` is joined by an
    edge whose ``weight`` is ``cosine_similarity_matrix[i, j]``.

    Nodes are registered explicitly before the edges so that a 1x1 matrix still
    yields a one-node graph. Creating nodes only as a side effect of adding edges
    would leave that graph empty and the lone sentence would never be ranked.
    """
    matrix_shape = cosine_similarity_matrix.shape
    n_rows, n_cols = matrix_shape[0], matrix_shape[1]

    graph = nx.Graph()
    # Inserted in index order, so node k always corresponds to row k.
    graph.add_nodes_from(range(n_rows))

    for i in range(n_rows):
        # Upper triangle only: the graph is undirected, so (j, i) is the same edge.
        for j in range(i + 1, n_cols):
            graph.add_edge(i, j, weight=cosine_similarity_matrix[i, j])

    return graph

def get_sentences_to_omit(original_tweets: List[str], threshold: float = 0.993):
    """
    Return the indices of sentences that are near-duplicates of an earlier one.

    For every pair ``i < j`` whose cosine similarity exceeds ``threshold``, the
    later index ``j`` is reported, so the first occurrence is always kept.

    Parameters
    ----------
    original_tweets : list of str
        Sentences to compare with each other.
    threshold : float, optional
        Similarity strictly above which two sentences count as duplicates.

    Returns
    -------
    list of int
        Sorted, duplicate-free indices into ``original_tweets``.
    """
    # Fewer than two sentences cannot contain a duplicate pair, so do not pay
    # for an embedding pass (and do not hand the encoder an empty batch).
    if len(original_tweets) < 2:
        return []

    similarity = np.asarray(get_similarity_matrix(original_tweets))

    # Strict upper triangle keeps exactly the i < j pairs.
    duplicate_pairs = np.argwhere(np.triu(similarity > threshold, k=1))

    # A sentence similar to several earlier ones shows up once per pair; report it once.
    return sorted({int(later) for _, later in duplicate_pairs})

# (tokenizer, model) pairs already loaded in this kernel, keyed by checkpoint name.
# Re-running the cell resets the cache.
_SUMMARIZERS = {}


def summarize_paragraph(entries, model_name="csebuetnlp/mT5_multilingual_XLSum"):
    """
    Summarise a block of text with a seq2seq summarisation checkpoint.

    Parameters
    ----------
    entries : str
        Text to summarise. Runs of whitespace (including newlines) are collapsed
        to single spaces and the input is truncated to 512 tokens.
    model_name : str, optional
        Checkpoint passed to ``from_pretrained`` for both tokenizer and model.

    Returns
    -------
    str
        The decoded summary (at most 84 generated tokens, beam search with 4 beams).
    """
    # Loading the checkpoint dominates the runtime and this function is called
    # once per cluster, so keep the loaded pair around for subsequent calls.
    if model_name not in _SUMMARIZERS:
        _SUMMARIZERS[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    tokenizer, model = _SUMMARIZERS[model_name]

    # `\s` already matches newlines, so one pass normalises all whitespace.
    normalized_text = re.sub(r'\s+', ' ', entries.strip())

    input_ids = tokenizer(
        [normalized_text],
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=512
    )["input_ids"]

    output_ids = model.generate(
        input_ids=input_ids,
        max_length=84,
        no_repeat_ngram_size=2,
        num_beams=4
    )[0]

    return tokenizer.decode(
        output_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )

def get_summary(full_df: pd.DataFrame, tag: str, project_id: int, n_clusters: int = 5):
    """
    main function used for getting summary

    Steps:
      1. keep the excerpts of ``project_id`` whose ``sectors`` contain ``tag``;
      2. rank them with PageRank on the pairwise-similarity graph;
      3. pair each of the ``n_clusters`` best-ranked excerpts with its most
         similar excerpt that is neither top-ranked nor already used;
      4. summarise each pair and drop near-duplicate sentences from the summary.

    Parameters
    ----------
    full_df : pd.DataFrame
        Must provide the columns ``project_id``, ``sectors`` and ``excerpt``.
    tag : str
        Sector to summarise.
    project_id : int
        Project to summarise.
    n_clusters : int, optional
        Number of top-ranked excerpts to expand into summaries.

    Returns
    -------
    str
        One summary per line (each terminated by a newline), or ``''`` when no
        excerpt matches.
    """
    project_df = full_df[full_df.project_id == project_id]
    # astype(bool) keeps the mask boolean even when the project has no rows.
    has_tag = project_df.sectors.apply(lambda sectors: tag in sectors).astype(bool)
    excerpts = project_df[has_tag].excerpt.dropna().tolist()

    # Nothing to summarise: do not hand an empty batch to the encoder.
    if not excerpts:
        return ''

    similarity = get_similarity_matrix(excerpts)
    scores = nx.pagerank(build_graph(similarity))

    # Look scores up by node id rather than relying on dict order; a node the
    # graph does not contain (e.g. a single isolated excerpt) simply ranks at 0.
    ranking = np.array([scores.get(node, 0.0) for node in range(len(excerpts))])
    seed_ids = [int(i) for i in np.argsort(ranking)[::-1][:max(n_clusters, 0)]]
    seed_id_set = set(seed_ids)
    used_ids = set()

    cluster_summaries = []
    for seed_id in seed_ids:
        # Most similar excerpt first; seeds and already-used excerpts are skipped.
        by_similarity = np.argsort(similarity[seed_id, :])[::-1]
        companion = next(
            (
                int(candidate) for candidate in by_similarity
                if int(candidate) not in used_ids and int(candidate) not in seed_id_set
            ),
            None,
        )
        member_ids = [seed_id] if companion is None else [seed_id, companion]
        used_ids.update(member_ids)

        paragraph = ' '.join(excerpts[member] for member in member_ids)
        summary = summarize_paragraph(paragraph)

        # clean summary to omit too similar sentences
        sentences = nltk.tokenize.sent_tokenize(summary)
        # A duplicate needs at least two sentences; otherwise skip the embedding pass.
        if len(sentences) > 1:
            ids_to_omit = set(get_sentences_to_omit(sentences))
        else:
            ids_to_omit = set()
        kept_sentences = [
            sentence for position, sentence in enumerate(sentences)
            if position not in ids_to_omit
        ]
        cluster_summaries.append(' '.join(kept_sentences))

    return ''.join(cluster_summary + '\n' for cluster_summary in cluster_summaries)

In [100]:
columns = ['Food Security', 'Protection']

# Print one summary per (project, sector tag) combination.
for one_project_id, project_name in project_id_to_name.items():
    print(f'FOR THE PROJECT {project_name}')
    for tag in columns:
        print(tag)
        partitions = get_summary(full_df, tag, one_project_id)
        # Summary followed by one blank line.
        print(partitions, end='\n\n')

FOR THE PROJECT DFS Libya
Food Security


Downloading: 100%|██████████| 730/730 [00:00<00:00, 67.1kB/s]
Downloading: 100%|██████████| 4.31M/4.31M [00:00<00:00, 9.54MB/s]
Downloading: 100%|██████████| 65.0/65.0 [00:00<00:00, 20.5kB/s]
Downloading: 100%|██████████| 375/375 [00:00<00:00, 144kB/s]
Downloading:  62%|██████▏   | 1.45G/2.33G [01:37<00:56, 15.7MB/s]

KeyboardInterrupt: 

Downloading:  62%|██████▏   | 1.45G/2.33G [01:49<00:56, 15.7MB/s]