In [1]:
import pandas as pd
import os
from ast import literal_eval
from transformers import T5Tokenizer, T5ForConditionalGeneration

from typing import List
import numpy as np
import pandas as pd

import re
import random

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

from nltk.stem import PorterStemmer  
porter_stemmer = PorterStemmer()

stop_words = set(stopwords.words())

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import networkx as nx
import community.community_louvain as community
from ast import literal_eval
import re
from collections import defaultdict, Counter

[nltk_data] Downloading package stopwords to /home/selim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/selim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/selim/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/selim/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def flatten(t):
    return [item for sublist in t for item in sublist]

DATA_PATH = os.path.join(
    '..', '..', '..', '..', "data", "frameworks_data", 'data_v0.7.1'
)

full_df = pd.read_csv(os.path.join(DATA_PATH, 'full_dataset_with_translations.csv'))
full_df['sectors'] = full_df.sectors.apply(literal_eval)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [3]:
libya_entries = full_df[full_df.project_id==1621]
Counter(flatten(libya_entries['sectors'])).most_common(5)

[('Protection', 387),
 ('Health', 377),
 ('Cross', 368),
 ('Food Security', 163),
 ('Livelihoods', 136)]

In [5]:
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [29]:
between_sequence_links = [
    'Furthermore, ', 'Aditionally, ', 'Moreover, ', 'Besides, ', 'Aside from that, ', 'Also, ', 'In addition to that, ',
    'On the other hand, ', 'On the other side, '
    ]
final_sent_link = ['Finally, ', 'Ultimately, ']
first_sent_link = ['Firstly, ', 'First of all, ', 'In the first place, ']

def clean_tweets(sentence):

    if type(sentence) is not str:
        sentence = str(sentence)

    new_words = []
    words = sentence.split()
    for word in words:
        
        #lower and remove punctuation
        new_word = re.sub(r'[^\w\s]', '', (word))

        #keep clean words and remove hyperlinks
        word_not_nothing = new_word != ''
        word_not_stop_word = new_word.lower() not in stop_words
        #word_not_digit = ~new_word.isdigit()

        if word_not_nothing and word_not_stop_word:

            #lemmatize
            new_word =  wordnet_lemmatizer.lemmatize(new_word, pos="v")  

            #stem
            new_word = porter_stemmer.stem(new_word)

            new_words.append(new_word)
            
    return ' '.join(new_words)

def get_similarity_matrix(original_tweets):
    """
    function to get similarity matrix from entries
    """
    cleaned_tweet = [clean_tweets(one_tweet) for one_tweet in original_tweets] 

    #define and use tf-idf transformation
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0)
    tf_idf = tf.fit_transform(cleaned_tweet)

    # get cosine similarity matrix
    cosine_similarity_matrix = linear_kernel(tf_idf, tf_idf)
    return cosine_similarity_matrix

def build_graph(cosine_similarity_matrix):
    """
    function to build graoh from similarity matrix
    """
    graph_one_lang = nx.Graph()
    matrix_shape = cosine_similarity_matrix.shape
    for i in range (matrix_shape[0]):
        for j in range (matrix_shape[1]):
            #do only once
            if i < j:
                sim = cosine_similarity_matrix[i, j]
                graph_one_lang.add_edge(i, j, weight=sim)
                graph_one_lang.add_edge(j, i, weight=sim)

    return graph_one_lang

def get_sentences_to_omit(original_tweets: List[str]):
    cosine_similarity_matrix = get_similarity_matrix(original_tweets)
    too_similar_ids = np.argwhere(cosine_similarity_matrix > 0.6)
    sentences_to_omit = []
    for pair_ids in too_similar_ids:
        if pair_ids[0]<pair_ids[1]:
            sentences_to_omit.append(pair_ids[1])

    return sentences_to_omit

def get_summary(df: pd.DataFrame, tag: str):

    """
    main function used for getting summary
    """

    df_one_sector = df[df.sectors.apply(lambda x: tag in x)][['entry_id', 'excerpt']]

    final_summary = []

    original_tweets = df_one_sector.excerpt.tolist() 
    tweet_ids = df_one_sector.entry_id.tolist() 
    n_entries = len(original_tweets)    

    # omit too similar entries
    
    sentence_ids_to_omit = get_sentences_to_omit(original_tweets)

    new_excerpts = [original_tweets[i] for i in range (n_entries) if i not in sentence_ids_to_omit]
    new_ids = [tweet_ids[i] for i in range (n_entries) if i not in sentence_ids_to_omit] 
    # get cosine similarity matrix
    cosine_similarity_matrix = get_similarity_matrix(new_excerpts)

    # create graph from similarity matrix
    graph_one_lang = build_graph(cosine_similarity_matrix)

    # louvain community
    partition = community.best_partition(graph_one_lang)

    ids = []
    tweets = []
    partitions = []
    for key, val in partition.items():
        ids.append(new_ids[key])
        tweets.append(new_excerpts[key])
        partitions.append(val)

    df_partition = pd.DataFrame(
        list(zip(
            ids, 
            tweets,
            partitions
            )),
        
        columns=['entry_id', 'excerpt', 'partition']
    ).sort_values(by='partition', inplace=False)

    #res: dict where key is the group of the sentence and value is a list of ids of that group
    res = defaultdict(list)
    for key, val in sorted(partition.items()):
        res[val].append(key)

    for key, val in res.items():
        df_one_part = df_partition[df_partition.partition==key]
        sentences = df_one_part.excerpt
        similarity_one_item = get_similarity_matrix(sentences)

        graph_one_lang = build_graph(similarity_one_item)

        scores = nx.pagerank(graph_one_lang)

        ranked_sentence = ' '.join(
            list(
                dict(sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)[:3]).values()
                )
        )

        changed_text  = f'summarize: {ranked_sentence}'
        input_ids = tokenizer(changed_text, return_tensors="pt", truncation=False).input_ids
        outputs = model.generate(input_ids, max_length=256)
        summarized_entries = tokenizer.decode(outputs[0], skip_special_tokens=True)

        #clean summary to omit too similar senteneces
        summarized_entries_as_sentences = nltk.tokenize.sent_tokenize(summarized_entries)
        sentence_ids_to_omit = get_sentences_to_omit(summarized_entries_as_sentences)
        text_one_cluster = [
            summarized_entries_as_sentences[i] for i in range (len(summarized_entries_as_sentences))\
                if i not in sentence_ids_to_omit
            ]
        clean_summary_one_cluster = ' '.join(
            [
            text_one_cluster[i].title() if i!=0 else text_one_cluster[0] for i in range (len(text_one_cluster))
            ]
            
        )

        final_summary.append(clean_summary_one_cluster)

    n_clusters = len(final_summary)
    final_summary_str = ''

    first_sent_link_tmp = first_sent_link.copy()
    final_sent_link_tmp = final_sent_link.copy()
    between_sequence_links_tmp = between_sequence_links.copy()

    for i in range (n_clusters):
        
        if i == 0:
            link_word = random.choice(first_sent_link_tmp)
            first_sent_link_tmp.remove(link_word)
        elif i == (n_clusters-1):
            link_word = random.choice(final_sent_link_tmp)
            final_sent_link_tmp.remove(link_word)
        else:
            link_word = random.choice(between_sequence_links_tmp)
            between_sequence_links_tmp.remove(link_word)

        final_summary_str = final_summary_str + link_word + final_summary[i] + '\n'
    
    return final_summary_str

In [30]:
columns = ['Livelihoods', 'Food Security', 'Health', 'Protection']
for tag in columns:
    print(tag)
    partitions = get_summary(libya_entries, tag)
    print(partitions)
    print('')

Livelihoods
First of all, two-thirds of migrants surveyed had to resort to coping strategy in 30 days. They Had To Resort To Stress, Crisis Or Emergency Livelihood Coping Strategy. Most Refugees And Migrants In Libya Are Unable To Find Work Necessary To Support Themselves And Meet Their Food Needs.
Besides, a number of communities in the region have been affected by the loss of income. The Loss Of Income Continues To Impact People’S Ability To Cover Basic Needs And Pay Rent.
Furthermore, key informants in all municipalities reported that residents were negatively affected. The Negative Impact On Residents Ranged From Difficulties Faced In Accessing Work And Livelihood Opportunities.
Ultimately, the lack of liquidity to buy necessary inputs, including medicine, fodder and live animals were mentioned as the main problems affecting livestock production. The Lack Of Liquidity To Buy Necessary Inputs, Including Medicine, Fodder And Live Animals Was Experienced By Most Livestock Herders Prio