In [3]:
import pandas as pd
import os
from ast import literal_eval
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration


In [90]:
def flatten(t):
    """Concatenate the items of every sub-iterable of ``t`` into one flat list (one level deep)."""
    flat_items = []
    for sublist in t:
        for item in sublist:
            flat_items.append(item)
    return flat_items

# Folder holding this version of the frameworks dataset, relative to the notebook.
DATA_PATH = os.path.join('..', '..', '..', '..', "data", "frameworks_data", 'data_v0.7.1')

# Load the full dataset (with translations) from the CSV file.
full_df = pd.read_csv(
    os.path.join(DATA_PATH, 'full_dataset_with_translations.csv')
)
# 'sectors' is read back as text: parse each cell into a Python object with literal_eval.
full_df['sectors'] = full_df['sectors'].apply(literal_eval)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [28]:
# Keep only the entries whose project_id is 1621
# (presumably the Libya project, judging by the variable name -- TODO confirm).
libya_entries = full_df[full_df['project_id'] == 1621]

# Among those, keep the entries tagged with the 'Food Security' sector and
# retain only the id and text columns needed for summarisation.
libya_food_entries = libya_entries[
    libya_entries['sectors'].apply(lambda sectors: 'Food Security' in sectors)
][['entry_id', 'excerpt']]

In [4]:
# Load the pretrained "t5-small" tokenizer and seq2seq model.
# Both are module-level globals that get_summary() reads when generating text.
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [11]:
from typing import List
import numpy as np
import pandas as pd

import re
import json

import nltk
# Fetch the NLTK resources used below (stop-word lists and WordNet data).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by clean_tweets().
wordnet_lemmatizer = WordNetLemmatizer()

from nltk.stem import PorterStemmer  
# Shared stemmer instance used by clean_tweets().
porter_stemmer = PorterStemmer()

# NOTE(review): no language is passed to stopwords.words(), so this presumably
# collects the stop words of every language shipped with the corpus -- confirm
# that filtering against all languages (not only English) is intended.
stop_words = set(stopwords.words())

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

import networkx as nx
import community.community_louvain as community
from ast import literal_eval
import re

[nltk_data] Downloading package stopwords to /home/selim/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/selim/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/selim/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [42]:
from collections import defaultdict

In [87]:
def clean_tweets(sentence):
    """
    Normalise one excerpt into a space-separated string of stemmed tokens.

    The text is split on whitespace. Each token has its punctuation removed
    and is lower-cased; empty tokens and stop words are dropped; the remaining
    tokens are lemmatized as verbs and then Porter-stemmed.

    Args:
        sentence: text to clean. Non-string values are converted with ``str``.

    Returns:
        str: the cleaned tokens joined by single spaces (possibly empty).
    """
    if not isinstance(sentence, str):
        sentence = str(sentence)

    new_words = []
    for word in sentence.split():

        # remove punctuation and lower-case the token. Lower-casing happens
        # *before* lemmatization because WordNet lookups are case-sensitive:
        # a capitalised token (e.g. at the start of a sentence) would otherwise
        # be returned unlemmatized and end up with a different stem than its
        # lower-case form.
        new_word = re.sub(r'[^\w\s]', '', word).lower()

        # skip tokens that were only punctuation, and stop words
        if not new_word or new_word in stop_words:
            continue

        # lemmatize (as a verb)
        new_word = wordnet_lemmatizer.lemmatize(new_word, pos="v")

        # stem
        new_word = porter_stemmer.stem(new_word)

        new_words.append(new_word)

    return ' '.join(new_words)

def get_similarity_matrix(original_tweets):
    """
    Compute the pairwise TF-IDF similarity matrix of a collection of entries.

    Each entry is cleaned with ``clean_tweets`` and vectorised with a TF-IDF
    model over word 1- to 3-grams; the matrix of dot products between all
    TF-IDF rows is returned.

    Args:
        original_tweets: iterable of raw entry texts.

    Returns:
        Square matrix whose ``[i, j]`` element is the similarity between
        entry ``i`` and entry ``j``.
    """
    cleaned_tweets = [clean_tweets(one_tweet) for one_tweet in original_tweets]

    # define and use tf-idf transformation.
    # min_df=1 keeps every term, exactly like the former min_df=0 (a term of the
    # vocabulary always occurs in at least one document), but recent
    # scikit-learn versions reject an integer min_df of 0 at validation time.
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=1)
    tf_idf = tf.fit_transform(cleaned_tweets)

    # TF-IDF rows are L2-normalised by the vectorizer's default settings, so the
    # linear kernel (dot product) is the cosine similarity.
    return linear_kernel(tf_idf, tf_idf)

def build_graph(cosine_similarity_matrix):
    """
    Build an undirected weighted graph from a similarity matrix.

    Node ``i`` stands for row/column ``i`` of the matrix. Every pair ``i < j``
    is linked by an edge whose ``weight`` is ``cosine_similarity_matrix[i, j]``
    (zero similarities included).

    Args:
        cosine_similarity_matrix: 2-D array-like exposing ``.shape`` and
            ``[i, j]`` indexing (expected to be square and symmetric).

    Returns:
        networkx.Graph: the weighted similarity graph.
    """
    graph_one_lang = nx.Graph()
    n_rows, n_cols = cosine_similarity_matrix.shape

    # Register the nodes explicitly: with a 1x1 matrix there is no pair i < j,
    # so relying on add_edge alone would return an empty graph and lose the
    # single entry.
    graph_one_lang.add_nodes_from(range(min(n_rows, n_cols)))

    # The graph is undirected, so each pair is visited once (upper triangle)
    # and a single add_edge call is enough.
    for i in range(n_rows):
        for j in range(i + 1, n_cols):
            graph_one_lang.add_edge(i, j, weight=cosine_similarity_matrix[i, j])

    return graph_one_lang

# Connector phrases available for linking consecutive summary sequences.
# ('Additionally,' was previously misspelled 'Aditionally,'.)
between_sequence_links = ['Furthermore,', 'Additionally,', 'Moreover,', 'Besides,', 'On top of that,']

def get_summary(
    df: pd.DataFrame,
    similarity_threshold: float = 0.6,
    n_top_sentences: int = 4,
    max_summary_length: int = 256,
    random_state=None,
):
    """
    Summarise a set of excerpts, one summary line per topical cluster.

    Steps:
      1. drop near-duplicate excerpts (pairwise similarity above
         ``similarity_threshold``; the later entry of each pair is dropped);
      2. cluster the remaining excerpts with Louvain community detection on
         their similarity graph;
      3. inside each cluster, rank excerpts with PageRank and keep the
         ``n_top_sentences`` best ones;
      4. summarise the kept excerpts with the module-level T5 ``tokenizer`` /
         ``model``.

    Args:
        df: frame with an ``excerpt`` column holding the texts.
        similarity_threshold: similarity above which two excerpts are
            considered duplicates.
        n_top_sentences: maximum number of excerpts of a cluster fed to the
            summarisation model.
        max_summary_length: ``max_length`` passed to ``model.generate``.
        random_state: forwarded to ``community.best_partition``; set it to get
            a reproducible clustering (``None`` keeps the library default).

    Returns:
        str: the cluster summaries joined by newlines ('' for an empty frame).
    """
    original_tweets = df.excerpt.tolist()
    n_entries = len(original_tweets)

    # nothing to summarise (the vectorizer cannot be fitted on an empty corpus)
    if n_entries == 0:
        return ''

    # get cosine similarity matrix
    cosine_similarity_matrix = get_similarity_matrix(original_tweets)

    # omit too similar entries: for each too-similar pair keep the first entry.
    # A set removes duplicate indices and makes the membership test O(1).
    too_similar_ids = np.argwhere(cosine_similarity_matrix > similarity_threshold)
    sentences_to_omit = {
        int(second) for first, second in too_similar_ids if first < second
    }

    # number of entries dropped as near-duplicates
    print(len(sentences_to_omit))

    # index 0 can never be the later entry of a pair, so this list is non-empty
    new_excerpts = [
        original_tweets[i] for i in range(n_entries) if i not in sentences_to_omit
    ]

    # create graph from the similarity matrix of the remaining entries
    cosine_similarity_matrix = get_similarity_matrix(new_excerpts)
    graph_one_lang = build_graph(cosine_similarity_matrix)
    # make sure every remaining excerpt is a node, even when there is a single
    # one (no edge can be created in that case)
    graph_one_lang.add_nodes_from(range(len(new_excerpts)))

    # louvain community
    partition = community.best_partition(graph_one_lang, random_state=random_state)

    # res: dict where key is the group of the sentence and value is the list of
    # excerpt positions of that group
    res = defaultdict(list)
    for key, val in sorted(partition.items()):
        res[val].append(key)

    final_summary = []
    for member_positions in res.values():
        sentences = [new_excerpts[i] for i in member_positions]

        if len(sentences) == 1:
            # a lone excerpt needs no ranking (and has no edge to rank on)
            top_sentences = sentences
        else:
            similarity_one_item = get_similarity_matrix(sentences)
            graph_one_part = build_graph(similarity_one_item)
            scores = nx.pagerank(graph_one_part)

            # Best-scored sentences first. The (score, sentence) pairs are kept
            # as a list: keying a dict by score would silently drop sentences
            # whose scores are tied.
            ranked = sorted(
                ((scores[i], s) for i, s in enumerate(sentences)), reverse=True
            )
            top_sentences = [s for _, s in ranked[:n_top_sentences]]

        ranked_sentence = ' '.join(top_sentences)

        # T5 expects a task prefix; the prompt is passed without truncation
        changed_text = f'summarize: {ranked_sentence}'
        input_ids = tokenizer(changed_text, return_tensors="pt", truncation=False).input_ids
        outputs = model.generate(input_ids, max_length=max_summary_length)
        summarized_entries = tokenizer.decode(outputs[0], skip_special_tokens=True)

        final_summary.append(summarized_entries)

    return '\n'.join(final_summary)

In [91]:
# Summarise the selected food-security entries.
# Note: despite its name, `partitions` holds the newline-joined summary string
# returned by get_summary(), not the cluster assignments.
partitions = get_summary(libya_food_entries)
print(partitions)

4
5
more than 3,200 migrants estimated to be in detention centres. of particular concern are those held in detention centres. of particular concern are those held in detention centres.
65% of migrants surveyed had to resort to coping strategies due to lack of food or means to buy food. one in five reported having to work in exchange for food (20%) and/or having to reduce expenditure on essential non-food items (19%).
food, shelter, health, non-food items and health assistance are the most identified needs by affected communities in eastern Libya. food, shelter, health, non-food items and WASH are the most identified needs by affected communities in eastern Libya.
a third of migrants are considered marginally food insecure (34%) and at risk of food insecurity. the ‘marginally food secure’ households have managed to meet the minimum food consumption through adopting livelihood coping strategies.
key informants in all municipalities (except tobruk) reported that residents were negatively 