# Semantic of News Miner

### Installing and Importing Libraries

In [1]:
# INSTALLING
!pip install transformers 
!pip install datasets
!pip install pynvml
!pip install evaluate 
!pip install sentencepiece
!pip install flair 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.1-py3-none-any.whl (6.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.2-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.2 tokenizers-0.13.2 transformers-4.27.1
Looking in indexes: https://pypi.org/simple, https://us

In [56]:
# IMPORTING
import transformers
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, AutoModelForSeq2SeqLM
import numpy as np
import os
import nltk
import torch
import evaluate
import sys
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from sklearn.model_selection import train_test_split
import torch.cuda as cuda
import gc
from flair.models import SequenceTagger
from flair.data import Sentence
import csv, json

### Utils for GPU

In [8]:
# define utils functions to facilitate gpu 

def check_gpu_availability():
    # Check if CUDA is available
    print(f"Cuda is available: {torch.cuda.is_available()}")

def getting_device(gpu_prefence=True) -> torch.device:
    """
    This function gets the torch device to be used for computations, 
    based on the GPU preference specified by the user.
    """
    
    # If GPU is preferred and available, set device to CUDA
    if gpu_prefence and torch.cuda.is_available():
        device = torch.device('cuda')
    # If GPU is not preferred or not available, set device to CPU
    else: 
        device = torch.device("cpu")
    
    # Print the selected device
    print(f"Selected device: {device}")
    
    # Return the device
    return device

# Define a function to print GPU memory utilization
def print_gpu_utilization():
    # Initialize the PyNVML library
    nvmlInit()
    # Get a handle to the first GPU in the system
    handle = nvmlDeviceGetHandleByIndex(0)
    # Get information about the memory usage on the GPU
    info = nvmlDeviceGetMemoryInfo(handle)
    # Print the GPU memory usage in MB
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

# Define a function to print training summary information
def print_summary(result):
    # Print the total training time in seconds
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    # Print the number of training samples processed per second
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    # Print the GPU memory utilization
    print_gpu_utilization()

def clean_gpu():
    # Get current GPU memory usage
    print("BEFORE CLEANING:")
    print(f"Allocated: {cuda.memory_allocated() / 1024 ** 3:.2f} GB")
    print(f"Cached: {cuda.memory_cached() / 1024 ** 3:.2f} GB")
    print("\n")
    # Free up PyTorch and CUDA memory
    torch.cuda.empty_cache()
    cuda.empty_cache()
    
    # Run garbage collection to free up other memory
    gc.collect()
    
    # Get new GPU memory usage
    print("AFTER CLEANING:")
    print(f"Allocated: {cuda.memory_allocated() / 1024 ** 3:.2f} GB")
    print(f"Cached: {cuda.memory_cached() / 1024 ** 3:.2f} GB")

In [9]:
# CHECK IF GPU IS UP
check_gpu_availability()

Cuda is available: True


In [10]:
# SAVE THE DEVICE WE ARE WORKING WITH
device = getting_device(gpu_prefence=True)

Selected device: cuda


### Reading the data

The dataframe must have a column called "story", this column will be used to mine the knowledge graph. Ideally, each entry is a news.

In [11]:
# ACCESSING THE DATASET 
path2data = "/content/drive/MyDrive/KGML/DWIE_validation.json"
df = pd.read_json(path2data)

In [12]:
df.head(10)

Unnamed: 0,story,Instances Knowledge Graph,Types Knowledge Graph,Subclass Knowledge Graph
0,Rajapaksa 'politically vulnerable' ahead of ea...,International Crisis Group - based_in0 - Unite...,International Crisis Group - type - entity | I...,ngo - subclass_of - org | ethnicity - subclass...
1,France's Peugeot 'back in the race' as profit ...,PSA Peugeot Citroen - based_in0 - France | PSA...,PSA Peugeot Citroen - type - company | PSA Peu...,company - subclass_of - org | gpe - subclass_o...
2,Germans take to the streets to protest anti-Is...,Patriotic Europeans against the Islamization o...,Islamic - type - entity | Islamic - type - mis...,misc - subclass_of - entity | religion - subcl...
3,'Infrastructure can drive up disaster vulnerab...,Deutsche Welle - based_in0 - Germany | Deutsch...,Deutsche Welle - type - entity | Deutsche Well...,media - subclass_of - org | igo - subclass_of ...
4,'US and Germany can't afford chasm of trust' T...,Barack Obama - agent_of - United States | Bara...,United States - type - entity | United States ...,gpe - subclass_of - location | gpe0 - subclass...
5,MoU not enough to protect Indonesian maids Ind...,Memorandum of Understanding - signed_by - Indo...,Memorandum of Understanding - type - entity | ...,misc - subclass_of - entity | treaty - subclas...
6,Bhopal - an endless disaster Thousands died in...,Bhopal - in0 - India | Bhopal - in0-x - Indian...,Bhopal - type - entity | Bhopal - type - gpe |...,gpe - subclass_of - location | gpe2 - subclass...
7,Rampant Bayern dismiss United's slump ahead of...,Manchester United - appears_in - Champions Lea...,Manchester United - type - entity | Manchester...,sport_team - subclass_of - org | competition -...
8,Italy Rejects Easing Assisted Fertility A boyc...,Gianni Baget Bozzo - member_of - Roman Catholi...,Roman Catholic Church - type - entity | Roman ...,religion_org - subclass_of - org | gpe - subcl...
9,Oxfam: G7 emissions have 'savage impact' on Af...,Oxfam - based_in0 - Britain | Kiri Hanks - mem...,Britain - type - entity | Britain - type - gpe...,gpe - subclass_of - location | gpe0 - subclass...


### Mining Type

In [13]:
def mining_type_of_news(df,model_name = "abhishek/autonlp-bbc-news-classification-37229289"):

  # Getting Device
  device = getting_device(gpu_prefence=True)

  # Load the tokenizer and model from Hugging Face
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  # load the model
  model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

  # Load the configuration of the model
  config = AutoConfig.from_pretrained(model_name)
  labels = config.label2id.keys()

  # Create a list to hold the predicted labels
  predicted_type_of_news= []

  # Loop over each story in the DataFrame
  max_length = tokenizer.model_max_length

  #TODO: make it better (bc max input of those models is 512)
  for story in df["story"]:

      # Encode the story text using the tokenizer
      inputs = tokenizer.encode(story, max_length= max_length, return_offsets_mapping=False, stride=0, return_tensors='pt').to(device)
      
      # Make a prediction using the model
      outputs = model(inputs)

      # Get the predicted label
      predicted_label_id = outputs.logits.argmax().item()
      
      predicted_label = list(labels)[predicted_label_id]

      # Add the predicted label to the list
      predicted_type_of_news.append(predicted_label)

  # adding the mined info as column of our data
  df["predicted_label1"] = predicted_type_of_news
  
  return df


In [14]:
df1 = mining_type_of_news(df)
df1.head()

Selected device: cuda


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Unnamed: 0,story,Instances Knowledge Graph,Types Knowledge Graph,Subclass Knowledge Graph,predicted_label1
0,Rajapaksa 'politically vulnerable' ahead of ea...,International Crisis Group - based_in0 - Unite...,International Crisis Group - type - entity | I...,ngo - subclass_of - org | ethnicity - subclass...,business
1,France's Peugeot 'back in the race' as profit ...,PSA Peugeot Citroen - based_in0 - France | PSA...,PSA Peugeot Citroen - type - company | PSA Peu...,company - subclass_of - org | gpe - subclass_o...,business
2,Germans take to the streets to protest anti-Is...,Patriotic Europeans against the Islamization o...,Islamic - type - entity | Islamic - type - mis...,misc - subclass_of - entity | religion - subcl...,politics
3,'Infrastructure can drive up disaster vulnerab...,Deutsche Welle - based_in0 - Germany | Deutsch...,Deutsche Welle - type - entity | Deutsche Well...,media - subclass_of - org | igo - subclass_of ...,tech
4,'US and Germany can't afford chasm of trust' T...,Barack Obama - agent_of - United States | Bara...,United States - type - entity | United States ...,gpe - subclass_of - location | gpe0 - subclass...,politics


In [15]:
clean_gpu()

BEFORE CLEANING:
Allocated: 0.00 GB
Cached: 3.74 GB


AFTER CLEANING:
Allocated: 0.00 GB
Cached: 0.00 GB




### Mining Summary

In [16]:
def mining_summary(df,model_name = "google/pegasus-multi_news"):
  
  # Getting Device
  device = getting_device(gpu_prefence=True)

  # Load the tokenizer and model from Hugging Face
  tokenizer = AutoTokenizer.from_pretrained(model_name)

  # load the model
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

  # Create a list to hold the summaries
  summaries = []

  # Loop through the stories
  for story in df["story"]:

      # Tokenize the story
      inputs = tokenizer.encode(story, return_tensors="pt", max_length=1024, truncation=True).to(device)

      # Generate the summary
      outputs = model.generate(inputs, max_length=30, min_length=1, length_penalty=15.0, num_beams=4, early_stopping=True)
      
      # Decode the summary and add it to the list
      summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

      # append to the summary list
      summaries.append(summary)
    
  # Add the summaries to the dataframe
  df["core description"] = summaries

  # REMOVE IF IT CREATES PROBLEM WITH YOUR PARTICULAR DATASET
  df["core description"] = df["core description"].apply(lambda x: x[1:])

  return df

In [17]:
df2 = mining_summary(df1)
df2.head()

Selected device: cuda


Unnamed: 0,story,Instances Knowledge Graph,Types Knowledge Graph,Subclass Knowledge Graph,predicted_label1,core description
0,Rajapaksa 'politically vulnerable' ahead of ea...,International Crisis Group - based_in0 - Unite...,International Crisis Group - type - entity | I...,ngo - subclass_of - org | ethnicity - subclass...,business,Sri Lanka's President Mahinda Rajapaksa is se...
1,France's Peugeot 'back in the race' as profit ...,PSA Peugeot Citroen - based_in0 - France | PSA...,PSA Peugeot Citroen - type - company | PSA Peu...,company - subclass_of - org | gpe - subclass_o...,business,France's PSA Peugeot Citroen has made a profi...
2,Germans take to the streets to protest anti-Is...,Patriotic Europeans against the Islamization o...,Islamic - type - entity | Islamic - type - mis...,misc - subclass_of - entity | religion - subcl...,politics,Tens of thousands of people took to the stree...
3,'Infrastructure can drive up disaster vulnerab...,Deutsche Welle - based_in0 - Germany | Deutsch...,Deutsche Welle - type - entity | Deutsche Well...,media - subclass_of - org | igo - subclass_of ...,tech,"The UN's annual World Risk Report is out, and..."
4,'US and Germany can't afford chasm of trust' T...,Barack Obama - agent_of - United States | Bara...,United States - type - entity | United States ...,gpe - subclass_of - location | gpe0 - subclass...,politics,The NSA's alleged tapping of Angela Merkel's ...


In [18]:
clean_gpu()

BEFORE CLEANING:
Allocated: 0.00 GB
Cached: 2.72 GB


AFTER CLEANING:
Allocated: 0.00 GB
Cached: 0.00 GB




### Mining Entities

In [12]:
# Getting the right format and unique entities
def extract_named_unique_entities_with_filters(text,ner_name = "ner-ontonotes-fast", pos_name = "pos-fast"):
    # Load the NER model
    taggerNer = SequenceTagger.load(ner_name)
    taggerPos = SequenceTagger.load(pos_name)
    sentence = Sentence(text)
    taggerNer.predict(sentence)
    taggerPos.predict(sentence)
    entities = []
    for entity in sentence.get_spans('ner'):
        entity_text = entity.text
        entity_type = entity.labels[0].value
        tokens = entity.tokens
        pos_label = [token.get_labels()[0].value for token in tokens][0]

        # FILTERING OUT ADJECTIVES 
        if pos_label != "JJ" and pos_label != "JJR":
          if entity_type != "PERCENT" and entity_type != "QUANTITY" and entity_type != "CARDINAL":
            entities.append((entity.text,entity.labels[0].value))  
    entities = tuple(set(entities))
    output = ''
    for entity in entities:
      output += entity[0] + ' - type - ' + entity[1] + ' | '
    output = output[:-3] + '' 
    return output


In [21]:
def mining_entites(df):

  # apply to dataframe
  df['mined_kg_entities'] = df['story'].apply(extract_named_unique_entities_with_filters)

  return df


In [22]:
df3 = mining_entites(df2)
df3.head()

2023-03-15 22:09:45,589 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PRODUCT, I-PRODUCT, E-PRODUCT, B-WORK_OF_ART, I-WORK_OF_ART, E-WORK_OF_ART, B-PERSON, E-PERSON, S-GPE, B-DATE, I-DATE, E-DATE, S-ORDINAL, S-LANGUAGE, I-PERSON, S-EVENT, S-DATE, B-QUANTITY, E-QUANTITY, S-TIME, B-TIME, I-TIME, E-TIME, B-GPE, E-GPE, S-ORG, I-GPE, S-NORP, B-FAC, I-FAC, E-FAC, B-NORP, E-NORP, S-PERCENT, B-ORG, E-ORG, B-LANGUAGE, E-LANGUAGE, I-CARDINAL, I-ORG, S-WORK_OF_ART, I-QUANTITY, B-MONEY
2023-03-15 22:09:46,172 SequenceTagger predicts: Dictionary with 53 tags: <unk>, O, UH, ,, VBD, PRP, VB, PRP$, NN, RB, ., DT, JJ, VBP, VBG, IN, CD, NNS, NNP, WRB, VBZ, WDT, CC, TO, MD, VBN, WP, :, RP, EX, JJR, FW, XX, HYPH, POS, RBR, JJS, PDT, NNPS, RBS, AFX, WP$, -LRB-, -RRB-, ``, '', LS, $, SYM, ADD
2023-03-15 22:09:56,887 SequenceTagger predicts: Dictionary with 76 tags: <unk>, O, B-CARDINAL, E-CARDINAL, S-PERSON, S-CARDINAL, S-PRODUCT, B-PR

In [30]:
clean_gpu()

BEFORE CLEANING:
Allocated: 0.00 GB
Cached: 0.55 GB






AFTER CLEANING:
Allocated: 0.00 GB
Cached: 0.00 GB


### Composing The Knowledge Graph

In [46]:
# UTILS 4 PRINTING THE DATAFRAME INFORMATION

def check_it_function(dataframe, index):

  for j,i in enumerate(dataframe.iloc[index]):

    print(f"{dataframe.columns[j]}","=>",i,"\n")
    
# check_it_function(df3,0)

In [47]:
# EXTRACT TRIPLES

def extract_triples(text):
    # Split the string into individual triples
    triples = [t.strip() for t in text.split('|')]
    # Split each triple into its constituent parts
    triples = [tuple(t.split('- type - ')) for t in triples]
    return triples

In [48]:
def column_extracting_triples(df):
  df['triple_column'] = df['mined_kg_entities'].apply(extract_triples)

  # REMOVING ORDINAL AND LANGUAGE
  df['triple_column'] = [[t for t in row if len(t) > 1 if t[1] not in ['ORDINAL', 'LANGUAGE']] for row in df['triple_column'] ]
  return df

In [49]:
def extract_triples_from_tuples(df):
  new_triples = []
  for row in df['triple_column']:
      new_row = []
      for triple in row:
          if triple[1] == 'WORK_OF_ART':
              new_row.append(('news', 'hasItem', triple[0]))
              new_row.append((triple[0], 'type', 'work of art'))
          elif triple[1] == 'LAW':
              new_row.append(('news', 'hasItem', triple[0]))
              new_row.append((triple[0], 'type', 'law'))
          elif triple[1] == 'FAC':
              new_row.append(('news', 'hasItem', triple[0]))
              new_row.append((triple[0], 'type', 'facility'))
          elif triple[1] == 'MONEY':
              new_row.append(('news', 'hasItem', triple[0]))
              new_row.append((triple[0], 'type', 'money'))
          elif triple[1] == 'PRODUCT':
              new_row.append(('news', 'hasItem', triple[0]))
              new_row.append((triple[0], 'type', 'product'))
          elif triple[1] == 'TIME':
              new_row.append(('news', 'hasTime', triple[0]))
              new_row.append((triple[0], 'type', 'time'))
          elif triple[1] == 'DATE':
              new_row.append(('news', 'hasTime', triple[0]))
              new_row.append((triple[0], 'type', 'time'))
          elif triple[1] == 'LOC':
              new_row.append(('news', 'hasPlace', triple[0]))
              new_row.append((triple[0], 'type', 'place'))
          elif triple[1] == 'EVENT':
              new_row.append(('news', 'hasEvent', triple[0]))
              new_row.append((triple[0], 'type', 'event'))
          elif triple[1] == 'PERSON':
              new_row.append(('news', 'hasActor', triple[0]))
              new_row.append((triple[0], 'type', 'person'))
          elif triple[1] == 'NORP':
              new_row.append(('news', 'hasActor', triple[0]))
              new_row.append((triple[0], 'type', 'Nationalities or Religious or Political Groups'))
          elif triple[1] == 'ORG':
              new_row.append(('news', 'hasActor', triple[0]))
              new_row.append((triple[0], 'type', 'organization'))
          elif triple[1] == 'GEO':
              new_row.append(('news', 'hasActor', triple[0]))
              new_row.append((triple[0], 'type', 'Geo-Political Entity'))
      new_triples.append(new_row)

  df['new_triples'] = new_triples
  df['final_triples'] = ["{" + " | ".join([f"{triple[0]} - {triple[1]} - {triple[2]}" for triple in row]) + "}" for row in df['new_triples']]
  return df

In [50]:
def get_final_kg(df):
  df["final_triples"] = df["final_triples"].apply(lambda x: x[1:-1])
  df["semantic_of_news"] = "{ "  + " news - type - " + df["predicted_label1"] + " | "+  df["final_triples"] + " | news - hasCore - " + "'" + df["core description"] + "'" + " }"
  return df

In [53]:
# TO DELETE
df3 = pd.read_csv("/content/DWIE_train_entities_topic_summary_news.csv")
df3["mined_kg_entities"] = df3["mined_kg_entities"].apply(lambda x: x[1:-1])
df3 =  column_extracting_triples(df3)
df3 = extract_triples_from_tuples(df3)
df3 = get_final_kg(df3)
df3.columns

Index(['Unnamed: 0', 'story', 'Instances Knowledge Graph',
       'Types Knowledge Graph', 'Subclass Knowledge Graph', 'predicted_label1',
       'core description', 'mined_kg_entities', 'triple_column', 'new_triples',
       'final_triples', 'semantic_of_news'],
      dtype='object')

### Getting the Files

In [54]:
def get_csv_with_mined_semantic(df,path):
  df.drop(['Unnamed: 0','predicted_label1',
        'core description', 'mined_kg_entities', 'triple_column', 'new_triples',
        'final_triples'], axis = 1, inplace = True) 
  df.to_csv(path, index=False)

In [55]:
def get_csv_with_mined_semantic_concatenated_kginstances(df,path):
  df.drop(['Unnamed: 0','predicted_label1',
        'core description', 'mined_kg_entities', 'triple_column', 'new_triples',
        'final_triples'], axis = 1, inplace = True) 
  
  df["InstancesKG+NewsKG"] = "{" +df["Instances Knowledge Graph"] + df["semantic_of_news"].apply(lambda x: x[2:-1]) + "}"

  df.to_csv(path, index=False)

In [None]:
def to_json_format(json_filename, csv_filename):
    with open(csv_filename, newline='') as csvfile:
      reader = csv.reader(csvfile)
      columns = next(reader)
    with open(json_filename, "w") as jsonfile:
        jsonfile.write('[')
        for row in csv.DictReader(open(csv_filename), fieldnames=columns):
            json.dump(row, jsonfile, indent = 4)
            jsonfile.write(',')
            jsonfile.write('\n')
        jsonfile.write('{}')    
        jsonfile.write(']')