# Semantic of News Miner

### Installing and Importing Libraries

In [1]:
# INSTALLING
# Third-party packages used by this notebook, installed with pip through the shell.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones shown in the recorded output — consider pinning.
!pip install inltk
!pip install transformers 
!pip install datasets
!pip install pynvml
!pip install evaluate 
!pip install sentencepiece
!pip install flair 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting inltk
  Downloading inltk-0.9-py3-none-any.whl (13 kB)
Collecting nvidia-ml-py3
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastai==1.0.57
  Downloading fastai-1.0.57-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.3/233.3 KB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp>=3.5.4
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asyn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m66.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m98.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

In [2]:
# IMPORTING
import transformers
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, AutoModelForSeq2SeqLM
import numpy as np
import os
import nltk
import torch
import evaluate
import sys
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo
from sklearn.model_selection import train_test_split
import torch.cuda as cuda
import gc
from flair.models import SequenceTagger
from flair.data import Sentence
import csv, json
from transformers import pipeline

### Utils for GPU

In [3]:
# define utils functions to facilitate gpu 

def check_gpu_availability():
    """Print whether PyTorch reports CUDA as available."""
    cuda_ready = torch.cuda.is_available()
    print(f"Cuda is available: {cuda_ready}")

def getting_device(gpu_prefence=True) -> torch.device:
    """
    Choose the torch device used for computations.

    The CUDA device is returned when `gpu_prefence` is truthy and CUDA is
    available; in every other case the CPU device is returned. The chosen
    device is also printed.
    """
    use_cuda = gpu_prefence and torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else "cpu")

    print(f"Selected device: {device}")
    return device

# Report how much memory is currently in use on the first GPU
def print_gpu_utilization():
    """Print the used memory of GPU index 0, in MB, as reported through pynvml."""
    nvmlInit()
    # Index 0 selects the first GPU in the system
    first_gpu = nvmlDeviceGetHandleByIndex(0)
    used_bytes = nvmlDeviceGetMemoryInfo(first_gpu).used
    used_mb = used_bytes//1024**2
    print(f"GPU memory occupied: {used_mb} MB.")

# Summarise a finished training run
def print_summary(result):
    """
    Print runtime, throughput and GPU memory usage for a training result.

    `result.metrics` is indexed with the keys 'train_runtime' and
    'train_samples_per_second'; both values are printed with two decimals.
    """
    report_lines = (
        ("Time", "train_runtime"),
        ("Samples/second", "train_samples_per_second"),
    )
    for label, metric_key in report_lines:
        print(f"{label}: {result.metrics[metric_key]:.2f}")
    # Finish with the current GPU memory usage
    print_gpu_utilization()

def clean_gpu():
    """
    Release unused GPU memory held by PyTorch and print usage before and after.

    Prints the allocated and cached (reserved) CUDA memory in GB, runs the
    Python garbage collector, empties PyTorch's CUDA cache, then prints the
    two figures again. Returns nothing.
    """
    def _report(title):
        # Print one usage snapshot under the given heading
        print(title)
        print(f"Allocated: {cuda.memory_allocated() / 1024 ** 3:.2f} GB")
        # memory_reserved() is the current name of the deprecated memory_cached()
        print(f"Cached: {cuda.memory_reserved() / 1024 ** 3:.2f} GB")

    _report("BEFORE CLEANING:")
    print("\n")

    # Collect garbage first: objects freed here hand their blocks back to the
    # caching allocator, and only then can empty_cache() release those blocks.
    gc.collect()
    cuda.empty_cache()

    _report("AFTER CLEANING:")

In [4]:
# CHECK IF GPU IS UP
# Prints "Cuda is available: True/False"; the function itself returns nothing.
check_gpu_availability()

Cuda is available: True


In [5]:
# SAVE THE DEVICE WE ARE WORKING WITH
# getting_device falls back to the CPU when CUDA is not available, even with gpu_prefence=True.
device = getting_device(gpu_prefence=True)

Selected device: cuda


### Reading the data

In [6]:
# RUN IFF THE FILES ARE ON DRIVE
# Colab-only step: mounts Google Drive under /content/drive.
# NOTE(review): this import lives outside the main import cell because it only exists on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


The dataframe must have a column called "story"; this column is used to mine the knowledge graph. Ideally, each entry is a single news article.

In [7]:
# ACCESSING THE DATASET
# NOTE(review): hard-coded Google Drive path — change it to wherever your copy of the JSON file lives.
path2data = "/content/drive/MyDrive/GITHUB: KGNarrative/last/WebNLG/57_triples/train_57.json"
# Load the JSON file into a DataFrame
df = pd.read_json(path2data)

In [None]:
# Preview the first 10 rows of the loaded data
df.head(10)

Unnamed: 0,story,Types_KG,Instances_KG,Subclasses_KG
0,Estádio Municipal Coaracy da Mata Fonseca is t...,Arapiraca | type | place - Campeonato_Brasilei...,Estádio_Municipal_Coaracy_da_Mata_Fonseca | lo...,place | subclass | None - soccer league | subc...
1,"Nie Haisheng, born on October 13, 1964, worked...",Nie_Haisheng | type | person - Fighter_pilot |...,Nie_Haisheng | birthDate | 1964-10-13 - Nie_Ha...,person | subclass | animal - None | subclass |...
2,MotorSport Vision is located in the city of Fa...,MotorSport_Vision | type | company - Fawkham |...,MotorSport_Vision | locationCity | Fawkham,place | subclass | None - company | subclass |...
3,185 centimetre tall Aleksandr Prudnikov played...,Otkrytiye_Arena | type | stadium - Aleksandr_P...,"Aleksandr_Prudnikov | Person/height | ""185.0""^...",person | subclass | animal - agent | subclass ...
4,Ciudad Ayala is a city with population density...,City_Manager | type | person function - Counci...,Ciudad_Ayala | populationMetro | 1777539 - Ciu...,place | subclass | None - administrative regio...
5,The album 1969: The Velvet Underground Live is...,Squeeze_(The_Velvet_Underground_album) | type ...,Bootleg_Series_Volume_1:_The_Quine_Tapes | pre...,work | subclass | None
6,Ciudad Ayala in Mexico with population density...,Governator | type | None - Mexico | type | per...,Ciudad_Ayala | populationDensity | 1604.0 - Ci...,person | subclass | animal - place | subclass ...
7,"Olga Bondareva died on December 9, 1991.",1991-12-09 | type | None - Olga_Bondareva | ty...,Olga_Bondareva | deathDate | 1991-12-01 - Olga...,person | subclass | animal - None | subclass |...
8,"Saint Petersburg was founded on May 27, 1703 a...",1439.0 | type | None - Saint_Petersburg | type...,Saint_Petersburg | foundingDate | 1703-05-27 -...,place | subclass | None - None | subclass | None
9,The population density of Ciudad Ayala is 1604.0.,Ciudad_Ayala | type | place - 1604.0 | type | ...,Ciudad_Ayala | populationDensity | 1604.0 - Ci...,place | subclass | None - None | subclass | None


### Mining Type

In [None]:
def mining_type_of_news(df,model_name ='facebook/bart-large-mnli',candidate_labels=None):
  """
  Label every story with a news category using zero-shot classification.

  Args:
    df: DataFrame with a "story" column holding the texts to classify.
    model_name: Hugging Face model id loaded into the zero-shot pipeline.
    candidate_labels: categories to choose from. When None, the default
      ["Tech", "Entertainment", "Sport", "Business", "Politics"] is used.

  Returns:
    The same `df` object, modified in place: a "predicted_label1" column is
    added with the first label the classifier returns for each story.
  """
  if candidate_labels is None:
    candidate_labels = ["Tech", "Entertainment", "Sport", "Business", "Politics"]

  # Use GPU 0 when CUDA is available, otherwise -1 (CPU). The device index was
  # previously hard-coded to 0, which fails on a machine without a GPU.
  device_index = 0 if torch.cuda.is_available() else -1
  classifier = pipeline("zero-shot-classification", model = model_name, device = device_index)

  predicted_type_of_news = []
  for story in df["story"]:
    prediction = classifier(story, candidate_labels)
    # Keep the first entry of the returned "labels" list (the main label)
    predicted_type_of_news.append(prediction["labels"][0])

  # Store the mined category as a new column of the caller's frame
  df["predicted_label1"] = predicted_type_of_news

  return df


In [None]:
# Adds the "predicted_label1" column to df in place; df1 is the same object as df.
df1 = mining_type_of_news(df)




In [None]:
# Number of stories whose predicted type is anything other than "Tech"
(df["predicted_label1"] != "Tech").sum()

359

In [None]:
# Print GPU memory usage, empty PyTorch's CUDA cache and run garbage collection
clean_gpu()

BEFORE CLEANING:
Allocated: 0.01 GB
Cached: 1.57 GB


AFTER CLEANING:
Allocated: 0.01 GB
Cached: 0.02 GB




### Mining Summary

In [None]:
def mining_summary(df,model_name = "deep-learning-analytics/automatic-title-generation"): # old one =>"google/pegasus-multi_news"
  """
  Generate a short text for every story with a seq2seq model.

  The tokenizer and model named by `model_name` are loaded from Hugging Face,
  the model is moved to the device chosen by `getting_device`, and each entry
  of df["story"] is encoded, passed through `generate` and decoded.
  The results are stored in a new "core description" column.

  Returns the same `df` object, modified in place.
  """
  device = getting_device(gpu_prefence=True)
  tokenizer = AutoTokenizer.from_pretrained(model_name)
  model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

  def _summarise(story):
    # Encode one story (inputs longer than 1024 tokens are truncated)
    token_ids = tokenizer.encode(story, return_tensors="pt", max_length=1024, truncation=True).to(device)
    # Beam search, output limited to 30 tokens
    generated = model.generate(token_ids, max_length=30, min_length=1, length_penalty=15.0, num_beams=4, early_stopping=True)
    # Only the first returned sequence is decoded
    return tokenizer.decode(generated[0], skip_special_tokens=True)

  df["core description"] = [_summarise(story) for story in df["story"]]

  # Optional post-processing, disabled: drop the first character of every summary.
  # Leave it disabled if it creates problems with your particular dataset.
  # df["core description"] = df["core description"].apply(lambda x: x[1:])

  return df

In [None]:
# Adds the "core description" column in place; df2 is the same object as df1.
df2 = mining_summary(df1)
df2.head()

Selected device: cuda


Unnamed: 0,story,Types_KG,Instances_KG,Subclasses_KG,predicted_label1,core description
0,Estádio Municipal Coaracy da Mata Fonseca is t...,Arapiraca | type | place - Campeonato_Brasilei...,Estádio_Municipal_Coaracy_da_Mata_Fonseca | lo...,place | subclass | None - soccer league | subc...,Sport,The ground of Agremiaço Sportiva Arapiraquense...
1,"Nie Haisheng, born on October 13, 1964, worked...",Nie_Haisheng | type | person - Fighter_pilot |...,Nie_Haisheng | birthDate | 1964-10-13 - Nie_Ha...,person | subclass | animal - None | subclass |...,Tech,"Nie Haisheng, a Vietnam War veteran, died on O..."
2,MotorSport Vision is located in the city of Fa...,MotorSport_Vision | type | company - Fawkham |...,MotorSport_Vision | locationCity | Fawkham,place | subclass | None - company | subclass |...,Sport,MotorSport Vision in Fawkham.
3,185 centimetre tall Aleksandr Prudnikov played...,Otkrytiye_Arena | type | stadium - Aleksandr_P...,"Aleksandr_Prudnikov | Person/height | ""185.0""^...",person | subclass | animal - agent | subclass ...,Sport,"Aleksandr Prudnikov, 185 cm tall, played for S..."
4,Ciudad Ayala is a city with population density...,City_Manager | type | person function - Counci...,Ciudad_Ayala | populationMetro | 1777539 - Ciu...,place | subclass | None - administrative regio...,Tech,Ciudad Ayala is a city in the United Arab Emir...


In [None]:
# Print GPU memory usage, empty PyTorch's CUDA cache and run garbage collection
clean_gpu()

BEFORE CLEANING:
Allocated: 0.01 GB
Cached: 1.01 GB






AFTER CLEANING:
Allocated: 0.01 GB
Cached: 0.02 GB


### Mining Entities

In [8]:
# Getting the right format and unique entities
# Loaded flair taggers keyed by model name, so each model is loaded once per session
# instead of once per call (the function is applied to every row of the dataframe).
_TAGGER_CACHE = {}


def _load_tagger(name):
    """Return the flair SequenceTagger for `name`, loading it on first use only."""
    if name not in _TAGGER_CACHE:
        _TAGGER_CACHE[name] = SequenceTagger.load(name)
    return _TAGGER_CACHE[name]


def extract_named_unique_entities_with_filters(text,ner_name = "ner-ontonotes-fast", pos_name = "pos-fast"):
    """
    Mine the unique named entities of `text` and serialise them as one string.

    Args:
        text: the story to analyse.
        ner_name: name of the flair NER model to load.
        pos_name: name of the flair POS model to load.

    Returns:
        A string of the form "entity - type - LABEL | entity - type - LABEL".
        Each (entity, label) pair appears once, in order of first appearance.
        The string is empty when nothing is kept.

    Entities are skipped when the tag of their first token is "JJ" or "JJR"
    (adjectives) or when their label is PERCENT, QUANTITY or CARDINAL.
    """
    taggerNer = _load_tagger(ner_name)
    taggerPos = _load_tagger(pos_name)

    sentence = Sentence(text)
    taggerNer.predict(sentence)
    taggerPos.predict(sentence)

    excluded_pos = ("JJ", "JJR")
    excluded_types = ("PERCENT", "QUANTITY", "CARDINAL")

    # A dict is used as an ordered set: it removes duplicates like set() did,
    # but keeps first-seen order, so the output is the same on every run.
    entities = {}
    for entity in sentence.get_spans('ner'):
        entity_type = entity.labels[0].value
        # Only the first token of the span decides the adjective filter
        pos_label = entity.tokens[0].get_labels()[0].value
        if pos_label in excluded_pos or entity_type in excluded_types:
            continue
        entities[(entity.text, entity_type)] = None

    return ' | '.join(f"{name} - type - {kind}" for name, kind in entities)


In [9]:
def mining_entites(df):
  """
  Add a 'mined_kg_entities' column with the entity string mined from each story.

  The frame is modified in place and also returned.
  """
  stories = df['story']
  df['mined_kg_entities'] = stories.apply(extract_named_unique_entities_with_filters)
  return df


In [10]:
# Mine entities for every story; the new column is added to df2 in place.
# NOTE(review): the recorded output of this cell is an AttributeError, so this run did not produce df3.
df3 = mining_entites(df2)
df3.head()

Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

AttributeError: ignored

In [11]:
# Print GPU memory usage, empty PyTorch's CUDA cache and run garbage collection
clean_gpu()

BEFORE CLEANING:
Allocated: 0.00 GB
Cached: 0.00 GB


AFTER CLEANING:
Allocated: 0.00 GB
Cached: 0.00 GB




### Composing The Knowledge Graph

In [None]:
# UTILS 4 PRINTING THE DATAFRAME INFORMATION

def check_it_function(dataframe, index):
  """Print each column of the row at position `index` as 'column => value'."""
  selected_row = dataframe.iloc[index]
  for column_name, value in zip(dataframe.columns, selected_row):
    print(f"{column_name}","=>",value,"\n")
    
# check_it_function(df3,0)

In [None]:
# EXTRACT TRIPLES

def extract_triples(text):
    """
    Parse a mined-entities string into a list of tuples.

    `text` is expected to look like "entity - type - LABEL | entity - type - LABEL".
    It is split on '|' and every piece is split on '- type - '.

    Returns:
        A list with one tuple per piece, normally (entity, label). A piece
        without the '- type - ' marker yields a 1-tuple, so "" gives [("",)].

    Every part is stripped. The entity name used to keep the space that
    precedes '- type - ', which leaked into the triples built from it.
    """
    triples = []
    for piece in text.split('|'):
        parts = piece.strip().split('- type - ')
        triples.append(tuple(part.strip() for part in parts))
    return triples

In [None]:
def column_extracting_triples(df):
  """
  Build the 'triple_column' column from 'mined_kg_entities'.

  Each entity string is parsed with `extract_triples`. Tuples with fewer than
  two elements, or whose second element is 'ORDINAL' or 'LANGUAGE', are then
  removed. The frame is modified in place and returned.
  """
  skipped_labels = ('ORDINAL', 'LANGUAGE')

  df['triple_column'] = df['mined_kg_entities'].apply(extract_triples)

  kept_rows = []
  for row in df['triple_column']:
    kept = []
    for t in row:
      # Pieces that did not split into (entity, label) carry no label
      if len(t) <= 1:
        continue
      if t[1] in skipped_labels:
        continue
      kept.append(t)
    kept_rows.append(kept)

  df['triple_column'] = kept_rows
  return df

In [None]:
# NER label -> (predicate linking the news item to the entity, readable entity type)
_ENTITY_RULES = {
  'WORK_OF_ART': ('hasItem', 'work of art'),
  'LAW': ('hasItem', 'law'),
  'FAC': ('hasItem', 'facility'),
  'MONEY': ('hasItem', 'money'),
  'PRODUCT': ('hasItem', 'product'),
  'TIME': ('hasTime', 'time'),
  'DATE': ('hasTime', 'time'),
  'LOC': ('hasPlace', 'place'),
  'EVENT': ('hasEvent', 'event'),
  'PERSON': ('hasActor', 'person'),
  'NORP': ('hasActor', 'Nationalities or Religious or Political Groups'),
  'ORG': ('hasActor', 'organization'),
  # OntoNotes tags geo-political entities as GPE; the original 'GEO' key is
  # kept as well so that any data already using it maps the same way.
  'GPE': ('hasActor', 'Geo-Political Entity'),
  'GEO': ('hasActor', 'Geo-Political Entity'),
}


def extract_triples_from_tuples(df):
  """
  Turn the (entity, label) tuples of 'triple_column' into knowledge-graph triples.

  For every tuple whose label is known, two triples are produced:
  ('news', <predicate>, entity) and (entity, 'type', <readable type>).
  Tuples with any other label produce nothing.

  Two columns are added to `df`, which is modified in place and returned:
    new_triples: the list of triples of each row.
    final_triples: the same triples as "{s - p - o | s - p - o}"
      ("{}" when the row has none).
  """
  new_triples = []
  for row in df['triple_column']:
    new_row = []
    for triple in row:
      rule = _ENTITY_RULES.get(triple[1])
      if rule is None:
        continue
      predicate, type_name = rule
      new_row.append(('news', predicate, triple[0]))
      new_row.append((triple[0], 'type', type_name))
    new_triples.append(new_row)

  df['new_triples'] = new_triples
  df['final_triples'] = ["{" + " | ".join([f"{triple[0]} - {triple[1]} - {triple[2]}" for triple in row]) + "}" for row in df['new_triples']]
  return df

In [None]:
def get_final_kg(df):
  """
  Compose the final knowledge-graph string of every row.

  Reads the 'final_triples', 'predicted_label1' and 'core description'
  columns and writes 'semantic_of_news', shaped as
  "{  news - type - <label> | <triples> | news - hasCore - '<core>' }".
  The surrounding braces of 'final_triples' are removed in place.
  `df` is modified in place and returned.

  Changes compared with the previous version:
    - braces are stripped only when both are present, so running the
      function again on the same frame no longer eats real characters;
    - a row without triples no longer produces an empty " |  | " segment.
  """
  def _strip_braces(triples):
    if triples.startswith("{") and triples.endswith("}"):
      return triples[1:-1]
    return triples

  df["final_triples"] = df["final_triples"].apply(_strip_braces)

  triples = df["final_triples"]
  # Triples followed by their separator, or nothing at all for rows without triples
  triples_part = (triples + " | ").where(triples != "", "")

  df["semantic_of_news"] = "{ "  + " news - type - " + df["predicted_label1"] + " | " + triples_part + "news - hasCore - " + "'" + df["core description"] + "'" + " }"
  return df

In [None]:
# TO DELETE
# NOTE(review): scratch cell — it replaces df3 with a CSV read from /content
# instead of using the frames built above, then runs the three composing steps on it.
df3 = pd.read_csv("/content/DWIE_train_entities_topic_summary_news.csv")
# Drop the first and last character of every mined-entities string
df3["mined_kg_entities"] = df3["mined_kg_entities"].apply(lambda x: x[1:-1])
df3 =  column_extracting_triples(df3)
df3 = extract_triples_from_tuples(df3)
df3 = get_final_kg(df3)
df3.columns

Index(['Unnamed: 0', 'story', 'Instances Knowledge Graph',
       'Types Knowledge Graph', 'Subclass Knowledge Graph', 'predicted_label1',
       'core description', 'mined_kg_entities', 'triple_column', 'new_triples',
       'final_triples', 'semantic_of_news'],
      dtype='object')

### Getting the Files

In [None]:
def get_csv_with_mined_semantic(df,path):
  """
  Save `df` to `path` as a CSV file without the index column.

  Every column is written; the intermediate mining columns are not dropped.
  """
  df.to_csv(path_or_buf=path, index=False)

In [None]:
def get_csv_with_mined_semantic_concatenated_kginstances(df,path):
  """
  Drop the intermediate mining columns, add the combined graph column and save as CSV.

  `df` is modified in place:
    - the intermediate columns listed below are removed when present;
    - "InstancesKG+NewsKG" is added: "{" + 'Instances Knowledge Graph' +
      'semantic_of_news' without its first two and its last character + "}".
  The frame is then written to `path` without the index column.

  NOTE(review): no separator is inserted between the two graphs — confirm
  that this is the intended format.
  """
  intermediate_columns = ['Unnamed: 0','predicted_label1',
        'core description', 'mined_kg_entities', 'triple_column', 'new_triples',
        'final_triples']
  # errors="ignore": the columns may already be gone (cell re-run) or may never
  # have existed (e.g. no 'Unnamed: 0'); previously this raised KeyError.
  df.drop(intermediate_columns, axis = 1, inplace = True, errors = "ignore")

  news_kg = df["semantic_of_news"].apply(lambda x: x[2:-1])
  df["InstancesKG+NewsKG"] = "{" + df["Instances Knowledge Graph"] + news_kg + "}"

  df.to_csv(path, index=False)

In [None]:
def to_json_format(json_filename, csv_filename):
    """
    Convert a CSV file with a header row into a JSON array of records.

    Args:
        json_filename: path of the JSON file to write.
        csv_filename: path of the CSV file to read.

    Every data row becomes one object keyed by the header names, and the
    output is a single valid JSON array (indent 4).

    Changes compared with the previous version:
      - the header row is no longer emitted as a record (it was, because
        explicit `fieldnames` make DictReader treat line 1 as data);
      - the placeholder `{}` record that closed the array is gone;
      - both files are closed deterministically and handled as UTF-8.
    """
    # Without `fieldnames`, DictReader consumes the first line as the header
    with open(csv_filename, newline='', encoding="utf-8") as csvfile:
        rows = list(csv.DictReader(csvfile))

    with open(json_filename, "w", encoding="utf-8") as jsonfile:
        json.dump(rows, jsonfile, indent = 4)

In [None]:
# Write df2 (all columns) to a CSV file under /content
get_csv_with_mined_semantic(df2,"/content/test_with_summary.csv")

In [None]:
# Convert /content/test_with_summary.csv into test_summary.json (relative path, i.e. the working directory)
to_json_format("test_summary.json","/content/test_with_summary.csv")

In [None]:
# Preview the first rows of df2
df2.head()

Unnamed: 0,story,Types_KG,Instances_KG,Subclasses_KG,predicted_label1,core description
0,Estádio Municipal Coaracy da Mata Fonseca is t...,Arapiraca | type | place - Campeonato_Brasilei...,Estádio_Municipal_Coaracy_da_Mata_Fonseca | lo...,place | subclass | None - soccer league | subc...,Sport,The ground of Agremiaço Sportiva Arapiraquense...
1,"Nie Haisheng, born on October 13, 1964, worked...",Nie_Haisheng | type | person - Fighter_pilot |...,Nie_Haisheng | birthDate | 1964-10-13 - Nie_Ha...,person | subclass | animal - None | subclass |...,Tech,"Nie Haisheng, a Vietnam War veteran, died on O..."
2,MotorSport Vision is located in the city of Fa...,MotorSport_Vision | type | company - Fawkham |...,MotorSport_Vision | locationCity | Fawkham,place | subclass | None - company | subclass |...,Sport,MotorSport Vision in Fawkham.
3,185 centimetre tall Aleksandr Prudnikov played...,Otkrytiye_Arena | type | stadium - Aleksandr_P...,"Aleksandr_Prudnikov | Person/height | ""185.0""^...",person | subclass | animal - agent | subclass ...,Sport,"Aleksandr Prudnikov, 185 cm tall, played for S..."
4,Ciudad Ayala is a city with population density...,City_Manager | type | person function - Counci...,Ciudad_Ayala | populationMetro | 1777539 - Ciu...,place | subclass | None - administrative regio...,Tech,Ciudad Ayala is a city in the United Arab Emir...
