Task Description for Experiment: 
* Given the source_text of sentences for the LOGIC dataset, 
  extract keywords. 
* Keywords are extracted with three tools: spaCy (named entities), YAKE, and KeyBERT. 

In [16]:
#!pip install -U pip setuptools wheel
#!pip install -U spacy
#!python -m spacy download en_core_web_trf 
#!pip install git+https://github.com/LIAAD/yake
!pip install keybert 




In [1]:
import spacy
import pandas as pd 
import re 
import yake 
from keybert import KeyBERT


In [3]:
# spaCy pipeline; its named entities serve as the spaCy "keywords".
nlp = spacy.load(
    'en_core_web_lg'
)

# YAKE extractor, shared by every call so it is only built once.
yake_kw_extractor = yake.KeywordExtractor(
    top=10,
    stopwords=None,
)

# KeyBERT extractor backed by the 'all-mpnet-base-v2' model.
keybert_kw_extractor = KeyBERT(
    model='all-mpnet-base-v2'
)


In [18]:
def extract_keywords_spacy(sentence):
  """Return the entities spaCy finds in `sentence`.

  The text is run through the module-level `nlp` pipeline and the
  resulting `doc.ents` are returned as a list (empty when none are found).
  """
  return list(nlp(sentence).ents)

def extract_keywords_yake(sentence):
  """Return YAKE's keywords for `sentence`.

  Delegates to the shared module-level `yake_kw_extractor` and returns its
  result unchanged (a list of `(keyword, score)` tuples).
  """
  return yake_kw_extractor.extract_keywords(sentence)

def extract_keywords_keybert(sentence):
  """Return up to three KeyBERT keyphrases (1-3 words each) for `sentence`.

  Uses the shared module-level `keybert_kw_extractor`. Only the phrases are
  returned; the scores that come with them are discarded.
  """
  scored_phrases = keybert_kw_extractor.extract_keywords(
      sentence,
      keyphrase_ngram_range=(1,3),
      stop_words='english',
      highlight=False,
      top_n=3,
  )
  # Going through a dict keeps the first-seen order and drops duplicate phrases.
  return list(dict(scored_phrases))
  
def get_keywords_list(data, field, type):
  """Extract keywords for every sentence in one column of `data`.

  Params:
  1. data - DataFrame-like object; `data[field]` must support `.to_list()`
  2. field - name of the column holding the sentences
  3. type - extractor to use: 'spacy', 'yake' or 'keybert'

  Returns a list with one extractor result per sentence, in column order.
  Raises ValueError when `type` is not one of the supported extractors
  (previously this surfaced as an UnboundLocalError on the return line).
  """
  # NOTE: `type` shadows the builtin, but the parameter name is kept so that
  # existing keyword-argument callers continue to work.
  # An if/elif chain keeps the extractor lookup lazy: only the selected
  # extractor function has to exist.
  if type == 'spacy':
    extractor = extract_keywords_spacy
  elif type == 'yake':
    extractor = extract_keywords_yake
  elif type == 'keybert':
    extractor = extract_keywords_keybert
  else:
    raise ValueError(
        "Unsupported keyword extractor type %r; expected 'spacy', 'yake' or 'keybert'" % (type,)
    )

  sentences = data[field].to_list()
  return [extractor(sentence) for sentence in sentences]

def get_stats_keywords(data):
  """Summarise how many keywords were extracted per sentence.

  Params:
  1. data - sequence with one keyword collection per sentence

  Returns a 5-tuple:
    (number of sentences with no keywords,
     number of sentences with at least one keyword,
     fraction of sentences with no keywords, rounded to 2 decimals,
     fraction of sentences with keywords, rounded to 2 decimals,
     average number of keywords per sentence, rounded to 2 decimals)

  An empty `data` yields (0, 0, 0.0, 0.0, 0.0) instead of dividing by zero.
  """
  total = len(data)
  if total == 0:
    # Nothing to summarise; avoid ZeroDivisionError in the ratios below.
    return 0, 0, 0.0, 0.0, 0.0

  sizes = [len(elt) for elt in data]
  empty_data = sum(1 for size in sizes if size < 1)
  full_data = total - empty_data

  return (
      empty_data,
      full_data,
      round(empty_data / total, 2),
      round(full_data / total, 2),
      round(sum(sizes) / total, 2),
  )
    

False Dilemma Prompt Extraction

In [4]:
def extract_prompts(sentence, label): 
  '''
  Split a prompt into its parts according to the fallacy type.

  Params: 
  1. sentence - sentence to parse 
  2. label - type of fallacy ( since the format changes from fallacy type to type)

  Returns:
  For label 'false dilemma', a tuple (choice1, choice2) where choice1 is the
  text between 'Choice 1:' and 'Choice 2:' and choice2 is the text after
  'Choice 2:'. Surrounding whitespace is kept as-is. If a marker is missing,
  the corresponding choice is '' (and choice1 runs to the end of the sentence
  when only 'Choice 2:' is missing).
  For any other label, None (no other formats are handled here).
  '''
  if label != 'false dilemma': 
    # Only the false-dilemma prompt format is supported.
    return None

  marker1, marker2 = 'Choice 1:', 'Choice 2:'
  index_choice1 = sentence.find(marker1)
  index_choice2 = sentence.find(marker2)

  # str.find returns -1 when the marker is absent; using that value as a
  # slice bound would silently produce garbage, so handle it explicitly.
  if index_choice2 == -1: 
    choice2 = ''
    end_choice1 = len(sentence)
  else: 
    choice2 = sentence[index_choice2 + len(marker2):]
    end_choice1 = index_choice2

  if index_choice1 == -1: 
    choice1 = ''
  else: 
    choice1 = sentence[index_choice1 + len(marker1):end_choice1]

  return choice1, choice2
  
  

In [5]:
dataset = pd.read_csv('../data/pure_classes/false dilemma_train.csv') 
#dataset = dataset.drop('text_generated', axis=1)

# Split every cleaned prompt into its two alternatives.
sentences = dataset['clean_prompt']
choice_pairs = [extract_prompts(prompt, 'false dilemma') for prompt in sentences]
choice1_list = [first for first, _ in choice_pairs]
choice2_list = [second for _, second in choice_pairs]

dataset['choice 1'] = choice1_list 
dataset['choice 2'] = choice2_list 

# NOTE: this writes back to the same file that was read above.
dataset.to_csv('../data/pure_classes/false dilemma_train.csv', index = False)

In [6]:
# Peek at the first five extracted "choice 1" strings.
dataset['choice 1'].head(5)

0                false dilemma 
1     candy can cause cavities 
2                 force people 
3                “you love me” 
4               rich kid thing 
Name: choice 1, dtype: object

In [11]:
# Inspect a single extracted "choice 2" string (row label 5).
dataset.loc[5, 'choice 2']

' get stuck with cable '

In [13]:
# Spot-check the YAKE extractor on one extracted choice (row label 3).
extract_keywords_yake(dataset.loc[3, 'choice 2'])

[('academic decathlon', 0.04940384002065631),
 ('decathlon', 0.15831692877998726),
 ('academic', 0.29736558256021506)]

In [14]:
# (target column, source column, extractor) for every keyword column to add.
# The order here fixes the order in which the columns are appended.
keyword_column_plan = [
    ('choice_1_spacy', 'choice 1', extract_keywords_spacy),
    ('choice_2_spacy', 'choice 2', extract_keywords_spacy),
    ('choice_1_yake', 'choice 1', extract_keywords_yake),
    ('choice_2_yake', 'choice 2', extract_keywords_yake),
    ('choice_1_keybert', 'choice 1', extract_keywords_keybert),
    ('choice_2_keybert', 'choice 2', extract_keywords_keybert),
]

for target_column, source_column, extractor in keyword_column_plan:
    dataset[target_column] = list(map(extractor, dataset[source_column].to_list()))

In [15]:
# Persist the dataset together with the keyword columns added above.
dataset.to_csv(
    path_or_buf='../data/pure_classes/false dilemma_keywords.csv',
    index=False,
)


Statistics for spaCy, YAKE, and KeyBERT 

In [19]:
# Keyword columns to summarise: one row of statistics per column.
rows = ['choice_1_spacy', 'choice_2_spacy', 'choice_1_yake', 'choice_2_yake', 'choice_1_keybert', 'choice_2_keybert']

# get_stats_keywords returns (empty, full, empty fraction, full fraction, average)
# for each column; transpose the per-column tuples into per-statistic lists.
per_column_stats = [get_stats_keywords(dataset[row].to_list()) for row in rows]
empty, full, empty_percentage, full_percentage, avg_keywords = (
    list(values) for values in zip(*per_column_stats)
)

statistics_keywords = {'empty_data':empty, 'full_data': full, 'empty_data_percentage': empty_percentage, 'full_data_percentage': full_percentage, 'average_num_keywords_per_sentence_extracted': avg_keywords} 

# Label each row with the column it describes; with the default 0..5 index the
# saved file gave no way to tell which extractor a row belonged to.
stats = pd.DataFrame(statistics_keywords, index=pd.Index(rows, name='keyword_column'))
stats.to_csv('../results/performance_of_keyword_extractors.csv')


