# Extract Key Information

In [1]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Load Data

In [2]:
import pandas as pd

df_orig = pd.read_csv("bbc-news-data.csv", delimiter='\t', index_col=False)

In [3]:
df = df_orig.copy()
df

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2241,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2242,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2243,tech,399.txt,Be careful how you code,A new European directive could put software w...
2244,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


## Create Prompt

In [4]:
prompt_prefix = """ 
  Extract keywords from this text
"""

prompt = prompt_prefix + df['title'].loc[0] + "\n" + df['content'].loc[0]
prompt

' \n  Extract keywords from this text\nAd sales boost Time Warner profit\n Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL\'s underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner inter

## Request to API

In [10]:
import pandas as pd
import requests
import numpy as np
from transformers import AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

API_URL = "https://distil-eng-tsisodia.apps.gm0e690i8ebff95b7a.eastus.aroapp.io/v2/models/distil-eng/infer"

def extract_embeddings_via_api(text_content):

    inputs = tokenizer(text_content, return_tensors='pt', truncation=True, padding='max_length', max_length=512)

    input_ids = inputs['input_ids'].tolist()
    attention_mask = inputs['attention_mask'].tolist()

    payload = {
        "inputs": [
            {
                "name": "input_ids",
                "shape": [1, len(input_ids[0])],
                "datatype": "INT64",
                "data": input_ids[0]
            },
            {
                "name": "attention_mask",
                "shape": [1, len(attention_mask[0])],
                "datatype": "INT64",
                "data": attention_mask[0]
            }
        ]
    }

    try:
        response = requests.post(API_URL, json=payload, timeout=30)

        if response.status_code == 200:
            response_data = response.json()

            embeddings = response_data['outputs'][0]['data']

            embeddings = np.array(embeddings).reshape(1, -1)
            
            return embeddings
        else:
            print(f"Error: {response.status_code} - {response.text}")
            return None
    except requests.exceptions.RequestException as e:
        print(f"Error during API call: {e}")
        return None

def split_into_keywords(text):
    words = text.split()
    return [word.strip() for word in words if word]

def extract_keywords_via_api(title, content, n_keywords=5):
    # Combine title and content
    full_text = title + "\n" + content
    
    # Split the full text into words/phrases
    words = split_into_keywords(full_text)
    
    # Generate the embeddings for the entire document via API
    doc_embedding = extract_embeddings_via_api(full_text)
    if doc_embedding is None:
        return "Error in generating document embeddings."
    
    # Generate embeddings for each word/phrase via API
    word_embeddings = []
    for word in words:
        word_embedding = extract_embeddings_via_api(word)
        if word_embedding is not None:
            word_embeddings.append(word_embedding)
        else:
            word_embeddings.append(np.zeros_like(doc_embedding))
    
    similarities = [cosine_similarity(doc_embedding, word_emb).flatten()[0] for word_emb in word_embeddings]

    ranked_words = [word for _, word in sorted(zip(similarities, words), reverse=True)]
    
    return ", ".join(ranked_words[:n_keywords])

colname = 'keywords'
results = pd.DataFrame(columns=[colname], index=df.index)

# Limit to 20 entries
df_limited = df.head(5)

for idx, title, content in zip(df_limited.index.values, df_limited['title'].loc[df_limited.index.values], df_limited['content'].loc[df_limited.index.values]):
    try:
        # Extract keywords
        keywords = extract_keywords_via_api(title, content, n_keywords=5)
        
        # Store the extracted keywords in the results DataFrame
        results[colname].loc[idx] = keywords
        
        print(f"Processed index {idx}: Keywords extracted")
    
    except Exception as err:
        print(f"Unexpected error at index {idx}: {err=}, {type(err)=}")

# Check the resulting DataFrame
print(results.head(5))




Processed index 0: Keywords extracted
Processed index 1: Keywords extracted
Processed index 2: Keywords extracted
Processed index 3: Keywords extracted
Processed index 4: Keywords extracted
                                        keywords
0     offset, $10.9bn., $42.09bn., sale, expects
1     late, foreign, concerns, concerns, Worries
2  Rosneft, Rosneft, Rosneft, Rosneft, embattled
3       latest, 8.1%, September, profits., extra
4          LVMH., ($10.7bn;, 8.2bn, 7.5bn, 1.8bn


## Results

In [11]:
results

Unnamed: 0,keywords
0,"offset, $10.9bn., $42.09bn., sale, expects"
1,"late, foreign, concerns, concerns, Worries"
2,"Rosneft, Rosneft, Rosneft, Rosneft, embattled"
3,"latest, 8.1%, September, profits., extra"
4,"LVMH., ($10.7bn;, 8.2bn, 7.5bn, 1.8bn"
...,...
2241,
2242,
2243,
2244,


## Add Results to DataFrame

In [12]:
df_results = pd.concat([df, results], axis=1)
df_results.shape
df_results

(2246, 5)

Unnamed: 0,category,filename,title,content,keywords
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,"offset, $10.9bn., $42.09bn., sale, expects"
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,"late, foreign, concerns, concerns, Worries"
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"Rosneft, Rosneft, Rosneft, Rosneft, embattled"
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,"latest, 8.1%, September, profits., extra"
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,"LVMH., ($10.7bn;, 8.2bn, 7.5bn, 1.8bn"
...,...,...,...,...,...
2241,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,
2242,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,
2243,tech,399.txt,Be careful how you code,A new European directive could put software w...,
2244,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...,


## Save Results

In [13]:
fname = 'keywords.csv'
df_results.to_csv(fname, sep='\t')