# Extract Key Information

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# local source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## Load Data

In [2]:
import pandas as pd

# Load the tab-separated BBC news file; index_col=False tells pandas not to use
# the first column as the index.
df_orig = pd.read_csv(
    "bbc-news-data.csv",
    sep="\t",
    index_col=False,
)

In [3]:
# Work on an independent copy so the frame loaded from disk stays untouched.
df = df_orig.copy(deep=True)
df

Unnamed: 0,category,filename,title,content
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2241,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2242,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2243,tech,399.txt,Be careful how you code,A new European directive could put software w...
2244,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...


## Create Prompt

In [4]:
# Instruction text that precedes the article in the prompt.
prompt_prefix = """ 
  Extract key information from this text
"""

# Sample prompt for the first article: instruction, then title and body
# separated by a newline.
prompt = prompt_prefix + "\n".join([df.loc[0, 'title'], df.loc[0, 'content']])
print(prompt)

 
  Extract key information from this text
Ad sales boost Time Warner profit
 Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.  The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.  Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner int

## Putting the Code Together

In [14]:
import pandas as pd
import requests
import numpy as np
from transformers import AutoTokenizer
from sklearn.metrics.pairwise import cosine_similarity

# Tokenizer used to turn text into input_ids / attention_mask before calling the API.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Padding to a fixed length requires a pad token; register one if the tokenizer lacks it.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Inference endpoint that extract_embeddings_via_api POSTs its payload to.
API_URL = "https://distil-eng-tsisodia.apps.gm0e690i8ebff95b7a.eastus.aroapp.io/v2/models/distil-eng/infer"

def extract_embeddings_via_api(text_content):
    """Embed ``text_content`` by calling the remote inference endpoint.

    The text is tokenized with the module-level ``tokenizer`` (truncated and
    padded to 512 tokens). The token ids and the attention mask are POSTed to
    ``API_URL`` as two INT64 inputs of shape ``[1, sequence_length]``.

    Args:
        text_content: The text to embed.

    Returns:
        A NumPy array of shape ``(1, N)`` holding the flattened ``data`` of the
        first entry of ``outputs`` in the response, or ``None`` when the
        request fails, the server answers with a non-200 status, or the
        response body does not have the expected structure. Every failure is
        reported with ``print``; no exception is raised for these cases.
    """
    inputs = tokenizer(text_content, return_tensors='pt', truncation=True, padding='max_length', max_length=512)

    # Tensors are converted to nested lists so they can be JSON-serialized.
    input_ids = inputs['input_ids'].tolist()
    attention_mask = inputs['attention_mask'].tolist()

    payload = {
        "inputs": [
            {
                "name": "input_ids",
                "shape": [1, len(input_ids[0])],
                "datatype": "INT64",
                "data": input_ids[0]
            },
            {
                "name": "attention_mask",
                "shape": [1, len(attention_mask[0])],
                "datatype": "INT64",
                "data": attention_mask[0]
            }
        ]
    }

    # Transport-level failures (connection errors, timeouts, ...).
    try:
        response = requests.post(API_URL, json=payload, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error during API call: {e}")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code} - {response.text}")
        return None

    # A 200 response can still carry an unexpected body (invalid JSON, missing
    # 'outputs', empty list, ...). Treat that as a failure too, so callers only
    # ever have to deal with "array or None".
    try:
        embeddings = response.json()['outputs'][0]['data']
        return np.array(embeddings).reshape(1, -1)
    except (KeyError, IndexError, TypeError, ValueError) as e:
        print(f"Error parsing API response: {e}")
        return None

def split_into_sentences(text):
    """Split ``text`` into sentences on the literal delimiter ``'. '``.

    Each fragment is stripped of surrounding whitespace, and fragments that
    are empty after stripping are dropped. The delimiter itself is consumed,
    so only a final fragment can keep its trailing period.

    Args:
        text: The text to split.

    Returns:
        A list of non-empty, stripped sentence strings (possibly empty).
    """
    # Strip first, then filter: a whitespace-only fragment must not survive
    # as an empty string.
    stripped = (fragment.strip() for fragment in text.split('. '))
    return [sentence for sentence in stripped if sentence]

def extract_key_sentences_via_api(title, content, n_sentences=2):
    """Pick the sentences most similar to the whole document.

    The title and content are joined with a newline, split into sentences
    with ``split_into_sentences``, and every sentence is embedded through
    ``extract_embeddings_via_api``. Sentences are ranked by cosine similarity
    between their embedding and the embedding of the full text.

    Args:
        title: Document title.
        content: Document body.
        n_sentences: Number of top-ranked sentences to return. Values below
            zero are treated as zero.

    Returns:
        The top ``n_sentences`` sentences joined by a single space, in
        descending order of similarity (ties keep document order). If the
        document embedding cannot be obtained, the string
        ``"Error in generating document embeddings."`` is returned instead.
    """
    full_text = title + "\n" + content
    sentences = split_into_sentences(full_text)

    doc_embedding = extract_embeddings_via_api(full_text)
    if doc_embedding is None:
        return "Error in generating document embeddings."

    # (score, position, sentence) for every sentence that could be embedded.
    # Sentences whose embedding request failed are skipped rather than given a
    # made-up zero vector, which would score 0 and could outrank real sentences.
    scored = []
    for position, sentence in enumerate(sentences):
        sentence_embedding = extract_embeddings_via_api(sentence)
        if sentence_embedding is None:
            continue
        score = cosine_similarity(doc_embedding, sentence_embedding).flatten()[0]
        scored.append((score, position, sentence))

    # Rank on the score only; equal scores fall back to document order instead
    # of comparing the sentence text.
    scored.sort(key=lambda item: (-item[0], item[1]))

    top_sentences = [sentence for _, _, sentence in scored[:max(n_sentences, 0)]]
    return " ".join(top_sentences)

# Set the column name and initialize the results DataFrame.
# It spans the full index of `df`; rows that are not processed stay empty (NaN).
colname = 'key_info'
results = pd.DataFrame(columns=[colname], index=df.index)

# Limit to the first 20 entries
n_docs = 20
df_limited = df.head(n_docs)

# Process each document to extract key information for the first 20 entries
for idx, title, content in zip(df_limited.index, df_limited['title'], df_limited['content']):
    try:
        # Extract key sentences
        key_info = extract_key_sentences_via_api(title, content, n_sentences=2)

        # Store the key information in the results DataFrame.
        # A single .loc[row, col] assignment writes into `results` itself; the
        # chained form `results[colname].loc[idx] = ...` may assign to a
        # temporary object and leave `results` unchanged.
        results.loc[idx, colname] = key_info

        print(f"Processed index {idx}: Key information extracted")

    except Exception as err:
        # Best-effort batch: report the failure and continue with the next document.
        print(f"Unexpected error at index {idx}: {err=}, {type(err)=}")

print(results.head())



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m




Processed index 0: Key information extracted
Processed index 1: Key information extracted
Processed index 2: Key information extracted
Processed index 3: Key information extracted
Processed index 4: Key information extracted
Processed index 5: Key information extracted
Processed index 6: Key information extracted
Processed index 7: Key information extracted
Processed index 8: Key information extracted
Processed index 9: Key information extracted
Processed index 10: Key information extracted
Processed index 11: Key information extracted
Processed index 12: Key information extracted
Processed index 13: Key information extracted
Processed index 14: Key information extracted
Processed index 15: Key information extracted
Processed index 16: Key information extracted
Processed index 17: Key information extracted
Processed index 18: Key information extracted
Processed index 19: Key information extracted
Key information extraction complete. Results saved to key_information.csv.


## Results

In [15]:
# Show the extracted key information; rows that were not processed have no value.
results

Unnamed: 0,key_info
0,Ad sales boost Time Warner profit\n Quarterly ...
1,Dollar gains on Greenspan speech\n The dollar ...
2,"""The pledged assets are with Rosneft, so it wi..."
3,Yet aviation analyst Mike Powell of Dresdner K...
4,Pernod takeover talk lifts Domecq\n Shares in ...
...,...
2241,
2242,
2243,
2244,


### Add results to dataframe

In [16]:
# Attach the key_info column to the articles by concatenating column-wise
# (pd.concat with axis=1 aligns the two frames on their index).
df_results = pd.concat([df, results], axis=1)
# Both bare expressions below are displayed because ast_node_interactivity is "all".
df_results.shape
df_results

(2246, 5)

Unnamed: 0,category,filename,title,content,key_info
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,Ad sales boost Time Warner profit\n Quarterly ...
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,Dollar gains on Greenspan speech\n The dollar ...
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,"""The pledged assets are with Rosneft, so it wi..."
3,business,004.txt,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...,Yet aviation analyst Mike Powell of Dresdner K...
4,business,005.txt,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...,Pernod takeover talk lifts Domecq\n Shares in ...
...,...,...,...,...,...
2241,tech,397.txt,BT program to beat dialler scams,BT is introducing two initiatives to help bea...,
2242,tech,398.txt,Spam e-mails tempt net shoppers,Computer users across the world continue to i...,
2243,tech,399.txt,Be careful how you code,A new European directive could put software w...,
2244,tech,400.txt,US cyber security chief resigns,The man making sure US computer networks are ...,


## Save Results

In [17]:
# Write the combined frame to a tab-separated file; the index is written as
# well, since to_csv is called without index=False.
fname = 'key_info.csv'
df_results.to_csv(fname, sep='\t')