In [36]:
import json
import pandas as pd
import os
from dotenv import load_dotenv
from sentence_similarity import SentenceSimilarity

from datetime import datetime, timezone, timedelta
load_dotenv()

date = datetime.now(timezone.utc).strftime("%Y-%m-%d")
yesterday = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%d")

# with open(f'../data/newsdataio/sentiment/{date}.json', 'r') as f:
with open(f'../data/newsdataio/filtered/{yesterday}.json', 'r') as f:
    data = json.load(f)

In [37]:
# do sentence similarity
sentence_similarity = SentenceSimilarity(data)
sentence_similarity.run()
similar_articles = getattr(sentence_similarity, '_SentenceSimilarity__similar_articles')

In [38]:
df = pd.DataFrame(data)

In [39]:
# sort similar articles list in descending order
similar_articles.sort(key=lambda x: len(x), reverse=True)

In [40]:
print(df.shape[0])

288


In [44]:
from copy import deepcopy
analyzed_articles = deepcopy(similar_articles)
len(analyzed_articles)

92

In [50]:
similar_articles = deepcopy(analyzed_articles)
len(similar_articles)

92

In [51]:
for i in range(len(similar_articles)):
    for j in range(i+1, len(similar_articles)):
        i_articles = set(similar_articles[i])
        j_articles = set(similar_articles[j])

        if not j_articles:
            continue

        if i_articles.intersection(j_articles):
            similar_articles[i] += similar_articles[j]
            similar_articles[j] = []
            print("Combined {} and {}".format(i, j))

new_art = [x for x in similar_articles if x]
print("Number of categories: {}".format(len(new_art)))

Combined 0 and 1
Combined 0 and 2
Combined 0 and 3
Combined 0 and 5
Combined 0 and 12
Number of categories: 87


In [52]:
print(new_art)

[[9, 21, 30, 31, 43, 46, 50, 60, 62, 112, 116, 124, 133, 140, 149, 150, 162, 165, 169, 179, 181, 231, 235, 243, 252, 253, 265, 268, 272, 282, 284, 0, 30, 43, 60, 62, 103, 109, 112, 149, 162, 179, 181, 217, 222, 228, 231, 252, 265, 282, 284, 98, 21, 46, 60, 112, 116, 119, 123, 140, 165, 179, 231, 235, 236, 238, 242, 243, 268, 282, 117, 43, 46, 50, 60, 158, 162, 165, 169, 179, 261, 265, 268, 272, 282, 39, 9, 31, 133, 146, 150, 249, 253, 27, 119, 207, 238, 88], [6, 25, 33, 77, 125, 130, 144, 152, 196, 247, 255, 1], [28, 65, 147, 184, 250, 287, 16], [34, 142, 153, 245, 256, 23], [42, 157, 161, 260, 264, 38], [59, 159, 178, 262, 281, 40], [64, 180, 183, 283, 286, 61], [102, 141, 221, 244, 22], [121, 210, 240, 91], [106, 218, 225, 99], [143, 246, 24], [145, 248, 26], [148, 251, 29], [151, 254, 32], [154, 257, 35], [155, 258, 36], [156, 259, 37], [160, 263, 41], [163, 266, 44], [164, 267, 45], [166, 269, 47], [167, 270, 48], [168, 271, 49], [170, 273, 51], [171, 274, 52], [172, 275, 53], [173

In [64]:
articles_to_check = new_art[10]
# for all elements in articles_to_check print the title from df as row items
pd.DataFrame([df.iloc[i] for i in articles_to_check]).title

143    GST 2.0 Will Further Ease Tax Compliances And ...
246    GST 2.0 Will Further Ease Tax Compliances And ...
24     GST 2.0 Will Further Ease Tax Compliances And ...
Name: title, dtype: object

In [1]:
from huggingface_hub import InferenceClient
import json
import os
from dotenv import load_dotenv
load_dotenv()

DEFAULT_MODEL = "google/gemma-2-2b-it"
MODEL_TEMPERATURE = 0.3
MAX_TOKENS = 4096
TOP_P = 0.3

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
client = InferenceClient(api_key=os.getenv('HUGGINGFACE_READ_API_KEY'))

In [3]:
titles = [
    "Rise of First Indian Defence Dragon: Big Bang Boom Solutions Raises Rs 250 Crore with a 15x Return to Early Investors",
    "Sensex, Nifty touch all-time closing high levels amid strong foreign fund inflows",
    "Yashasvi Jaiswal Eyes Record Breaking Feat As India Gears Up For 2nd Test Against Bangladesh"
]

In [4]:
input = f"""
Using the rule of three, convert these three headline into catchy three word titles.
Do not use the same word in any of the titles.

1. {titles[0]}
2. {titles[1]}
3. {titles[2]}

Return the response as a JSON dictionary.
The JSON dictionary should have only one output entry.
The dictionary output should only have three keys as 'one', 'two' and 'three'.
Each key should have the summary of the corresponding headline.
"""

In [5]:
messages = [{'role': 'user', 'content': input}]

In [9]:
output = client.chat.completions.create(
    model = DEFAULT_MODEL,
    messages = messages,
    temperature=MODEL_TEMPERATURE,
    max_tokens=MAX_TOKENS,
    top_p=TOP_P
)

In [14]:
print(output)
print(type(output))

ChatCompletionOutput(choices=[ChatCompletionOutputComplete(finish_reason='stop', index=0, message=ChatCompletionOutputMessage(role='assistant', content='```json\n{\n  "one": "Indian Defence Boom",\n  "two": "Market Hits Record",\n  "three": "Jaiswal Sets Record"\n}\n``` \n', tool_calls=None), logprobs=None)], created=1727205452, id='', model='google/gemma-2-2b-it', system_fingerprint='2.2.1-dev0-sha-e415b69', usage=ChatCompletionOutputUsage(completion_tokens=43, prompt_tokens=165, total_tokens=208))
<class 'huggingface_hub.inference._generated.types.chat_completion.ChatCompletionOutput'>


In [17]:
# extract the content of the assistant's response
json_output = output['choices'][0]['message']['content']
print(json_output)

```json
{
  "one": "Indian Defence Boom",
  "two": "Market Hits Record",
  "three": "Jaiswal Sets Record"
}
``` 



In [19]:
# convert this json output into list of summaries
json_output = json_output.replace('`', '')
json_output = json_output.replace('json', '')
print(json_output)


{
  "one": "Indian Defence Boom",
  "two": "Market Hits Record",
  "three": "Jaiswal Sets Record"
}
 



In [20]:
json_output = json.loads(json_output)
summaries = [json_output[key] for key in json_output]
print(summaries)

['Indian Defence Boom', 'Market Hits Record', 'Jaiswal Sets Record']


In [21]:
# concatenate the summary as a single string as {one}, {two} and {three}
summary = summaries[0] + ', ' + summaries[1] + ', and ' + summaries[2]
print(summary)

Indian Defence Boom, Market Hits Record, and Jaiswal Sets Record
