In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime so files under the mount point
# can be read and written by the cells below.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import pandas as pd

# Read the Excel workbook stored on the mounted Drive into `df`.
NEWS_WORKBOOK_PATH = '/content/drive/MyDrive/news_analysis_summary_combined_dedup.xlsx'
df = pd.read_excel(NEWS_WORKBOOK_PATH)

In [None]:
import time
import google.generativeai as genai

In [None]:
# Used to securely store your API key
from google.colab import userdata

In [None]:
# Fetch the API key from Colab's `userdata` store and hand it to the
# generative-AI client.
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [None]:
# Build the generative model handle that `summarize_with_api` sends prompts to.
MODEL_NAME = 'gemini-pro'
model = genai.GenerativeModel(MODEL_NAME)

In [None]:
def summarize_with_api(text, model, max_retries=3, pause_seconds=5):
  """
  Summarize `text` chunk by chunk with a generative model.

  The text is split with `split_into_chunks`; each chunk is sent to
  `model.generate_content` behind the prompt "Please summarize the text: ",
  and the per-chunk summaries are joined with single spaces.

  Args:
      text (str): The text to summarize.
      model (object): Object exposing `generate_content(prompt)` whose result
          has a `.text` attribute.
      max_retries (int): Maximum number of attempts per chunk.
      pause_seconds (float): Pause after every successful request, to stay
          under the API rate limit.

  Returns:
      str | None: The combined summary ("" when the text contains no words),
      or None when `text` is not a string or when any chunk still fails after
      `max_retries` attempts. A partial summary is never returned.
  """
  if not isinstance(text, str):
    # e.g. a missing value coming out of a spreadsheet cell; `.split()` would
    # raise on it, so report the failure the same way as an API error.
    print(f"Error occurred during summarization: expected str, got {type(text).__name__}")
    return None

  chunks = split_into_chunks(text)
  print(len(chunks))

  summaries = []
  for idx, chunk in enumerate(chunks, start=1):
    for attempt in range(1, max_retries + 1):
      try:
        print(f"chunk {idx} starting")
        response = model.generate_content("Please summarize the text: " + chunk)
        summary = response.text  # may itself raise, so keep it inside the try
      except Exception as ex:
        print(f"Error occurred during summarization: {ex}")
        if attempt < max_retries:
          time.sleep(2 ** attempt)  # Exponential backoff, skipped after the last attempt
      else:
        summaries.append(summary)
        print(f"chunk {idx} completed")
        time.sleep(pause_seconds)
        break
    else:
      # Every attempt for this chunk failed: abort instead of silently
      # returning a summary with a hole in it.
      print(f"Failed to summarize chunk {idx} after {max_retries} retries.")
      return None

  return " ".join(summaries).strip()

def split_into_chunks(text, max_tokens=12288):
    """
    Split `text` on whitespace into chunks made of whole words.

    Despite its name, `max_tokens` is a budget in characters: each chunk is a
    run of consecutive words joined by single spaces whose joined length does
    not exceed the budget. A single word longer than the budget is emitted as
    its own (oversize) chunk rather than being cut.

    Args:
        text (str): The text to split. Runs of whitespace collapse to one space.
        max_tokens (int): Maximum chunk length in characters.

    Returns:
        list[str]: The chunks in order; empty when `text` contains no words.
        No chunk is ever an empty string.
    """
    chunks = []
    current_chunk = []
    current_length = 0  # always equals len(' '.join(current_chunk))

    for word in text.split():
        # A separating space is only needed when the chunk already holds a word.
        candidate_length = current_length + len(word) + (1 if current_chunk else 0)
        if current_chunk and candidate_length > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = [word]
            current_length = len(word)
        else:
            # Also taken for an oversize first word, so no empty chunk is flushed.
            current_chunk.append(word)
            current_length = candidate_length

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

In [None]:
# Row positions of `df` that still need a summary, in ascending order.
all_indices = list(range(len(df)))
# Column labels of `df`. These are NOT row positions, so they must not be
# subtracted from `all_indices`; the name is kept only for later cells that
# may still read it.
keys = list(df.keys())
# Positions already summarized. Empty on a fresh run; to resume an
# interrupted run, fill this with the integer row positions already done.
completed_indices = set()
remaining_indices = [idx for idx in all_indices if idx not in completed_indices]

In [None]:
# Accumulates summaries across batches, keyed by a stringified index.
output = dict()

In [None]:
# Summarize every remaining article in batches. Each batch is written to its
# own Excel file so finished work survives an interruption, and the full table
# with a 'Summary' column is saved to Drive at the end.
batch_size = 200
summaries_by_position = {}  # row position in `df` -> summary text

# Step over the start offsets so the final, shorter batch is processed too
# (including the case where there are fewer rows than `batch_size`).
for start_index in range(0, len(remaining_indices), batch_size):
  end_index = min(start_index + batch_size, len(remaining_indices))
  batch_indices = remaining_indices[start_index:end_index]
  batch_rows = []  # collected as records; the dataframe is built once per batch

  for idx in batch_indices:
    text = df.iloc[idx]['article_text']
    if not isinstance(text, str) or not text.strip():
      # Missing or blank cell: nothing to send to the API.
      print(f'{idx} failed')
      continue

    time.sleep(5)  # This might not be necessary depending on the API rate limits
    res = summarize_with_api(text, model)
    if res:
      batch_rows.append({'article_text': text, 'Summary': res})
      summaries_by_position[idx] = res
      # Key by the article's row position, not by its row number inside the
      # batch, so entries from different batches never overwrite each other.
      output[str(idx)] = res
      print(f'{idx} completed')
    else:
      print(f'{idx} failed')

  # Save batch output as dataframe
  batch_df = pd.DataFrame(batch_rows, columns=['article_text', 'Summary'])
  batch_filename = f"batch_{start_index}_{end_index}.xlsx"
  batch_df.to_excel(batch_filename, index=False)  # Save dataframe to file

# Attach the new summaries to a copy of the input table before saving; rows
# without a new summary keep any existing 'Summary' value, otherwise None.
if 'Summary' in df.columns:
  existing_summaries = df['Summary'].tolist()
else:
  existing_summaries = [None] * len(df)
merged_summaries = [
    summaries_by_position.get(pos, existing_summaries[pos]) for pos in range(len(df))
]
df.assign(Summary=merged_summaries).to_excel(
    '/content/drive/MyDrive/Colab_Notebooks/news_summary.xlsx', index=False
)