In [1]:
!pip install tira

[0m

In [2]:
from tira.rest_api_client import Client

# Create the TIRA client, fetch the validation inputs and index them by "id".
tira = Client()
df_val = (
    tira.pd.inputs(
        "nlpbuw-fsu-sose-24",
        "summarization-validation-20240530-training",
    )
    .set_index("id")
)

In [4]:
# Fetch the training inputs through the existing `tira` client and index them by "id".
df_train = (
    tira.pd.inputs(
        "nlpbuw-fsu-sose-24",
        "summarization-train-20240530-training",
    )
    .set_index("id")
)

In [5]:
# Quick look at the training frame; pandas abbreviates the middle rows and long
# cell values when printing, and reports the full shape on the last line.
print(df_train)

                                                   story
id                                                      
9113   (CNN) -- Friday marks the passage of one month...
86209  Severe storms tore through the Midwest and Sou...
14346  (CNN) -- High ranking officials in North Korea...
79198  Tuscaloosa, Alabama (CNN) -- Dazed Southerners...
23703  CHICAGO, Illinois (CNN) -- A man suspected of ...
...                                                  ...
90872  (CNN)Latest developments:\n• Transponder data ...
87221  Two international crises are giving Republican...
46907  (CNN) -- "To report the state of the union."\n...
12763  Phoenix, Arizona (CNN) -- The legal battle ove...
68280  Washington (CNN) -- Prosecutors hope to determ...

[5000 rows x 1 columns]


In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

# Make sure the NLTK packages used below ('stopwords' and 'punkt') are available;
# nltk.download reports when a package is already up to date.
for resource_name in ('stopwords', 'punkt'):
    nltk.download(resource_name)

# Stop-word sets keyed by language, filled lazily so the NLTK corpus is read
# once per kernel session instead of once per document.
_STOP_WORDS_BY_LANGUAGE = {}


def _get_stop_words(language='english'):
    """Return the stop-word set for ``language``, loading it from NLTK on first use."""
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]


# Define a function to clean and preprocess the text
def preprocess_text(text):
    """Normalize a story into a lowercase, stop-word-free string of tokens.

    Steps:
      1. Collapse every run of whitespace (including newlines) to a single
         space and lowercase the text.
      2. Split into sentences, then into word tokens, with NLTK.
      3. Keep only tokens that are purely alphanumeric (``str.isalnum``) and
         are not English stop words. Punctuation and tokens containing any
         non-alphanumeric character (e.g. hyphenated words) are dropped.

    Args:
        text: The raw story. Anything that is not a ``str`` (``None``, NaN
            from a missing DataFrame cell, ...) is treated as empty.

    Returns:
        The kept tokens joined by single spaces; ``''`` when the input is
        missing, blank, or contains no keepable token. Sentence boundaries
        are not preserved in the result.
    """
    # Missing values would make re.sub raise TypeError in the middle of a
    # DataFrame.apply; map them to an empty result instead.
    if not isinstance(text, str):
        return ''

    # r'\s+' already matches newlines, so no separate newline pass is needed.
    text = re.sub(r'\s+', ' ', text).strip().lower()
    if not text:
        return ''

    stop_words = _get_stop_words()

    # Flatten while filtering so that sentences with no surviving token do not
    # leave doubled spaces in the joined output.
    kept_tokens = [
        token
        for sentence in sent_tokenize(text)
        for token in word_tokenize(sentence)
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(kept_tokens)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Add a 'cleaned_story' column to the training and validation frames by running
# every entry of 'story' through preprocess_text.
for split_frame in (df_train, df_val):
    split_frame['cleaned_story'] = split_frame['story'].apply(preprocess_text)

In [11]:
from transformers import pipeline
from tqdm import tqdm

# Build the summarization pipeline on top of the facebook/bart-large-cnn checkpoint.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

# Function to summarize text
def summarize_texts(texts, max_length=150, min_length=40):
    """Summarize each text with the module-level ``summarizer`` pipeline.

    Args:
        texts: An iterable of strings. A single string is accepted too and is
            treated as a one-element collection (instead of being iterated
            character by character).
        max_length: Upper bound on the generated summary length, forwarded to
            the pipeline.
        min_length: Lower bound on the generated summary length, forwarded to
            the pipeline.

    Returns:
        A list of summary strings, one per input text and in the same order.
        Missing or blank inputs produce ``''`` without calling the model.
    """
    if isinstance(texts, str):
        texts = [texts]

    summaries = []
    for text in tqdm(texts, desc="Summarizing texts"):
        # Nothing to summarize: skip the model call and keep positions aligned.
        if not isinstance(text, str) or not text.strip():
            summaries.append('')
            continue

        # truncation=True clips inputs that exceed the model's maximum input
        # length; without it, long articles can make the pipeline fail.
        result = summarizer(
            text,
            max_length=max_length,
            min_length=min_length,
            do_sample=False,
            truncation=True,
        )
        summaries.append(result[0]['summary_text'])
    return summaries

In [12]:
# Apply summarization
# summarize_texts expects a collection of texts and returns one summary per
# text in the same order, so hand it the whole column rather than calling it
# per string through Series.apply.
# NOTE(review): the model is fed the lowercased, stop-word-stripped text;
# consider whether the raw 'story' column is the better input - confirm.
df_val['summary'] = summarize_texts(df_val['cleaned_story'].tolist())

# Display the dataframe with summaries
print(df_val[['story', 'summary']].head())

: 