--------------
## Exercise 1


In [1]:
import re

class TextManipulator:
    def __init__(self, text):
        self.text = text

    def remove_last_20_chars(self):
        return self.text[:-20]

    def remove_url_prefix(self):
        url_start = self.text.find('https://')
        return self.text[:url_start]

    def remove_urls(self):
        url_regex = r'https?://\S+\.\S+'
        return re.sub(url_regex, '', self.text)

# Input text
text = "The link to latest football score. https://xyz.com/a/b"

# Create an instance of TextManipulator
text_manipulator = TextManipulator(text)

# Remove last 20 characters (basic)
output1 = text_manipulator.remove_last_20_chars()
print(f'Output #1: {output1}')

# Remove URL prefix (prefix)
output2 = text_manipulator.remove_url_prefix()
print(f'Output #2: {output2}')

# Remove URLs using regex (ready to serve method)
output3 = text_manipulator.remove_urls()
print(f'Output #3: {output3}')


Output #1: The link to latest football score.
Output #2: The link to latest football score. 
Output #3: The link to latest football score. 


In [3]:
import re

def remove_words_with_regex(text, regex = r'', doSplit = False):
    if doSplit:
        text = re.findall(regex, text)
    return re.sub(regex, '', text)

regex = r'\b\w*\d\w*\b'
# Input text
text = "Hello Maria whatsup123"

# Call the function to remove words with digits
output = remove_words_with_regex(text,regex)

# Print the output
print(output)

Hello Maria 


In [4]:
import re

text = "Mado is very good with last ball six #dhoni #six"
regex = r'\s*#\w+\s'
output = remove_words_with_regex(text,regex)
print(output)

Mado is very good with last ball six#six


In [4]:
import re


regex = r'(\d+|\D+)'
text = "I will be buying movie tickets for 4adults"
splitted = re.findall(regex, text)
output = ' '.join(splitted)
print(output)

I will be buying movie tickets for  4 adults


## Exercise 2 

In [8]:
import pandas as pd
df = pd.read_csv('data/text.csv')

In [9]:
df.sample(20)

Unnamed: 0,text,labels
1666,O'Sullivan keeps his powder dry\n\nWhen you ar...,sport
1870,Millions buy MP3 players in US\n\nOne in 10 ad...,tech
1527,Zambia confident and cautious\n\nZambia's tech...,sport
675,TV station refuses adoption show\n\nA TV stati...,entertainment
2081,EU software patent law delayed\n\nControversia...,tech
410,Tsunami cost hits Jakarta shares\n\nThe stock ...,business
1451,Chelsea denied by James heroics\n\nA brave def...,sport
616,Manics in charge of BBC 6 Music\n\nThe Manic S...,entertainment
1837,EU software patent law faces axe\n\nThe Europe...,tech
1571,Benitez deflects blame from Dudek\n\nLiverpool...,sport


In [10]:
df.labels.value_counts()

labels
sport            511
business         510
politics         417
tech             401
entertainment    386
Name: count, dtype: int64

In [11]:
summarize_samples = df.sample(3)
texts = summarize_samples.text.to_list()
labels = summarize_samples.labels.to_list(),

In [9]:
texts

["Can Smith work Scottish wonders?\n\nThe worst kept secret in Scottish football was revealed on Thursday when Walter Smith was named as the new national manager.\n\nFrom the moment Berti Vogts' miserable tenure in charge of Scotland ended, the former Rangers and Everton boss has been the overwhelming favourite for the post. But is Smith the man for what must be one of the hardest jobs in football? The 56-year-old takes over at a time when the national side is in the doldrums. Scotland have not reached a major finals since the World Cup in 1998 and reaching Germany 2006 looks near impossible, having picked up just two points from the opening three games in the qualifying race. And the Fifa rankings see Scotland listed at an all time low of 77th, below the likes of Estonia, Ghana, Angola and Thailand. Scotland are not blessed with quality players with experience at the top level, so Smith will have to get the best out of meagre resources. Smith's track record make impressive reading and

### Model

In [10]:
!pip install transformers



In [11]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("summarization", model="facebook/bart-large-cnn")

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from datetime import datetime

def evaluate_time_and_summ(texts):
    summs = []
    start_time = datetime.now()  # Record the start time
    for text in texts:
        # Assuming pipe() is a function that returns a summary
        summary = pipe(text, max_length=100, min_length=20)
        summs.append(summary)
        end_time = datetime.now()  # Record the end time
        summs.append(end_time - start_time)

    

    return summs


In [13]:
summary = evaluate_time_and_summ(texts)

In [14]:
summary

[[{'summary_text': 'Walter Smith has been named as the new manager of Scotland. The 56-year-old replaces Berti Vogts, who was sacked on Wednesday. Scotland have not reached a major finals since the World Cup in 1998.'}],
 datetime.timedelta(seconds=23, microseconds=389042),
 [{'summary_text': 'Kostas Kenteris and Katerina Thanou have been cleared of doping offences. The duo had been provisionally suspended by the IAAF for allegedly missing three drugs tests, including one on the eve of the Athens Olympics. But the Greek Athletics Federation tribunal has overturned the bans. The IAAF can now contest the decision at the Court of Arbitration for Sport.'}],
 datetime.timedelta(seconds=37, microseconds=377484),
 [{'summary_text': "Alfa-Eco, the venture capital arm of Russian conglomerate Alfa Group, has a one-fifth stake in Sun Interbrew. The deal gives Inbev, the world's biggest beermaker, near-total control over the Russian brewer."}],
 datetime.timedelta(seconds=46, microseconds=668003)]

### Exercise 3

In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_path = './model'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
from transformers import pipeline

classifier = pipeline('zero-shot-classification')

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [12]:
results = []
for text in texts:
    results.append(classifier(text, candidate_labels = labels))

In [13]:
for text, label in zip(results, labels):
    pred_label = text['labels'][0]    
    print(f'Pred : {pred_label} | Actualy: {label}')

Pred : ['politics', 'sport', 'politics'] | Actualy: ['politics', 'sport', 'politics']


In [1]:
with open('data/summary-1-flan-ul2--article1.txt', 'r') as file:
    reference_summary = file.read()
with open('data/summary-2-flan-ul2--article1.txt', 'r') as file:
    candidate_summary = file.read()
print(reference_summary)
print(candidate_summary)

People are using AI chatbots to fill junk websites with AI-generated text that attracts paying advertisers, according to a new report from the media research organization NewsGuard that was shared exclusively with MIT Technology Review. Over 140 major brands are paying for ads that end up on unreliable AI-written sites, likely without their knowledge. Ninety percent of the ads from major brands found on these AI-generated news sites were served by Google, though the company’s own policies prohibit sites from placing Google-served ads on pages that include “spammy automatically generated content.” The practice threatens to hasten the arrival of a glitchy, spammy internet that is overrun by AI-generated content, as well as wasting massive amounts of ad money.

A new report finds that sites run with AI-generated content serve ads from major brands, which mostly come from Google. Some of those sites contained dangerous misinformation. And this is just getting started. More could be on the 

In [15]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading click-8.1.7-py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, nltk
Successfully installed click-8.1.7 nltk-3.8.1


In [2]:
from nltk.translate.bleu_score import sentence_bleu

bleu = sentence_bleu(reference_summary, candidate_summary)
print(f'BLEU score {bleu}')

BLEU score 1.0025117266892697e-231


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
