In [1]:
import nltk
from gensim.summarization import summarize

nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

def preprocess_text(text):
    sentences = sent_tokenize(text)
    words = [word for word in word_tokenize(text.lower()) if word.isalnum()]
    words = [word for word in words if word not in stopwords.words('english')]
    return sentences, ' '.join(words)

def generate_summary(text):
    sentences, cleaned_text = preprocess_text(text)
    summary = summarize(cleaned_text, ratio=0.2)  # Adjust the ratio as needed
    return summary



ModuleNotFoundError: No module named 'gensim.summarization'

In [8]:
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load pre-trained model and tokenizer
model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Define your list of texts
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "A stitch in time saves nine.",
    "An apple a day keeps the doctor away."
]

# Encode the texts into embeddings
encoded_texts = [tokenizer(text, return_tensors='pt', padding=True, truncation=True) for text in texts]
embeddings = [model(**encoded_text).last_hidden_state.mean(dim=1).detach().numpy() for encoded_text in encoded_texts]

# Query text
query = "lifestyle"

# Encode the query and calculate similarity scores
encoded_query = tokenizer(query, return_tensors='pt', padding=True, truncation=True)
query_embedding = model(**encoded_query).last_hidden_state.mean(dim=1).detach().numpy()
similarities = [cosine_similarity(query_embedding, emb)[0][0] for emb in embeddings]

# Print results
for text, sim in zip(texts, similarities):
    print(f"Text: {text}\nSimilarity: {sim:.4f}\n")

Text: The quick brown fox jumps over the lazy dog.
Similarity: -0.0287

Text: A stitch in time saves nine.
Similarity: -0.0056

Text: An apple a day keeps the doctor away.
Similarity: 0.0442



In [5]:
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

# Define schema for the index
schema = Schema(id=ID(unique=True, stored=True), content=TEXT(stored=True))

# Create an index or open an existing one
index_dir = "index_dir"
ix = create_in(index_dir, schema)
#if not index_dir.exists():
#    index_dir.mkdir()
#    ix = create_in(index_dir, schema)
#else:
#ix = open_dir(index_dir)

# Create or open an index writer
writer = ix.writer()

# Add documents to the index
documents = [
    {"id": "doc1", "content": "The quick brown fox jumps over the lazy dog."},
    {"id": "doc2", "content": "A stitch in time saves nine."},
    {"id": "doc3", "content": "An apple a day keeps the doctor away."}
]

for doc in documents:
    writer.add_document(**doc)

writer.commit()

# Perform full-text search
search_query = "doctor"
with ix.searcher() as searcher:
    query_parser = QueryParser("content", ix.schema)
    query = query_parser.parse(search_query)
    results = searcher.search(query)

    for result in results:
        print(f"Document ID: {result['id']}")

# Clean up resources
ix.close()


Document ID: doc3


In [5]:
from transformers import pipeline

# Load the summarization pipeline
summarizer = pipeline("summarization")

# Input text
input_text = """
It’s the summer of international travel for Americans.

After three years stuck close to home, people are heading to Europe and the Pacific in droves. And they are spending more money when they get there.


The three major US international airlines — American (AAL), Delta (DAL) and United — have all seen a surge in international traffic in recent months and are adding additional service to meet demand.

Travelers walk through Denver International Airport on July 30.
Travelers walk through Denver International Airport on July 30.
Daniel Slim/AFP/Getty Images
Travel from the United States has more than doubled to destinations across the Pacific, including China, where travelers faced severe Covid-related restrictions a year ago, airlines reported. Trans-Atlantic demand for flights is also up.

And the surge in international travel at those three major carriers all came with no appreciable drop-off in domestic traffic or fares.

The top destinations
The strong demand is lifting international fares.

Airfare to Europe this summer is averaging nearly $1,200 per ticket, the highest prices in the last six years, according to Hopper, a travel booking app. Flights to Europe are costing 12% more than last summer, and 23% more than in summer 2019.

Where are people going? London, Paris, Rome and Dublin are the top European destinations for US travelers this summer, according to Hopper.

People enjoy the sunset on a bank of the River Seine, in Paris, France, in June.
People enjoy the sunset on a bank of the River Seine, in Paris, France, in June.
Stephanie Lecocq/Reuters
American said passenger revenue on trans-Atlantic route jumped 45% in the first half of this year, and more than tripled on trans-Pacific routes.

United said miles traveled by paying passengers jumped 23% on trans-Atlantic routes in the second quarter and 172% on trans-Pacific routes.

The gains were even larger at Delta, where trans-Atlantic miles flown by passengers jumped 56% in the first half of the year, and the average amount they paid for each of those miles rose 24%.

Overseas travel is also lifting credit-card companies.

Spending higher than before Covid
At Mastercard, spending linked to overseas travel is currently at 154% of pre-pandemic levels, the company said this week.

Mastercard cited “resilient consumer spending, particularly in travel and experiences.”

Domestic travelers aren’t seeing the same spike in fares. But the three major carriers’ domestic flights are still packed and most measures of domestic fares are still higher than pre-pandemic levels.

CNN’s Chris Isidore contributed to this article.
"""

# Generate abstractive summary
summary = summarizer(input_text, max_length=100, min_length=20, do_sample=True)[0]['summary_text']
print(summary)

No model was supplied, defaulted to sshleifer/distilbart-cnn-12-6 and revision a4f8f3e (https://huggingface.co/sshleifer/distilbart-cnn-12-6).
Using a pipeline without specifying a model name and revision in production is not recommended.


 Travel from the United States has more than doubled to destinations across the Pacific, including China . London, Paris, Rome and Dublin are the top European destinations for US travelers this summer . Airfare to Europe this summer is averaging nearly $1,200 per ticket, the highest prices in the last six years .


In [None]:
input_text = "Your input text goes here..."
summary = generate_summary(input_text)
print(summary)