In [3]:
!pip install transformers
!pip install torch
!pip install datsets transformers[sentencepiece]
!pip install sentencepiece

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m59.1 MB/s[0m eta [36m0:00:0

## **Installing Modules**

In [4]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

## **Model Initialization**

In [5]:
# Initializing T5 model - used t5 small varient
model = T5ForConditionalGeneration.from_pretrained('t5-small') ##-----T5 Model for text related tasks
tokenizer = T5Tokenizer.from_pretrained('t5-small',legacy=True) ##-------For tokenizing text, (specifically designed for T5)
device = torch.device('cpu')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

## **Input Text**

In [6]:
# Input Text
text = """
On 22 July 2019, ISRO launched Chandrayaan-2 on board a Launch Vehicle Mark-3 (LVM3) launch vehicle consisting of an orbiter, a lander and a rover.[15] The lander was scheduled to touch down on the lunar surface on 6 September 2019 to deploy the Pragyan rover. The lander ultimately crashed when it lost contact with earth (ISRO) and deviated from its intended trajectory while attempting to land near the lunar south pole.[16][17]

The lunar South Pole region holds particular interest for scientific exploration due to studies that show large amounts of ice there. Mountainous terrain and unpredictable lighting conditions not only protect the ice from melting, but also make landing scientific probes there a challenging undertaking. This ice could contain solid-state compounds that would normally melt under warmer conditions elsewhere on the Moon, compounds which could provide insight into lunar, Earth, and Solar System history. Ice could also be used as a source of drinking water and hydrogen for fuel and oxygen for future manned missions and outposts.[18][19]

The European Space Tracking network (ESTRACK), operated by the European Space Agency (ESA), is supporting the mission. Under a new cross-support arrangement, ESA tracking support could be provided for upcoming ISRO missions such as those of India's first human spaceflight programme, Gaganyaan, and the Aditya-L1 solar research mission. In return, future ESA missions will receive similar support from ISRO's own tracking stations.
"""

## **Pre-Processing**

In [7]:
# Preprocessed the text for T5 model input
preprocessed_text = text.strip().replace('\n','') ##---remove leading & trailing space, replace "\n" by ""
t5_input_text = 'summarize: '+ preprocessed_text ##----- adding prefix

In [8]:
# Length of text input
len(t5_input_text.split())

228

In [9]:
# Tokenization of input text
tokenized_text = tokenizer.encode(t5_input_text, return_tensors='pt', max_length=512).to(device) ##---returns Pytorch tensors

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


## **Generate Summary**

In [10]:
# Generating summary within 30-120 word limit
summary_ids = model.generate(tokenized_text, min_length=30, max_length=120)
# Decoding the generated summary back into human readable
summary = tokenizer.decode(summary_ids[0],skip_special_tokens=True) #----skipping special tokens

In [11]:
# output summary
summary

'lander was scheduled to touch down on the lunar surface on 6 September 2019. it crashed when it lost contact with earth and deviated from its intended trajectory. ice could be used as a source of drinking water and hydrogen for fuel and oxygen.'

## **Streamlit Application**

In [12]:
!pip install streamlit -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.1/8.1 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.8/164.8 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.7/4.7 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.1/82.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [13]:
# Creating app file
%%writefile app.py
import streamlit as st
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, T5Config

st.title('Text Summarization')

# Initializing T5 model
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
device = torch.device('cpu')

user_input = st.text_area("Enter your text here", "")

if st.button("Generate Summary"):
  if user_input:
    # Pre-processing
    preprocessed_text = user_input.strip().replace('\n','')
    t5_input_text = 'summarize: '+ preprocessed_text

    # Tokenization
    tokenized_text = tokenizer.encode(t5_input_text, return_tensors='pt', max_length=512).to(device)

    # Generating summary
    summary_ids = model.generate(tokenized_text, min_length=30, max_length=120)
    summary = tokenizer.decode(summary_ids[0],skip_special_tokens=True)
    st.write("Summary: ")
    st.write(summary)
  else:
    st.warning("Please enter some text.")

Writing app.py


In [14]:
!wget -q -O - ipv4.icanhazip.com

34.125.49.65


In [15]:
!streamlit run app.py & npx localtunnel --port 8501

[..................] | fetchMetadata: sill resolveWithNewModule localtunnel@2.0[0m[K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.125.49.65:8501[0m
[0m
[K[?25hnpx: installed 22 in 8.732s
your url is: https://polite-actors-attend.loca.lt
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Truncation was not explicitly activated but `max