<a href="https://colab.research.google.com/github/sumant1122/PythonBootcamp/blob/main/Session4_PythonBootcamp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP: Summarization

Import required libraries

In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
import re
from nltk.corpus import stopwords

Read the file

In [None]:
# Load the articles; later cells read the text from the 'article_text' column.
df = pd.read_csv("tennis.csv")
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# Display the full text of the article stored under index 1 as a sample.
df['article_text'][1]

Tokenize the sentences

In [None]:
# Fetch the NLTK data packages this notebook relies on
# (presumably 'punkt_tab' backs sent_tokenize; 'stopwords' backs the stop-word list).
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize

# Split every article into sentences and collect them in one flat list,
# preserving article order and sentence order within each article.
sentences = [
    sentence
    for article in df['article_text']
    for sentence in sent_tokenize(article)
]

## GloVe Word Representation

GloVe, short for Global Vectors for Word Representation, is an unsupervised learning algorithm used to generate word embeddings by analyzing global word-word co-occurrence statistics from a large text corpus.
 This method constructs a co-occurrence matrix, where each element represents how often a pair of words appears together within a specific context window.
 The algorithm then factorizes this matrix to obtain lower-dimensional vector representations for each word, capturing both semantic and syntactic relationships.



In [None]:
# Load the pre-trained GloVe vectors into a dict mapping word -> float32 array.
# Each line is split on whitespace: the first token is the word and the
# remaining tokens are its coefficients.
word_embeddings = {}
# The context manager closes the file even if parsing a line raises.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs

# Keep letters only: every character outside a-z / A-Z becomes a space.
# regex=True is required: since pandas 2.0 the pattern is treated as a literal
# string by default, which would silently leave the sentences uncleaned.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
# Lower-case so tokens compare equal to the lower-case stop-word list.
clean_sentences = [s.lower() for s in clean_sentences]
stop_words = stopwords.words('english')
def remove_stopwords(sen):
    """Drop stop words from a tokenized sentence.

    Parameters
    ----------
    sen : iterable of str
        Tokens of one sentence.

    Returns
    -------
    str
        The tokens not found in ``stop_words``, joined by single spaces
        (an empty string when nothing remains).
    """
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new
# Tokenize each cleaned sentence on whitespace and strip its stop words.
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]

Compute a vector for each sentence

In [None]:
# One 100-dimensional vector per cleaned sentence: the sum of its word
# embeddings divided by (word count + 0.001). Words missing from
# word_embeddings contribute a zero vector; an empty sentence maps to zeros.
sentence_vectors = []
for cleaned in clean_sentences:
  if len(cleaned) == 0:
    sentence_vectors.append(np.zeros((100,)))
    continue
  tokens = cleaned.split()
  token_vectors = [word_embeddings.get(token, np.zeros((100,))) for token in tokens]
  sentence_vectors.append(sum(token_vectors)/(len(tokens)+0.001))

Compute pairwise similarities between the sentences

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all sentence vectors, computed in a
# single call instead of one cosine_similarity call per (i, j) pair.
# sim_mat[i][j] is the similarity of sentence i and sentence j.
if len(sentence_vectors) > 0:
  # Stack the per-sentence vectors into an (n_sentences, 100) matrix.
  sim_mat = cosine_similarity(np.vstack(sentence_vectors)).astype(np.float64)
  # A sentence is not compared with itself: keep the diagonal at zero.
  np.fill_diagonal(sim_mat, 0.0)
else:
  # No sentences: keep the empty square matrix the nested loops would produce.
  sim_mat = np.zeros([0, 0])

Create a graph from the similarity matrix and rank the sentences with PageRank

In [None]:
import networkx as nx

# Build a graph from sim_mat, passed to networkx as the adjacency matrix.
nx_graph = nx.from_numpy_array(sim_mat)
# Run PageRank on the graph; the output cell looks scores up by sentence position.
scores = nx.pagerank(nx_graph)

## Output

In [None]:
# Pair every sentence with its PageRank score and sort best-first.
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)
# Print at most 5 pairs, but never more than the available articles or
# sentences, so a small dataset does not end in an IndexError/KeyError.
# NOTE(review): ranked_sentences is ranked across ALL articles, so the i-th
# "SUMMARY" line is the i-th best sentence overall, not necessarily a
# sentence taken from the i-th article. Confirm this pairing is intended.
for i in range(min(5, len(df), len(ranked_sentences))):
  print("ARTICLE:")
  print(df['article_text'][i])
  print('\n')
  print("SUMMARY:")
  print(ranked_sentences[i][1])
  print('\n')

# HuggingFace Summarization


In [None]:
# Sample passage about the Eiffel Tower, used as input for the summarization pipelines below.
text_example = '''The tower is 324 meters (1,063 ft) tall, about the same height
as an 81-storey building, and the tallest structure in Paris. Its base is square,
measuring 125 meters (410 ft) on each side. During its construction, the Eiffel
Tower surpassed the Washington Monument to become the tallest man-made structure
in the world, a title it held for 41 years until the Chrysler Building in New York
City was finished in 1930. It was the first structure to reach a height of 300 meters.
Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is
now taller than the Chrysler Building by 5.2 meters (17 ft). Excluding transmitters,
the Eiffel Tower is the second tallest free-standing structure in France
after the Millau Viaduct.'''

In [None]:
from transformers import pipeline
# Summarization pipeline using the google/pegasus-cnn_dailymail checkpoint.
summarizer = pipeline("summarization", model = "google/pegasus-cnn_dailymail")
# Summarize the sample passage; the result is shown as the cell output.
summarizer(text_example)

In [None]:
from transformers import pipeline
# Same task with the facebook/bart-large-cnn checkpoint, for comparison.
# Note: this rebinds `summarizer`, replacing the Pegasus pipeline created earlier.
summarizer = pipeline("summarization", model = "facebook/bart-large-cnn")
summarizer(text_example)

# Classification

In [None]:
from transformers import pipeline

# Zero-shot classification: score one sentence against labels supplied at
# call time. No model is named, so the library's default checkpoint is used.
sample_sentence = "There is a sale in supermarket"
topic_labels = ["education", "politics", "business"]

classifier = pipeline("zero-shot-classification")
classifier(sample_sentence, candidate_labels=topic_labels)

In [None]:
# Text classification with no model named, so the library's default checkpoint is used.
# This cell relies on `pipeline` having been imported by an earlier cell.
pipe = pipeline("text-classification")
pipe("I hate the food here")

In [None]:
from transformers import pipeline

# Only the model is given here; no task name is passed to pipeline().
# Note: this rebinds `classifier`, replacing the zero-shot pipeline created earlier.
classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
# Alternative input to try: classifier("This movie is disgustingly good !")

classifier("Director tried too much.")

In [None]:
from transformers import pipeline

# Text-generation pipeline using the HuggingFaceTB/SmolLM2-360M checkpoint.
generator = pipeline("text-generation", model="HuggingFaceTB/SmolLM2-360M")
# Request two continuations of the prompt, with max_length set to 30.
# NOTE(review): depending on the installed transformers version, asking for
# num_return_sequences > 1 may require sampling to be enabled (confirm).
generator(
    "In this course, we will teach you how to",
    max_length=30,
    num_return_sequences=2,
)