<a href="https://colab.research.google.com/github/vedantbhawnani/WebSpider/blob/main/Webspider.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from bs4 import BeautifulSoup, Comment
import requests
import pandas
import time

# Imports for spacy summarisation
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy.lang.en.stop_words import STOP_WORDS

# Imports for nltk summarisation
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('popular', quiet = True)

# common for both spacy and nltk
from heapq import nlargest

# Textrank imports
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Gemini Imports
import google.generativeai as genai
from google.colab import userdata

import re

In [None]:
from IPython.display import display
from IPython.display import Markdown
import textwrap

def to_markdown(text):
  """Render ``text`` as a block-quoted IPython ``Markdown`` object.

  Bullet characters ('•') are rewritten as indented Markdown list markers and
  every line, including blank ones, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  # The always-true predicate makes textwrap.indent prefix empty lines too.
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda line: True)
  return Markdown(quoted)

In [None]:
#@title Extracting text from URLs
def preprocess_text(html_content):
    """Extract readable paragraph and heading text from an HTML document.

    Args:
        html_content: Raw HTML markup to parse (the caller in this notebook
            passes ``response.content`` from ``requests``).

    Returns:
        A single string containing the text of every ``<p>``, ``<h1>``,
        ``<h2>`` and ``<h3>`` element joined by spaces, with square-bracketed
        spans (e.g. ``[12]``) removed, whitespace runs collapsed to a single
        space, and leading/trailing whitespace stripped.
    """
    soup = BeautifulSoup(html_content, 'lxml')

    # Remove HTML comments and non-content tags before collecting text.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()
    for tag in soup(["script", "style"]):
        tag.extract()

    blocks = soup.find_all(['p', 'h1', 'h2', 'h3'])
    text = " ".join(block.get_text() for block in blocks)
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    text = re.sub(r"(\[[^\]]*\])", "", text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Pages whose text is scraped and summarised by the cells that follow.
urls = ['https://www.geeksforgeeks.org/introduction-machine-learning/', 'https://www.ibm.com/topics/machine-learning', 'https://en.wikipedia.org/wiki/Machine_learning']

# `data` holds one cleaned text string per URL, in the same order as `urls`.
data = []
for url in urls:
    # The timeout stops a stalled server from hanging the notebook indefinitely.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
    data.append(preprocess_text(response.content))

# All pages combined into one string, separated by ". ".
all_text = ". ".join(data)

In [None]:
# Display the combined text of all scraped pages (the pages joined with ". ").
all_text

'Getting Started with Machine Learning Data Preprocessing Classification & Regression K-Nearest Neighbors (KNN) Support Vector Machines Decision Tree Ensemble Learning Generative Model Time Series Forecasting Clustering Algorithm Convolutional Neural Networks Recurrent Neural Networks Reinforcement Learning Model Deployment and Productionization Advanced Topics An introduction to Machine Learning Arthur Samuel, an early American leader in the field of computer gaming and artificial intelligence, coined the term “Machine Learning ” in 1959 while at IBM. He defined machine learning as “the field of study that gives computers the ability to learn without being explicitly programmed “. However, there is no universally accepted definition for machine learning. Different authors define the term differently. We give below two more definitions. Machine learning is a subfield of artificial intelligence that involves the development of algorithms and statistical models that enable computers to i

In [None]:
#@title TextRank
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# NLTK's English stop-word list as a set; textrank() below reads it as a global.
sw = set(stopwords.words('english'))

def textrank(text, use_tfidf = True, use_ner = True, top_n = 5):
    """Build an extractive summary by ranking sentences with PageRank.

    Sentences are vectorised, a cosine-similarity graph is built over them,
    and PageRank scores decide which sentences are kept.

    Args:
        text: Text to summarise.
        use_tfidf: If True, sentences are represented by TF-IDF vectors fitted
            on the sentences themselves. If False, each sentence is the mean of
            its non-stop-word vectors looked up in the module-level ``embeds``
            dict (100-dimensional; unknown words count as zero vectors), which
            must have been loaded before the call.
        use_ner: If True, 0.5 is added to a sentence's score for every named
            entity the module-level spaCy pipeline ``nlp`` finds in it.
        top_n: Number of highest-scoring sentences to return (default 5).

    Returns:
        The ``top_n`` best sentences joined by single spaces, ordered from
        highest to lowest score; an empty string if ``text`` has no sentences.
    """
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to rank.
        return ''

    if use_tfidf:
        # Kept sparse; cosine_similarity accepts sparse input directly.
        sent_vectors = TfidfVectorizer().fit_transform(sentences)
    else:
        # Lower-cased, stop-word-free tokens per sentence (only this path needs them).
        sents = [[w.lower() for w in sent.split() if w.lower() not in sw] for sent in sentences]
        vectors = []
        for words in sents:
            if words:
                v = sum([embeds.get(w, np.zeros((100,))) for w in words])/len(words) + 0.001
            else:
                v = np.zeros((100,))
            vectors.append(v)
        sent_vectors = np.vstack(vectors)

    # One vectorised call gives the full pairwise matrix; self-similarity is
    # zeroed so a sentence does not vote for itself.
    sim_matrix = cosine_similarity(sent_vectors)
    np.fill_diagonal(sim_matrix, 0.0)

    graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(graph)

    if use_ner:
        # Reuse the already-loaded module-level pipeline instead of reloading
        # the spaCy model on every call.
        for i, doc in enumerate(nlp.pipe(sentences)):
            scores[i] = scores[i] + 0.5 * len(doc.ents)

    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse = True)
    return ' '.join(sent for _, sent in ranked_sentences[:top_n])

# Two-pass summary: summarise each page on its own, then summarise the
# concatenation of those summaries. The per-page summaries are joined with a
# space so the last sentence of one page is not fused to the first sentence of
# the next, which would prevent the second pass from splitting them.
s_text = " ".join(textrank(d, use_ner = False) for d in data)

textrank(s_text)

"B. Unsupervised learning: Unsupervised learning is a type of machine learning algorithm used to draw inferences from datasets consisting of input data without labeled responses.Machine learning (ML) is a branch of artificial intelligence (AI) and computer science that focuses on the using data and algorithms to enable AI to imitate the way that humans learn, gradually improving its accuracy. A neural network that consists of more than three layers—which would be inclusive of the input and the output—can be considered a deep learning algorithm or a deep neural network.Contents Machine learning Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalize to unseen data, and thus perform tasks without explicit instructions. Classification of Machine Learning Machine learning implementations are classified into four major categories, depending on the nature of the learning “s

## Frequency-Based Summarization

In [None]:
#@title Using spacy to generate summary
def spacy_summary(text, per):
    """Frequency-based extractive summary using spaCy tokenisation.

    Each non-stop-word token is weighted by its count divided by the count of
    the most frequent token; a sentence's score is the sum of the weights of
    its tokens.

    Args:
        text: Text to summarise.
        per: Fraction of the sentences to keep; ``int(per * number_of_sentences)``
            sentences are returned, so small values on short texts can yield
            an empty summary.

    Returns:
        The selected sentences joined by single spaces, highest score first;
        an empty string when ``text`` contains no scorable words.
    """
    doc = nlp(text)
    stop_words = set(stopwords.words('english'))

    freq = {}
    for token in doc:
        # Punctuation and whitespace tokens are not words; counting them lets
        # "," or "." become the most frequent "term" and distorts every weight.
        if token.is_punct or token.is_space:
            continue
        key = token.text.lower()
        if key not in stop_words:
            freq[key] = freq.get(key, 0) + 1

    if not freq:
        # Empty or stop-word-only input: nothing to score.
        return ''

    max_freq = max(freq.values())
    freq = {word: count / max_freq for word, count in freq.items()}

    sents = list(doc.sents)
    sent_scores = {}
    for sent in sents:
        for token in sent:
            weight = freq.get(token.text.lower())
            if weight is not None:
                sent_scores[sent] = sent_scores.get(sent, 0) + weight

    summarized = nlargest(int(per*len(sents)), sent_scores, key = sent_scores.get)
    return ' '.join(sent.text for sent in summarized)

# Summarise the first scraped page, keeping about half of its sentences.
spacy_summary(data[0], 0.5)

'Support Vector Machines Decision Tree Ensemble Learning Generative Model Time Series Forecasting Clustering Algorithm Convolutional Neural Networks Recurrent Neural Networks Reinforcement Learning Model Deployment and Productionization Advanced Topics An introduction to Machine Learning Arthur Samuel, an early American leader in the field of computer gaming and artificial intelligence, coined the term “Machine Learning ” in 1959 while at IBM He defined machine learning as “the field of study that gives computers the ability to learn without being explicitly programmed “ However, there is no universally accepted definition for machine learning Different authors define the term differently We give below two more definitions Machine learning is a subfield of artificial intelligence that involves the development of algorithms and statistical models that enable computers to improve their performance in tasks through experience These algorithms and models are designed to learn from data and

In [None]:
#@title Using nltk to generate summary
def nltk_summary(text, per):
    """Frequency-based extractive summary using NLTK tokenisation.

    Each non-stop-word token is weighted by its count divided by the count of
    the most frequent token; a sentence's score is the sum of the weights of
    its word tokens.

    Args:
        text: Text to be summarised.
        per: How much of the text to return. A value strictly between 0 and 1
            is treated as the fraction of sentences to keep; any other value
            is converted with ``int()`` and used as the number of sentences.

    Returns:
        The selected sentences joined by single spaces, highest score first;
        an empty string when ``text`` contains no scorable words.
    """
    stop_words = set(stopwords.words('english'))

    def is_scorable(token):
        # Skip stop words and pure-punctuation tokens such as "," or "``".
        return token not in stop_words and any(ch.isalnum() for ch in token)

    freq = {}
    for token in word_tokenize(text):
        key = token.lower()
        if is_scorable(key):
            freq[key] = freq.get(key, 0) + 1

    if not freq:
        # Empty or stop-word-only input: nothing to score.
        return ''

    max_freq = max(freq.values())
    freq = {word: count / max_freq for word, count in freq.items()}

    sents = sent_tokenize(text)
    sent_scores = {}
    for sent in sents:
        # Tokenise into words; iterating the string directly would walk it
        # character by character and never match the word-level keys in freq.
        for token in word_tokenize(sent):
            weight = freq.get(token.lower())
            if weight is not None:
                sent_scores[sent] = sent_scores.get(sent, 0) + weight

    # nlargest needs an integer count, so fractions are converted first.
    if 0 < per < 1:
        n_sentences = int(per * len(sents))
    else:
        n_sentences = int(per)

    summarized = nlargest(n_sentences, sent_scores, key = sent_scores.get)
    return ' '.join(summarized)

# Summarise the first scraped page, keeping its 5 top-scoring sentences.
nltk_summary(data[0], 5)

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2024-05-06 09:12:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... ^C
unzip:  cannot find or open glove*.zip, glove*.zip.zip or glove*.zip.ZIP.

No zipfiles found.


In [None]:
# Load GloVe vectors into a word -> float32 vector lookup. Each line of the file
# is a token followed by its coefficients; the file name indicates 100
# dimensions, matching the np.zeros((100,)) fallback used by textrank()'s
# non-TF-IDF path, which reads `embeds` as a global.
embeds = {}
# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeds[word] = coefs

## LLMs

In [None]:
#@title Gemini
def llm(data, per = 0.3):
    """Summarise ``data`` with the Gemini ``gemini-pro`` model.

    The API key is read from the Colab secret named ``API_KEY``.

    Args:
        data: Text to summarise; sent to the model together with the prompt.
        per: Target summary length as a fraction of the original text
            (default 0.3); it is only stated in the prompt, not enforced.

    Returns:
        The text of the model's response.
    """
    genai.configure(api_key=userdata.get('API_KEY'))
    model = genai.GenerativeModel('gemini-pro')
    # {per:.0%} renders 0.3 as "30%" and avoids float noise such as "28.999999999999996%".
    prompt = f"Summarize the provided content in the best possible way, without losing the intended meaning. Do not use data from any other sources, apart from the context provided. The response length should be {per:.0%} of the original context."
    response = model.generate_content([prompt, data])
    return response.text

# Summarise the third scraped page (data[2]) with Gemini.
llm(data[2])

'Machine learning (ML) is a field in artificial intelligence that focuses on algorithms that can learn from data and generalize to unseen data. It is widely used in various domains, including natural language processing, computer vision, speech recognition, and medicine. ML models are trained on a dataset and evaluated based on their ability to predict or classify new data points accurately.\n\nOne of the key challenges in ML is overfitting, where a model learns specific details of the training dataset rather than general patterns. To address this, techniques such as regularization, cross-validation, and model selection are employed. Additionally, feature selection and engineering play a crucial role in improving model performance.\n\nML models can be categorized into three main types: supervised learning, unsupervised learning, and reinforcement learning. Supervised learning involves models that learn from labeled data (e.g., classifying emails as spam or not). Unsupervised learning d

In [None]:
#@title Pegasus
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Load the "google/pegasus-large" checkpoint and its matching tokenizer once;
# pegasus() below reads both as module-level globals.
model = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-large")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
def pegasus(data):
    """Summarise ``data`` with the module-level Pegasus ``model`` and ``tokenizer``.

    Args:
        data: Text to summarise.

    Returns:
        The decoded summary string for the first generated sequence.
    """
    # Request truncation explicitly so max_length is enforced on long inputs,
    # and never exceed the limit the tokenizer advertises for this checkpoint.
    max_len = min(4000, tokenizer.model_max_length)
    inputs = tokenizer(data, padding = "longest", truncation = True, return_tensors = "pt", max_length = max_len)
    summary_ids = model.generate(inputs["input_ids"])
    # The original computed this value but never returned it.
    return tokenizer.batch_decode(summary_ids, skip_special_tokens = True, clean_up_tokenization_spaces = False)[0]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# # !pip install git+https://github.com/abenassi/Google-Search-API
# from google import search

# def main():
#     query = input("Enter your search query: ")
#     num_websites = 3

#     search_results = google.search(query, num_websites)
#     print(search_results)
#     all_content = ""

#     for result in search_results:
#         url = result.link
#         print(f"Processing URL: {url}")

#         try:
#             response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
#             extracted_content = preprocess_text(response.content)
#             all_content += extracted_content + " "  # Combine content with a space
#         except Exception as e:
#             print(f"Error processing {url}: {e}")

#     # Summarize the combined content:
#     if all_content:
#         summary = textrank(all_content)  # Example using TextRank
#         print(f"\nCombined Summary:\n{summary}")
#     else:
#         print("No content was extracted.")

# if __name__ == "__main__":
#     main()
