# Parsing

In [None]:
import nltk
from nltk import CFG

# Toy context-free grammar: a sentence (S) is a noun phrase (NP) followed by
# a verb phrase (VP); the quoted symbols are the terminal words.
grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the'
    N -> 'cat' | 'dog'
    V -> 'chased'
""")

# NLTK shift-reduce parser built from the grammar above.
parser = nltk.ShiftReduceParser(grammar)

# Input sentence, tokenized on whitespace.
sentence = "the cat chased the dog".split()

def shift_reduce_parsing(parser, sentence):
    """Print a step-by-step trace of a greedy shift-reduce parse.

    Every step shifts the next token onto the stack and then keeps reducing
    for as long as the top of the stack equals the right-hand side of some
    production; the first matching production in grammar order is applied.
    If the stack ends up holding only the start symbol, the trees returned
    by ``parser.parse`` are pretty-printed.

    Args:
        parser: Object exposing ``grammar()`` and ``parse(tokens)``, such as
            ``nltk.ShiftReduceParser``.
        sentence: Iterable of word tokens. It is copied, never modified.

    Returns:
        None. The trace and the result are written to stdout.

    Note:
        The reduction loop does not terminate for grammars with cyclic unit
        productions (e.g. ``A -> B`` together with ``B -> A``).
    """
    # Use the rules of the parser that was passed in, so the trace always
    # agrees with the grammar used for the final parse.
    grammar = parser.grammar()
    productions = grammar.productions()

    tokens = list(sentence)
    stack = []

    print("Initial State:")
    print(f"Stack: {stack}")
    print(f"Buffer: {tokens}")
    print("-" * 40)

    for step, token in enumerate(tokens, start=1):
        buffer = tokens[step:]  # tokens that have not been shifted yet

        print(f"Step {step}:")
        stack.append(token)
        print(f"Shift: {token}")
        print(f"Stack: {stack}")
        print(f"Buffer: {buffer}")

        # Reduce repeatedly until no production matches the top of the stack.
        while True:
            for production in productions:
                rhs = list(production.rhs())
                # An empty right-hand side has nothing to match on the stack,
                # and ``stack[-0:]`` would denote the whole stack.
                if not rhs or stack[-len(rhs):] != rhs:
                    continue
                stack[-len(rhs):] = [production.lhs()]
                print(f"Reduce: {production}")
                print(f"Stack: {stack}")
                print(f"Buffer: {buffer}")
                break
            else:
                break  # a full pass over the grammar found nothing to reduce

        print("-" * 40)

    if stack == [grammar.start()]:
        print("Parsing successful! Final parse tree:")
        for tree in parser.parse(tokens):
            tree.pretty_print()
    else:
        print("Parsing failed. No valid parse tree found.")

# Print the step-by-step trace for the example sentence.
shift_reduce_parsing(parser, sentence)

Initial State:
Stack: []
Buffer: ['the', 'cat', 'chased', 'the', 'dog']
----------------------------------------
Step 1:
Shift: the
Stack: ['the']
Buffer: ['cat', 'chased', 'the', 'dog']
Reduce: Det -> 'the'
Stack: [Det]
Buffer: ['cat', 'chased', 'the', 'dog']
----------------------------------------
Step 2:
Shift: cat
Stack: [Det, 'cat']
Buffer: ['chased', 'the', 'dog']
Reduce: N -> 'cat'
Stack: [Det, N]
Buffer: ['chased', 'the', 'dog']
Reduce: NP -> Det N
Stack: [NP]
Buffer: ['chased', 'the', 'dog']
----------------------------------------
Step 3:
Shift: chased
Stack: [NP, 'chased']
Buffer: ['the', 'dog']
Reduce: V -> 'chased'
Stack: [NP, V]
Buffer: ['the', 'dog']
----------------------------------------
Step 4:
Shift: the
Stack: [NP, V, 'the']
Buffer: ['dog']
Reduce: Det -> 'the'
Stack: [NP, V, Det]
Buffer: ['dog']
----------------------------------------
Step 5:
Shift: dog
Stack: [NP, V, Det, 'dog']
Buffer: []
Reduce: N -> 'dog'
Stack: [NP, V, Det, N]
Buffer: []
Reduce: NP -> Det N

# Extractive Summarization

In [None]:
import nltk
import heapq
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

# Fetch the NLTK resources this cell relies on (tokenizer data and stop words).
for resource in ("punkt", "stopwords", "punkt_tab"):
    nltk.download(resource)

def summarize_text(text, num_sentences=2):
    """Return an extractive summary made of the highest-scoring sentences.

    Each word is weighted by how often it occurs in ``text`` (stop words and
    non-alphanumeric tokens are ignored), normalised by the count of the most
    frequent word. A sentence scores the sum of the weights of its words, and
    the ``num_sentences`` best sentences are joined with single spaces,
    highest score first.

    Args:
        text: Document to summarise.
        num_sentences: Maximum number of sentences in the summary.

    Returns:
        The summary string. It is empty when ``text`` contains no scorable
        word (empty input, or only stop words / punctuation).
    """
    stop_words = set(stopwords.words("english"))

    word_frequencies = {}
    for token in word_tokenize(text):
        token = token.lower()
        if token.isalnum() and token not in stop_words:
            word_frequencies[token] = word_frequencies.get(token, 0) + 1

    # Without this guard max() below raises ValueError on an empty table.
    if not word_frequencies:
        return ""

    max_freq = max(word_frequencies.values())
    for token in word_frequencies:
        word_frequencies[token] /= max_freq

    # Only sentences containing at least one weighted word get an entry, so
    # sentences without any scorable word can never be selected.
    sentence_scores = {}
    for sent in sent_tokenize(text):
        for token in word_tokenize(sent.lower()):
            if token in word_frequencies:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + word_frequencies[token]

    summary_sentences = heapq.nlargest(num_sentences, sentence_scores, key=sentence_scores.get)
    return " ".join(summary_sentences)

# Sample document for the frequency-based summarizer.
text = """Deep learning is a powerful subset of artificial intelligence that focuses on training neural networks with multiple layers to analyze and interpret complex data. It has revolutionized fields like computer vision, natural language processing, and speech recognition by enabling machines to learn from vast amounts of data. Techniques such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs) are widely used for tasks like image classification, language translation, and predictive analytics. Deep learning models excel at identifying patterns and making decisions with minimal human intervention, driving innovations in autonomous vehicles, healthcare diagnostics, and personalized recommendations. Its ability to process unstructured data makes it a cornerstone of modern AI advancements.
"""

summary = summarize_text(text, num_sentences=2)
# Format the summary into the string itself: passing it as a second print()
# argument inserts the default separator (a space) in front of it.
print(f"Summary:\n{summary}")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Summary:
 Techniques such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs) are widely used for tasks like image classification, language translation, and predictive analytics. Deep learning is a powerful subset of artificial intelligence that focuses on training neural networks with multiple layers to analyze and interpret complex data.


In [None]:
!pip install sumy

Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl.metadata (7.5 kB)
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: breadability, docopt
  Building wheel for breadability (setup.py) ... [?25l[?25hdone
  Created wheel for breadability: filename=brea

# Extractive (LSA) and Abstractive (T5) Summarization

In [None]:
import nltk
import re
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer
from transformers import pipeline

def text_summarization(text, num_sentences=3):
    """Summarise ``text`` with sumy's LSA summarizer.

    The text is parsed as plain text with the English tokenizer, the
    summarizer is asked for ``num_sentences`` sentences, and the returned
    sentences are converted to strings and joined with single spaces.
    """
    plaintext = PlaintextParser.from_string(text, Tokenizer("english"))
    lsa = LsaSummarizer()
    selected = lsa(plaintext.document, num_sentences)
    return " ".join(map(str, selected))

def text_parsing(text):
    """Split ``text`` into sentences and tokenize each sentence into words.

    Returns:
        A ``(sentences, words)`` pair, where ``words`` holds one token list
        per sentence, in the same order as ``sentences``.
    """
    sentences = nltk.sent_tokenize(text)
    words = []
    for sentence in sentences:
        words.append(nltk.word_tokenize(sentence))
    return sentences, words

# Summarization pipelines that were already built, keyed by model name, so
# repeated calls do not rebuild the model each time.
_SUMMARIZATION_PIPELINES = {}


def abstractive_summarization(text, max_length=100, min_length=30, model_name="t5-small"):
    """Summarise ``text`` with a Hugging Face ``summarization`` pipeline.

    Args:
        text: Text to summarise.
        max_length: ``max_length`` forwarded to the pipeline call.
        min_length: ``min_length`` forwarded to the pipeline call.
        model_name: Model identifier passed to ``pipeline``; defaults to the
            previously hard-coded ``"t5-small"``.

    Returns:
        The ``summary_text`` of the first result returned by the pipeline.
    """
    summarizer = _SUMMARIZATION_PIPELINES.get(model_name)
    if summarizer is None:
        # Built once per model name and reused on later calls.
        summarizer = pipeline("summarization", model=model_name)
        _SUMMARIZATION_PIPELINES[model_name] = summarizer

    summary = summarizer(text, max_length=max_length, min_length=min_length, do_sample=False)
    return summary[0]["summary_text"]

if __name__ == "__main__":
    # Demo: run both summarizers and the tokenizer on one sample paragraph.
    sample_text = """
    Deep learning is a powerful subset of artificial intelligence that focuses on training neural networks with multiple layers to analyze and interpret complex data. It has revolutionized fields like computer vision, natural language processing, and speech recognition by enabling machines to learn from vast amounts of data. Techniques such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs) are widely used for tasks like image classification, language translation, and predictive analytics. Deep learning models excel at identifying patterns and making decisions with minimal human intervention, driving innovations in autonomous vehicles, healthcare diagnostics, and personalized recommendations. Its ability to process unstructured data makes it a cornerstone of modern AI advancements.
    """

    print("Original Text:")
    print(sample_text)

    # Each heading is printed before its summarizer runs, so anything the
    # summarizer itself writes appears under the matching heading.
    summarizers = (
        ("\nExtractive Summarized Text:", text_summarization),
        ("\nAbstractive Summarized Text:", abstractive_summarization),
    )
    for heading, summarize in summarizers:
        print(heading)
        print(summarize(sample_text))

    sentences, words = text_parsing(sample_text)
    for heading, parsed in (("\nParsed Sentences:", sentences), ("\nParsed Words:", words)):
        print(heading)
        print(parsed)

Original Text:

    Deep learning is a powerful subset of artificial intelligence that focuses on training neural networks with multiple layers to analyze and interpret complex data. It has revolutionized fields like computer vision, natural language processing, and speech recognition by enabling machines to learn from vast amounts of data. Techniques such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs) are widely used for tasks like image classification, language translation, and predictive analytics. Deep learning models excel at identifying patterns and making decisions with minimal human intervention, driving innovations in autonomous vehicles, healthcare diagnostics, and personalized recommendations. Its ability to process unstructured data makes it a cornerstone of modern AI advancements.
    

Extractive Summarized Text:
Deep learning is a powerful subset of artificial intelligence that focuses on training neural networks with multiple layers to anal

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cpu


deep learning is a powerful subset of artificial intelligence that focuses on training neural networks with multiple layers to analyze and interpret complex data . it has revolutionized fields like computer vision, natural language processing, and speech recognition . techniques such as convolutional neural networks (CNNs) and recurrent neural networks are widely used for tasks like image classification, language translation, predictive analytics .

Parsed Sentences:
['\n    Deep learning is a powerful subset of artificial intelligence that focuses on training neural networks with multiple layers to analyze and interpret complex data.', 'It has revolutionized fields like computer vision, natural language processing, and speech recognition by enabling machines to learn from vast amounts of data.', 'Techniques such as convolutional neural networks (CNNs) and recurrent neural networks (RNNs) are widely used for tasks like image classification, language translation, and predictive analytic

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

# Text Classification

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from datasets import load_dataset

dataset = load_dataset("liar")

train_split = dataset["train"]
X = train_split["statement"]

# The "label" column is expected to hold integer class ids rather than label
# names, so each id is translated to its name before it is compared with
# `fake_labels`. Comparing the raw ids against these strings never matches
# and silently puts every statement into class 0.
label_feature = train_split.features["label"]
# Fall back to the raw value if the column has no id-to-name mapping
# (i.e. it already stores strings).
label_name = getattr(label_feature, "int2str", lambda value: value)

fake_labels = {"pants-fire", "false", "barely-true"}
y = [1 if label_name(label) in fake_labels else 0 for label in train_split["label"]]  # 1 = Fake, 0 = True

# Hold out 20% of the statements, keeping the class ratio identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# TF-IDF features feeding a multinomial Naive Bayes classifier.
vectorizer = TfidfVectorizer()
classifier = MultinomialNB()
model = make_pipeline(vectorizer, classifier)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

report = metrics.classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=["True", "Fake"],
)
print(report)

# Hand-written statements used to spot-check the trained classifier.
example_statements = [
    "The government is secretly controlling the weather through satellites.",
    "The unemployment rate increased by 5% last quarter, according to official records.",
    "A study shows that drinking coffee reduces the risk of heart disease by 20%."
]

predicted_labels = model.predict(example_statements)

for statement, prediction in zip(example_statements, predicted_labels):
    # Class 1 is "Fake"; anything else is reported as "True".
    if prediction == 1:
        status = "Fake"
    else:
        status = "True"
    print(f"Statement: {statement}\nPredicted Label: {status}\n")


Accuracy: 1.0000
              precision    recall  f1-score   support

        True       1.00      1.00      1.00      2054
        Fake       0.00      0.00      0.00         0

    accuracy                           1.00      2054
   macro avg       0.50      0.50      0.50      2054
weighted avg       1.00      1.00      1.00      2054

Statement: The government is secretly controlling the weather through satellites.
Predicted Label: True

Statement: The unemployment rate increased by 50% last quarter, according to official records.
Predicted Label: True

Statement: A study shows that drinking coffee reduces the risk of heart disease by 20%.
Predicted Label: True



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
!pip install sklearn_crfsuite



# Named Entity Recognition (NER)

In [None]:
import nltk
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Training split of the "conll2003" dataset.
dataset = load_dataset("conll2003", split="train")

# Feature describing a single NER tag; used below to translate tag ids into
# label names (`int2str`) and to list all label names (`names`).
tag_mapping = dataset.features["ner_tags"].feature

def convert_tags(tag_ids):
    """Return the label string for every numeric tag id in ``tag_ids``."""
    return list(map(tag_mapping.int2str, tag_ids))

def word_features(sentence, i):
    """Build the feature dict for the token at position ``i`` of ``sentence``.

    Args:
        sentence: Sequence of token strings.
        i: Index of the token to describe.

    Returns:
        A dict with the lower-cased token, position flags (first / last in
        the sentence), casing flags, 2- and 3-character prefixes and
        suffixes, and a digit flag.
    """
    word = sentence[i]
    return {
        'word': word.lower(),
        'is_first': i == 0,
        'is_last': i == len(sentence) - 1,
        # Slice rather than index: an empty token yields False instead of
        # raising IndexError.
        'is_capitalized': word[:1].isupper(),
        'is_all_caps': word.isupper(),
        'is_all_lower': word.islower(),
        'prefix-2': word[:2],
        'prefix-3': word[:3],
        'suffix-2': word[-2:],
        'suffix-3': word[-3:],
        'word_is_digit': word.isdigit(),
    }

# Collect the token list and the readable tag list of every example.
sentences, ner_tags = [], []
for record in dataset:
    sentences.append(record["tokens"])
    ner_tags.append(convert_tags(record["ner_tags"]))

# One feature dict per token, grouped by sentence.
X = [
    [word_features(tokens, position) for position, _ in enumerate(tokens)]
    for tokens in sentences
]
y = ner_tags

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameters of the CRF tagger, gathered in one place.
crf_settings = {
    "algorithm": "lbfgs",
    "c1": 0.1,
    "c2": 0.1,
    "max_iterations": 100,
    "all_possible_transitions": True,
}
crf = sklearn_crfsuite.CRF(**crf_settings)
crf.fit(X_train, y_train)

y_pred = crf.predict(X_test)

print("NER Classification Report:")
report = metrics.flat_classification_report(
    y_test,
    y_pred,
    labels=tag_mapping.names,
    digits=4,
)
print(report)

# Pre-tokenized sentence used to spot-check the trained tagger.
example_sentence = ["Barack", "Obama", "was", "born", "in", "Hawaii", "and", "worked", "at", "the", "White", "House", "."]

example_features = [
    word_features(example_sentence, position)
    for position, _ in enumerate(example_sentence)
]

# predict() works on a batch of sentences; there is exactly one here.
batch_predictions = crf.predict([example_features])
predicted_tags = batch_predictions[0]

print("\nNamed Entity Recognition Results:")
for token, label in zip(example_sentence, predicted_tags):
    print(f"{token}: {label}")


Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn_crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn_crfsuite-0.5.0
NER Classification Report:
              precision    recall  f1-score   support

           O     0.9939    0.9950    0.9945     34233
       B-PER     0.9010    0.9075    0.9043      1254
       I-PER     0.9177    0.9610    0.9389       847
       B-ORG     0.8560    0.8531    0.8545      1212
  