<a href="https://colab.research.google.com/github/thamarai1809/GenAI/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## SENTIMENT ANALYSIS APP

In [None]:
pip install transformers numpy scipy gradio


Collecting gradio
  Downloading gradio-5.22.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [None]:
# Import necessary libraries
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import gradio as gr

# Load pre-trained model and tokenizer from Hugging Face
model_path = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Define the sentiment analysis function
def sentiment_analysis(text):
    """Score *text* as negative, neutral, or positive.

    Args:
        text: The string to classify.

    Returns:
        A dict mapping 'Negative', 'Neutral' and 'Positive' to softmax
        probabilities as plain floats.
    """
    # Truncate explicitly: the RoBERTa-base checkpoint accepts at most 512
    # tokens, and a longer input would make the forward pass fail.
    encoded_input = tokenizer(
        text, return_tensors='pt', truncation=True, max_length=512
    )
    output = model(**encoded_input)
    # Row 0 is the single input in the batch: one logit per class.
    logits = output.logits.detach().numpy()[0]
    probabilities = softmax(logits)
    # NOTE(review): label order is assumed to match the model's class
    # indices — confirm against config.id2label.
    labels = ['Negative', 'Neutral', 'Positive']
    return {label: float(prob) for label, prob in zip(labels, probabilities)}

# Sentences offered as ready-made examples in the UI
example_sentences = [
    ["I'm thrilled about the job offer!"],
    ["The weather today is absolutely beautiful."],
    ["I had a fantastic time at the concert last night."],
    ["I'm so frustrated with this software glitch."],
    ["The customer service was terrible at the store."],
    ["I'm really disappointed with the quality of this product."],
]

# Single free-text input box
text_input = gr.Textbox(placeholder="Write your text here...")

# Wire the input box and the label output to sentiment_analysis
demo = gr.Interface(
    title='Sentiment Analysis App',
    description='This app classifies text into positive, neutral, or negative sentiment.',
    fn=sentiment_analysis,
    inputs=text_input,
    outputs="label",
    examples=example_sentences,
)

# Start the app
demo.launch()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/929 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ce1b0960a63b0e2e3f.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## TEXT ANALYSIS SUITE

In [None]:
# Install required libraries
!pip install transformers gradio torch numpy

import gradio as gr
from transformers import pipeline

# Load models for summarization and sentiment analysis
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
# Name the sentiment checkpoint explicitly instead of relying on the library
# default, so the notebook keeps using the same model if that default changes.
# This is the checkpoint transformers falls back to when none is supplied.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)

def analyze_text(text):
    """Summarize *text*, then report the summary's sentiment and basic metrics.

    Args:
        text: Paragraph to analyze.

    Returns:
        A 3-tuple of strings ``(summary, sentiment, analysis)``.
        For blank input the first element asks for a valid paragraph and the
        other two are empty. If summarization fails, the first element is the
        error message and the other two are empty.
    """
    # Reject missing or whitespace-only input before touching any model
    if not text or not text.strip():
        return "Please enter a valid paragraph.", "", ""

    # Text Summarization
    try:
        summary = summarizer(text, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
    except Exception as e:
        # Stop here: without a summary there is nothing meaningful to score,
        # and analyzing the error message itself would report bogus results.
        return f"Error in summarization: {str(e)}", "", ""

    # Sentiment Analysis on Summary
    try:
        sentiment_result = sentiment_analyzer(summary)[0]
        sentiment = f"{sentiment_result['label']} (Confidence: {sentiment_result['score']:.2f})"
    except Exception as e:
        # The summary is still useful on its own, so report the failure inline
        sentiment = f"Error in sentiment analysis: {str(e)}"

    # Additional Text Analysis (computed on the summary, not the input)
    word_count = len(summary.split())
    # Sentences are approximated by counting terminal punctuation marks
    sentence_count = summary.count('.') + summary.count('!') + summary.count('?')
    avg_sentence_length = word_count / sentence_count if sentence_count > 0 else 0

    analysis = f"Word Count: {word_count} | Avg. Sentence Length: {avg_sentence_length:.1f} words"

    return summary, sentiment, analysis

# Build the Gradio UI: one paragraph box in, three text boxes out
paragraph_input = gr.Textbox(lines=5, placeholder="Enter your paragraph here...")

# Output order follows analyze_text's return value: summary, sentiment, analysis
result_boxes = [
    gr.Textbox(label="Summary", lines=3),
    gr.Textbox(label="Sentiment"),
    gr.Textbox(label="Text Analysis"),
]

interface = gr.Interface(
    title="Text Analysis Suite",
    description="📝 AI-powered text analysis: Summarization + Sentiment Analysis + Basic Metrics",
    fn=analyze_text,
    inputs=paragraph_input,
    outputs=result_boxes,
)

# Start the app
interface.launch()




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:  27%|##7       | 440M/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f5ddad78d993a9a928.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## GRAMMAR AND SPELL CHECK

In [None]:
# Install required libraries
!pip install language-tool-python gradio

import gradio as gr
import language_tool_python

# Initialize the LanguageTool instance for English
tool = language_tool_python.LanguageTool('en-US')

def grammar_and_spell_check(text):
    """Check *text* with LanguageTool and return the fix plus a report.

    Args:
        text: The text to check.

    Returns:
        A 2-tuple ``(corrected_text, formatted_suggestions)``. The report
        lists every issue that has at least one replacement, or reads
        "No errors found!" when there is none.
    """
    matches = tool.check(text)
    divider = "-" * 40 + "\n"

    # One report entry per issue that offers at least one replacement
    report_entries = []
    for match in matches:
        if not match.replacements:
            continue
        flagged = text[match.offset:match.offset + match.errorLength]
        options = ", ".join(match.replacements)
        report_entries.append(
            f"Error: '{flagged}'\n"
            f"Suggestions: {options}\n"
            f"Message: {match.message}\n"
            + divider
        )

    # The corrected text is built from all matches, not only the reported ones
    corrected_text = language_tool_python.utils.correct(text, matches)

    if report_entries:
        formatted_suggestions = "".join(report_entries)
    else:
        formatted_suggestions = "No errors found!"

    return corrected_text, formatted_suggestions

# Build the Gradio UI: one text box in, corrected text and suggestions out
text_input = gr.Textbox(lines=5, placeholder="Enter your text here...")

# Output order follows grammar_and_spell_check's return value
result_boxes = [
    gr.Textbox(label="Corrected Text", lines=5),
    gr.Textbox(label="Suggestions", lines=10),
]

interface = gr.Interface(
    title="Grammar and Spell Checker",
    description="Enter your text in the box below to check for grammatical errors and spelling mistakes. Suggestions will be provided for corrections.",
    fn=grammar_and_spell_check,
    inputs=text_input,
    outputs=result_boxes,
)

# Start the app
interface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1a77a87bf82c11998a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




## PARTS OF SPEECH

In [None]:
# Install spaCy if not already installed
!pip install spacy

# Download the English language model for spaCy
!python -m spacy download en_core_web_sm




Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m51.0 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy

from functools import lru_cache


@lru_cache(maxsize=1)
def _load_english_model():
    """Load the spaCy English model once and reuse it on later calls."""
    return spacy.load("en_core_web_sm")


def identify_pos(text):
    """Tag every token in *text* with its part of speech.

    Args:
        text: The text to analyze.

    Returns:
        A list with one dict per token, holding the keys "Word" (token
        text), "POS" (the tag) and "Explanation" (spaCy's description of
        that tag).
    """
    # Loading the model is the expensive step, so it is cached rather than
    # repeated on every call.
    doc = _load_english_model()(text)

    return [
        {
            "Word": token.text,
            "POS": token.pos_,
            "Explanation": spacy.explain(token.pos_),
        }
        for token in doc
    ]

# Sample sentence to analyze
input_text = "She decided to give her biggest comeback ever."

# Tag each token of the sample sentence
pos_results = identify_pos(input_text)

# Print one entry per token, separated by a ruler line
ruler = "-" * 40
print("Parts of Speech Analysis:")
print(ruler)
for entry in pos_results:
    word, tag, meaning = entry['Word'], entry['POS'], entry['Explanation']
    print(f"Word: {word}")
    print(f"POS Tag: {tag} ({meaning})")
    print(ruler)

Parts of Speech Analysis:
----------------------------------------
Word: She
POS Tag: PRON (pronoun)
----------------------------------------
Word: decided
POS Tag: VERB (verb)
----------------------------------------
Word: to
POS Tag: PART (particle)
----------------------------------------
Word: give
POS Tag: VERB (verb)
----------------------------------------
Word: her
POS Tag: PRON (pronoun)
----------------------------------------
Word: biggest
POS Tag: ADJ (adjective)
----------------------------------------
Word: comeback
POS Tag: NOUN (noun)
----------------------------------------
Word: ever
POS Tag: ADV (adverb)
----------------------------------------
Word: .
POS Tag: PUNCT (punctuation)
----------------------------------------


## QUESTION ANSWERING

In [None]:
# Install required libraries
!pip install transformers gradio

from transformers import pipeline
import gradio as gr

# Load the question-answering pipeline
# The checkpoint is named explicitly instead of relying on the library
# default, so the notebook keeps using the same model if that default changes.
# This is the checkpoint transformers falls back to when none is supplied.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

def answer_question(context, question):
    """Extract the answer to *question* from *context*.

    Args:
        context: Passage the answer should be taken from.
        question: Question to answer.

    Returns:
        The answer text found by the QA pipeline, or a prompt asking for
        both fields when either one is missing or blank.
    """
    # The QA pipeline raises on an empty context or question, so catch blank
    # submissions here and tell the user what is missing.
    if not context or not context.strip() or not question or not question.strip():
        return "Please provide both a context and a question."

    # Get the answer from the context based on the question
    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Build the Gradio UI; input order follows answer_question(context, question)
context_box = gr.Textbox(lines=5, placeholder="Enter context here...")
question_box = gr.Textbox(lines=2, placeholder="Enter your question here...")
answer_box = gr.Textbox(label="Answer")

interface = gr.Interface(
    title="Question Answering System",
    description="Provide a context and a question to get an answer using a pre-trained model.",
    fn=answer_question,
    inputs=[context_box, question_box],
    outputs=answer_box,
)

# Start the app
interface.launch()




No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Device set to use cpu


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://f4f6cbd31fb7067f3b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Install required libraries
!pip install gradio transformers language-tool-python spacy torch
!python -m spacy download en_core_web_sm

import gradio as gr
from transformers import pipeline
import language_tool_python
import spacy

# Initialize all models
# Every pipeline names its checkpoint explicitly. The sentiment and QA
# checkpoints are the ones transformers falls back to when none is supplied;
# pinning them keeps the notebook on the same models if those defaults change.
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
)
translator = pipeline("translation_en_to_fr", model="Helsinki-NLP/opus-mt-en-fr")
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)
grammar_tool = language_tool_python.LanguageTool('en-US')
spacy_nlp = spacy.load("en_core_web_sm")

def process_text(text):
    """Perform all analyses on the input text.

    Args:
        text: The text to analyze.

    Returns:
        A 6-tuple of strings: summary, sentiment of the summary (label and
        score), grammar-corrected text, grammar suggestions (one per line),
        French translation, and part-of-speech tags (one token per line).
        For blank input the first element asks for a valid paragraph and the
        remaining five are empty.
    """
    # Reject missing or whitespace-only input before touching any model
    if not text or not text.strip():
        return "Please enter a valid paragraph.", "", "", "", "", ""

    # Summarization
    summary = summarizer(text, max_length=150, min_length=30)[0]['summary_text']

    # Sentiment Analysis (scored on the summary, not the raw input)
    sentiment = sentiment_analyzer(summary)[0]

    # Grammar Check
    grammar_matches = grammar_tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, grammar_matches)
    # A match may carry no replacement at all; indexing [0] unconditionally
    # would raise IndexError, so such matches show their message only.
    grammar_suggestions = "\n".join(
        f"• {m.message}: {m.replacements[0]}" if m.replacements else f"• {m.message}"
        for m in grammar_matches
    )

    # Translation
    translation = translator(text, max_length=100)[0]['translation_text']

    # Parts of Speech Analysis
    doc = spacy_nlp(text)
    pos_tags = "\n".join([f"{token.text}: {token.pos_}" for token in doc])

    return summary, f"{sentiment['label']} ({sentiment['score']:.2f})", corrected_text, grammar_suggestions, translation, pos_tags

def answer_question(context, question):
    """Answer questions from context.

    Args:
        context: Passage the answer should be taken from.
        question: Question to answer.

    Returns:
        The answer text found by the QA pipeline, or an empty string when
        the context or the question is missing or blank.
    """
    # This handler runs from a textbox change event, so it can be called with
    # a cleared question or before any input text exists. The QA pipeline
    # raises on empty inputs; return an empty answer instead.
    if not context or not context.strip() or not question or not question.strip():
        return ""

    result = qa_pipeline(question=question, context=context)
    return result['answer']

# Create Gradio interface with tabs
# Layout: a shared input row on top, then one tab per kind of result.
with gr.Blocks(title="NLP Toolkit") as app:
    gr.Markdown("# 🧠 Ultimate NLP Toolkit")

    # Shared input: feeds process_text and also serves as the Q&A context
    with gr.Row():
        input_text = gr.Textbox(label="Input Text", lines=5)
        analyze_button = gr.Button("Analyze")

    with gr.Tab("📝 Text Analysis"):
        summary_output = gr.Textbox(label="Summary", lines=3)
        sentiment_output = gr.Textbox(label="Sentiment")

    with gr.Tab("✍️ Grammar Check"):
        corrected_output = gr.Textbox(label="Corrected Text", lines=5)
        suggestions_output = gr.Textbox(label="Suggestions", lines=5)

    with gr.Tab("🌐 Translation"):
        trans_output = gr.Textbox(label="French Translation", lines=5)

    with gr.Tab("📚 POS Analysis"):
        pos_output = gr.Textbox(label="POS Tags", lines=10)

    with gr.Tab("❓ Q&A System"):
        question_input = gr.Textbox(label="Question", lines=2)
        answer_output = gr.Textbox(label="Answer", lines=2)

    # Connect components
    # The outputs list is in the same order as process_text's return tuple:
    # summary, sentiment, corrected text, suggestions, translation, POS tags.
    analyze_button.click(
        fn=process_text,
        inputs=input_text,
        outputs=[summary_output, sentiment_output, corrected_output, suggestions_output, trans_output, pos_output]
    )

    # The Q&A handler receives (input_text, question_input) as
    # (context, question).
    # NOTE(review): presumably the change event fires on every edit of the
    # question box, running the QA model each time — confirm against the
    # Gradio docs.
    question_input.change(
        fn=answer_question,
        inputs=[input_text, question_input],
        outputs=answer_output
    )

# Launch the app
app.launch()


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu
Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4e6045a66ed2a60076.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
!pip install --upgrade gensim




In [None]:
!pip install numpy scipy




In [None]:
# NOTE(review): if this step replaces the installed numpy, the runtime
# presumably has to be restarted before gensim is imported — confirm. A numpy
# build that differs from the one already loaded is a common cause of
# "numpy.dtype size changed, may indicate binary incompatibility".
!pip install --upgrade gensim

# gensim is already installed by the line above; this line adds nltk.
!pip install gensim nltk



In [None]:
# Import required libraries

from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data
# Recent NLTK releases load the sentence tokenizer used by word_tokenize from
# 'punkt_tab'; 'punkt' is kept so older releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Sample dataset: List of documents
documents = [
    "Artificial intelligence is transforming the world.",
    "Machine learning is a subset of artificial intelligence.",
    "Natural language processing enables machines to understand human language.",
    "Deep learning is a powerful tool for image recognition.",
    "Robots are becoming more intelligent with advancements in AI."
]

# Step 1: Preprocess the text
def preprocess_text(documents):
    """Turn each document into a list of cleaned, lower-cased tokens.

    Args:
        documents: Iterable of document strings.

    Returns:
        A list with one token list per document, keeping only alphanumeric
        tokens that are not English stopwords.
    """
    stop_words = set(stopwords.words('english'))

    def is_content_word(token):
        # Drops punctuation and other non-alphanumeric tokens, then stopwords
        return token.isalnum() and token not in stop_words

    return [
        [token for token in word_tokenize(doc.lower()) if is_content_word(token)]
        for doc in documents
    ]

processed_docs = preprocess_text(documents)

# Step 2: Create a dictionary and corpus
# The dictionary maps each token to an id; the corpus holds one bag-of-words
# vector per document.
dictionary = corpora.Dictionary(processed_docs)
corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Step 3: Build the LDA model
# random_state is fixed so repeated runs produce the same topics.
lda_model = models.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=15, random_state=42)

# Step 4: Display topics
topics = lda_model.print_topics(num_words=5)
print("Identified Topics:")
for topic_num, topic_words in topics:
    print(f"Topic {topic_num + 1}: {topic_words}")

# Step 5: Assign topics to documents
# Topic ids are shown 1-based here as well, so they match the labels printed
# in step 4 instead of mixing 1-based labels with raw 0-based ids.
print("\nDocument Topic Distribution:")
for doc_number, doc_bow in enumerate(corpus, start=1):
    doc_topics = lda_model.get_document_topics(doc_bow)
    distribution = ", ".join(
        f"Topic {topic_id + 1}: {probability:.3f}"
        for topic_id, probability in doc_topics
    )
    print(f"Document {doc_number}: {distribution}")


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject