Text Analysis
1. Extract the Sample document and apply following document preprocessing methods: Tokenization, POS tagging, Stopwords removal, stemming and lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse Document Frequency.

# **Import the necessary libraries** 

In [5]:
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
import gradio as gr

In [9]:
nltk.download('punkt')

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


False

# taken sent as our input sentence

In [10]:
sent = "Hello I am Gayatri Deshmukh. I am from Nanded District. I am a computer engineer."

## 1) Performing the Tokenization

In [11]:
print(word_tokenize(sent))
print(sent_tokenize(sent))

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Sumit/nltk_data'
    - 'C:\\Users\\Sumit\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'C:\\Users\\Sumit\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'C:\\Users\\Sumit\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\Sumit\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [7]:
#defining functions for tokenization
def word_tokenization(input):
  token = word_tokenize(input)
  return token

def sent_tokenization(input):
  token = sent_tokenize(input)
  return token

In [8]:
demo1 = gr.Interface(fn=word_tokenization, inputs="text", outputs="text")
demo1.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [9]:
demo2 = gr.Interface(fn=sent_tokenization, inputs="text", outputs="text")
demo2.launch()

Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




In [10]:
demo = gr.Blocks()
with demo:
    gr.Markdown("# Tokenization")
    with gr.Tabs():
        with gr.TabItem("Word Tokenization"):
            with gr.Row():
                word_ip = gr.Textbox(label="Input Data")
                word_op = gr.Textbox(label="Output Tokens")
            word_button = gr.Button("Generate Tokens")
        with gr.TabItem("Sentence Tokenization"):
            with gr.Row():
                sent_ip = gr.Textbox(label="Input Data")
                sent_op = gr.Textbox(label="Output Tokens")
            sent_button = gr.Button("Generate Tokens")
        
    word_button.click(word_tokenization, inputs=word_ip, outputs=word_op)
    sent_button.click(sent_tokenization, inputs=sent_ip, outputs=sent_op)

demo.launch()


Thanks for being a Gradio user! If you have questions or feedback, please join our Discord server and chat with us: https://discord.gg/feTf9x3ZSB
Running on local URL:  http://127.0.0.1:7862

To create a public link, set `share=True` in `launch()`.




## 2) StopWords Removal

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/kkw/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
stop_words = stopwords.words('english')

In [13]:
token = word_tokenize(sent)
cleaned_token = []
for word in token:
  if word not in stop_words:
    cleaned_token.append(word)

print("Unclean version ", token)
print("Clean Version", cleaned_token)

Unclean version  ['Hello', 'I', 'am', 'Gayatri', 'Deshmukh', '.', 'I', 'am', 'from', 'Nanded', 'District', '.', 'I', 'am', 'a', 'computer', 'engineer', '.']
Clean Version ['Hello', 'I', 'Gayatri', 'Deshmukh', '.', 'I', 'Nanded', 'District', '.', 'I', 'computer', 'engineer', '.']


In [14]:
#defining a function for stopword removal
def remove_stop(text):
    return ",".join([word for word in str(text).split() if word not in stop_words])

In [15]:
demo3 = gr.Interface(fn=remove_stop, inputs="text", outputs="text")
demo3.launch()

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




## 3) Stemmming

In [16]:
snowball_stemmer = SnowballStemmer('english')

In [17]:
text = "Hello I am Gayatri. I am Engineering student."

In [18]:
word_tokens = nltk.word_tokenize(text)
stemmed_word = [snowball_stemmer.stem(word) for word in word_tokens]

In [19]:
print(stemmed_word)

['hello', 'i', 'am', 'gayatri', '.', 'i', 'am', 'engin', 'student', '.']


In [20]:
#defining a function for stemming
def stemming(text):
  word_tokens = nltk.word_tokenize(text)
  stemmed_word = [snowball_stemmer.stem(word) for word in word_tokens]
  return stemmed_word

In [21]:
demo4 = gr.Interface(fn=stemming, inputs="text", outputs="text")
demo4.launch()

Running on local URL:  http://127.0.0.1:7864

To create a public link, set `share=True` in `launch()`.




## 4) Lemmatization

In [22]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/kkw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [23]:
word_tokens = nltk.word_tokenize(text)
lemmatizer = WordNetLemmatizer()
lemmatizer_word = [lemmatizer.lemmatize(word) for word in word_tokens]

In [24]:
print("rocks :", lemmatizer.lemmatize("rocks"))

rocks : rock


In [25]:
print(lemmatizer_word)

['Hello', 'I', 'am', 'Gayatri', '.', 'I', 'am', 'Engineering', 'student', '.']


In [26]:
# definig a function for lemmatization
def lemmatization(text):
    words = nltk.word_tokenize(text)
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return lemmatized_words

In [27]:
demo5 = gr.Interface(fn=lemmatization, inputs="text", outputs="text")
demo5.launch()

Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.




## POS Tagging

In [28]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/kkw/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [29]:
text = "They are having their lunch"


In [30]:
word_token = nltk.word_tokenize(text)


In [31]:
pos_tag = nltk.pos_tag(word_token)
print(pos_tag)

[('They', 'PRP'), ('are', 'VBP'), ('having', 'VBG'), ('their', 'PRP$'), ('lunch', 'NN')]


## TF IDF Vectorizer

In [32]:
d0 = "Good Morning"
d1 = "Do daily exercise in the morning "
d2 = "exercise is good for health"

In [33]:
series = [d0, d1, d2]

In [34]:
tfidf = TfidfVectorizer()

In [35]:
result = tfidf.fit_transform(series)

In [36]:
print("Word Indexing ")
print(tfidf.vocabulary_)

Word Indexing 
{'good': 4, 'morning': 8, 'do': 1, 'daily': 0, 'exercise': 2, 'in': 6, 'the': 9, 'is': 7, 'for': 3, 'health': 5}


In [37]:
print("tf-idf values :: ")
print(result)

tf-idf values :: 
  (0, 8)	0.7071067811865476
  (0, 4)	0.7071067811865476
  (1, 9)	0.4403620672313486
  (1, 6)	0.4403620672313486
  (1, 2)	0.3349067026613031
  (1, 0)	0.4403620672313486
  (1, 1)	0.4403620672313486
  (1, 8)	0.3349067026613031
  (2, 5)	0.49047908420610337
  (2, 3)	0.49047908420610337
  (2, 7)	0.49047908420610337
  (2, 2)	0.3730219858594306
  (2, 4)	0.3730219858594306


In [38]:
print("tf-idf in matrix form")
print(result.toarray())

tf-idf in matrix form
[[0.         0.         0.         0.         0.70710678 0.
  0.         0.         0.70710678 0.        ]
 [0.44036207 0.44036207 0.3349067  0.         0.         0.
  0.44036207 0.         0.3349067  0.44036207]
 [0.         0.         0.37302199 0.49047908 0.37302199 0.49047908
  0.         0.49047908 0.         0.        ]]


In [39]:
final = gr.Blocks()
with final:
    gr.Markdown("# NLP Operations")
    with gr.Tabs():
        with gr.TabItem("Word Tokenization"):
            with gr.Row():
                word_ip = gr.Textbox(label="Input Data")
                word_op = gr.Textbox(label="Output Tokens")
            word_button = gr.Button("Generate Tokens")
        with gr.TabItem("Sentence Tokenization"):
            with gr.Row():
                sent_ip = gr.Textbox(label="Input Data")
                sent_op = gr.Textbox(label="Output Tokens")
            sent_button = gr.Button("Generate Tokens")
        
        with gr.TabItem("Stopwords Removal"):
            with gr.Row():
                sremo_ip = gr.Textbox(label="Input Data")
                sremo_op = gr.Textbox(label="Processed Data")
            sremo_button = gr.Button("Preocessed Data")

        with gr.TabItem("Stemming"):
            with gr.Row():
                stem_ip = gr.Textbox(label="Input Data")
                stem_op = gr.Textbox(label="Output Stem")
            stem_button = gr.Button("Generate Stem")
            
        with gr.TabItem("Lemmatization"):
            with gr.Row():
                lemma_ip = gr.Textbox(label="Input Data")
                lemma_op = gr.Textbox(label="Output Lemma")
            lemma_button = gr.Button("Generate Lemma")
        
        
        
    word_button.click(word_tokenization, inputs=word_ip, outputs=word_op)
    sent_button.click(sent_tokenization, inputs=sent_ip, outputs=sent_op)
    sremo_button.click(remove_stop, inputs=sremo_ip, outputs=sremo_op)
    stem_button.click(stemming, inputs=stem_ip, outputs=stem_op)
    lemma_button.click(lemmatization, inputs=lemma_ip, outputs=lemma_op)

final.launch()

Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.


