<a href="https://colab.research.google.com/github/sumyuck/ML-learning/blob/main/nlc/NLC_p_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLC Practical - 1
Roll No: 23BCE298

Name: Samyak Jain

Aim: Introduction to text processing libraries

In [None]:
import re

# Example text
text = "Contact us at support@example.com or call us at 123-456-7890."

# Pattern matching: Find all email addresses in the text
email_pattern = r'\S+@\S+'
emails = re.findall(email_pattern, text)
print("Found email addresses:", emails)

# Text extraction: Extract the phone number
phone_pattern = r'\d{3}-\d{3}-\d{4}'
phone_number = re.search(phone_pattern, text)
if phone_number:
  print("Found phone number:", phone_number.group(0))
else:
  print("Phone number not found.")

Found email addresses: ['support@example.com']
Found phone number: 123-456-7890


In [None]:
import nltk

# Download necessary NLTK data (if you haven't already)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

text = "NLTK is a powerful library for natural language processing. It helps with tokenization, stemming, and lemmatization."

# Tokenization
words = word_tokenize(text)
sentences = sent_tokenize(text)

print("Word tokens:", words)
print("Sentence tokens:", sentences)

# Stemming
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words]
print("Stemmed words:", stemmed_words)

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
print("Lemmatized words:", lemmatized_words)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Word tokens: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '.', 'It', 'helps', 'with', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', '.']
Sentence tokens: ['NLTK is a powerful library for natural language processing.', 'It helps with tokenization, stemming, and lemmatization.']
Stemmed words: ['nltk', 'is', 'a', 'power', 'librari', 'for', 'natur', 'languag', 'process', '.', 'it', 'help', 'with', 'token', ',', 'stem', ',', 'and', 'lemmat', '.']
Lemmatized words: ['NLTK', 'is', 'a', 'powerful', 'library', 'for', 'natural', 'language', 'processing', '.', 'It', 'help', 'with', 'tokenization', ',', 'stemming', ',', 'and', 'lemmatization', '.']


In [None]:
import spacy

# Load the English language model
nlp = spacy.load("en_core_web_sm")

text = "Apple is looking at buying U.K. startup for $1 billion."

# Process the text
doc = nlp(text)

# Named Entity Recognition (NER)
print("Named Entities:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

# Dependency Parsing
print("\nDependency Parsing:")
for token in doc:
    print(f"{token.text} ({token.pos_}) - {token.dep_} -> {token.head.text}")

Named Entities:
Apple (ORG)
U.K. (GPE)
$1 billion (MONEY)

Dependency Parsing:
Apple (PROPN) - nsubj -> looking
is (AUX) - aux -> looking
looking (VERB) - ROOT -> looking
at (ADP) - prep -> looking
buying (VERB) - pcomp -> at
U.K. (PROPN) - nsubj -> startup
startup (VERB) - ccomp -> buying
for (ADP) - prep -> startup
$ (SYM) - quantmod -> billion
1 (NUM) - compound -> billion
billion (NUM) - pobj -> for
. (PUNCT) - punct -> looking


In [None]:
from textblob import TextBlob
from googletrans import Translator

text = "TextBlob is a great library for sentiment analysis and translation."

# Sentiment Analysis
blob = TextBlob(text)
sentiment = blob.sentiment
print(f"Sentiment of the text: {sentiment}")

# Translation
translator = Translator()
translated_text = translator.translate(text_to_translate, dest='es')
print(f"Translated text (Spanish): {translated_text}")

Sentiment of the text: Sentiment(polarity=0.8, subjectivity=0.75)
Translated text (Spanish): Translated(src=en, dest=es, text=¿Hola, cómo estás?, pronunciation=None, extra_data="{'confiden...")


#Some Common Pre-Processing Steps

In [1]:
#import required stuff
import pandas as pd
import numpy as np

import re
import nltk
import spacy
import string

In [5]:
#Read training data
trdf=pd.read_csv('train.csv', header='infer')

print(trdf.head(3))
trdf.info()

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   

   target  
0       1  
1       1  
2       1  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [6]:
#Convert everything in text in lower case
trdf['lowered_text']=trdf['text'].str.lower()

print(trdf['lowered_text'].head(3))

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
Name: lowered_text, dtype: object


In [7]:
#Removal of punctuation

punctuation=string.punctuation

print(type(punctuation), punctuation)

mapping=str.maketrans("","",punctuation)

print(type(mapping), mapping)

print(trdf['lowered_text'].head(10))
trdf['lowered_text']=trdf["lowered_text"].str.translate(mapping)
print(trdf['lowered_text'].head(10))

<class 'str'> !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
<class 'dict'> {33: None, 34: None, 35: None, 36: None, 37: None, 38: None, 39: None, 40: None, 41: None, 42: None, 43: None, 44: None, 45: None, 46: None, 47: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 91: None, 92: None, 93: None, 94: None, 95: None, 96: None, 123: None, 124: None, 125: None, 126: None}
0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
5    #rockyfire update => california hwy. 20 closed...
6    #flood #disaster heavy rain causes flash flood...
7    i'm on top of the hill and i can see a fire in...
8    there's an emergency evacuation happening now ...
9    i'm afraid that the tornado is coming to our a...
Name: lowered_text, dtype: object
0    our deeds are the reason

In [10]:
#Let us have a look at standard list of stopwords
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

print(type(stopwords.words()), len(stopwords.words()))


print(type(stopwords.words('english')), len(stopwords.words('english'))) #list, 179 stopwords

print(stopwords.words('english'))
stopwords_eng=stopwords.words('english')

<class 'list'> 11009
<class 'list'> 198
['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', '

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
#function to remove stop words
def remove_stopwords(in_str):
    new_str=''
    words=in_str.split()
    for tx in words:
        if tx not in stopwords_eng:
            new_str=new_str + tx + " "
    return new_str

trdf['lowered_text_stop_removed']=trdf["lowered_text"].apply(lambda x: remove_stopwords(x))

print(trdf["lowered_text_stop_removed"].head(10))

0        deeds reason earthquake may allah forgive us 
1               forest fire near la ronge sask canada 
2    residents asked shelter place notified officer...
3    13000 people receive wildfires evacuation orde...
4    got sent photo ruby alaska smoke wildfires pou...
5    rockyfire update california hwy 20 closed dire...
6    flood disaster heavy rain causes flash floodin...
7                          im top hill see fire woods 
8    theres emergency evacuation happening building...
9                       im afraid tornado coming area 
Name: lowered_text_stop_removed, dtype: object


In [20]:
#Now Stemming using PorterStemmer
from nltk.stem.porter import PorterStemmer

stemmer=PorterStemmer()

print(trdf["lowered_text_stop_removed"].head(5))

def do_stemming(in_str):
    new_str=""
    for word in in_str.split():
        new_str=new_str + stemmer.stem(word) + " "
    return new_str

trdf["Stemmed"]=trdf["lowered_text_stop_removed"].apply(lambda x: do_stemming(x))

print(trdf["Stemmed"].head(5))

0        deeds reason earthquake may allah forgive us 
1               forest fire near la ronge sask canada 
2    residents asked shelter place notified officer...
3    13000 people receive wildfires evacuation orde...
4    got sent photo ruby alaska smoke wildfires pou...
Name: lowered_text_stop_removed, dtype: object
0           deed reason earthquak may allah forgiv us 
1                forest fire near la rong sask canada 
2    resid ask shelter place notifi offic evacu she...
3    13000 peopl receiv wildfir evacu order califor...
4    got sent photo rubi alaska smoke wildfir pour ...
Name: Stemmed, dtype: object
