## Q1 - Text Preprocessing

### Step 1: Importing libraries and setting up the paragraph

In [None]:
import nltk

# NLTK data packages used by the notebook. Both tokenizer packages are requested
# because NLTK 3.9 and later load the Punkt models from 'punkt_tab', while older
# releases load them from 'punkt'.
nltk.download('punkt')         # For tokenization (older NLTK releases)
nltk.download('punkt_tab')     # For tokenization (NLTK 3.9+)
nltk.download('stopwords')     # For stopword removal
nltk.download('wordnet')       # For lemmatization


In [None]:
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Request the tokenizer models and the stopword list used by the steps below.
nltk.download('punkt')
nltk.download('stopwords')

# Text analysed throughout the notebook. The adjacent string literals are
# concatenated as written, so each sentence-ending period is followed directly
# by the next sentence's number (e.g. "well-being.2 The") with no space between.
paragraph = (
    "1 Engaging in sports is crucial for students and children, significantly impacting their overall development "
    "and well-being.2 The physical health benefits include maintaining a healthy weight, enhancing cardiovascular "
    "health, and developing motor skills.3 Furthermore, sports contribute to mental and emotional strength by "
    "reducing stress, boosting self-esteem, and fostering teamwork abilities.4 Additionally, participation in sports "
    "promotes social skills, enhances academic performance, and builds lifelong competencies such as discipline and "
    "resilience.5 Overall, sports are vital for cultivating well-rounded individuals."
)

### Step 2: Converting to lowercase and removing punctuation

In [None]:
import nltk
# These three resources are also requested in the first cell of the notebook;
# presumably repeated so this step can be run on its own.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


In [None]:
# Lowercase first so the punctuation step below works on case-normalised text.
paragraph_lower = paragraph.lower()
print(paragraph_lower)

# Strip punctuation from the lowercased text (not from the raw paragraph) so
# that `no_punct` reflects both operations named in this step.
# [^\w\s] matches any character that is neither a word character nor whitespace.
no_punct = re.sub(r'[^\w\s]', '', paragraph_lower)
print(no_punct)

### Step 3: Tokenizing words and sentences

In [None]:
import nltk
# 'punkt' is also requested in the earlier setup cells; repeated here ahead of tokenization.
nltk.download('punkt')


In [None]:
# Tokenize the raw paragraph at both granularities, then show each result.
words = word_tokenize(paragraph)
sentences = sent_tokenize(paragraph)

print(words)
print(sentences)

### Step 4: Removing stopwords

In [None]:
# English stopwords held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Drop every token whose lowercase form is a stopword; kept tokens retain their casing.
# NOTE(review): `words` was tokenized from the raw paragraph, so any punctuation
# or digit tokens it contains are not stopwords and pass through this filter.
filtered = list(filter(lambda token: token.lower() not in stop_words, words))
print(filtered)

### Step 5: Calculating word frequency

In [None]:
# Tally how often each filtered token occurs. Keys are the tokens exactly as
# they appear in `filtered`, so differently-cased spellings count separately.
freq = {}
for word in filtered:
    if word in freq:
        freq[word] += 1
    else:
        freq[word] = 1
print(freq)

## Q2 - Stemming and Lemmatization

### Step 6: Applying Porter Stemmer, Lancaster Stemmer, and WordNet Lemmatizer

In [None]:
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')  # data read by the WordNet lemmatizer

# One instance of each normaliser, created up front.
porter = PorterStemmer()
lancaster = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# Run each normaliser over the stopword-filtered tokens and show its output.
# `lemmatize` receives only the token, i.e. no part-of-speech argument is passed.
porter_result = list(map(porter.stem, filtered))
print(porter_result)

lancaster_result = list(map(lancaster.stem, filtered))
print(lancaster_result)

lemmatized_result = list(map(lemmatizer.lemmatize, filtered))
print(lemmatized_result)

## Q3 - Regex Based Analysis

### Step 7: Finding long words, numbers, and capitalized words

In [None]:
# Words made of six or more word characters, delimited by word boundaries.
long_words = re.compile(r'\b\w{6,}\b').findall(paragraph)
print(long_words)

# Every run of consecutive digits.
numbers = re.compile(r'\d+').findall(paragraph)
print(numbers)

# One uppercase ASCII letter followed only by lowercase ASCII letters.
capitalized = re.compile(r'\b[A-Z][a-z]*\b').findall(paragraph)
print(capitalized)

### Step 8: Extracting all words and the words that start with a vowel

In [None]:
# Runs of ASCII letters delimited by word boundaries, in order of appearance.
split_all = re.compile(r'\b[a-zA-Z]+\b').findall(paragraph)
print(split_all)

# Keep the words whose first letter is a vowel in either case.
starts_with_vowel = [word for word in split_all if word[0] in 'aeiouAEIOU']
print(starts_with_vowel)

## Q4 - Custom Tokenization and Placeholder Replacement

### Step 9: Custom tokenization using regex

In [None]:
def tokenize_custom(text):
    """Split ``text`` into number and word tokens with a single regex.

    A token is one of the following, tried in this order:

    * a decimal number: digits, a dot, digits (``3.14``);
    * a run of digits (``42``);
    * a run of ASCII letters, optionally continued by ``-`` or ``'`` followed
      by more letters (``well-being``, ``don't``).

    Every token must begin and end on a word boundary, so text in which
    letters and digits touch, such as ``abc123``, produces no token.

    Returns the matched tokens as a list of strings in order of appearance.
    """
    token_re = re.compile(r"\b(?:\d+\.\d+|\d+|[a-zA-Z]+(?:[-'][a-zA-Z]+)*)\b")
    return token_re.findall(text)

# Run the custom tokenizer over the notebook's paragraph and show the tokens.
print(tokenize_custom(paragraph))

### Step 10: Replacing emails, URLs, and phone numbers with placeholders

In [None]:
# Sample sentence containing one email address, one URL and one 9-digit number.
sample = "Contact us at abc123@gmail.com or visit https://www.abc123.com for info. You can also call 123456789."

def replace_sensitive_info(text):
    """Mask emails, URLs and long digit runs in ``text`` with placeholders.

    The substitutions are applied one after another, each on the output of
    the previous one, in this order:

    1. email addresses become ``<EMAIL>``;
    2. text starting with ``http://``, ``https://`` or ``www.`` and running up
       to the last word boundary before whitespace becomes ``<URL>``;
    3. standalone runs of nine or more digits become ``<PHONE>``.

    Returns the rewritten string.
    """
    substitutions = (
        (r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b', '<EMAIL>'),
        (r'\b(?:https?://|www\.)\S+\b', '<URL>'),
        (r'\b\d{9,}\b', '<PHONE>'),
    )
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)
    return text

# Show the sample sentence after the email, URL and phone placeholders are applied.
print(replace_sensitive_info(sample))