In [None]:
# Q1 - Text Preprocessing
import nltk
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Download required resources
# Newer NLTK releases load the Punkt sentence/word tokenizer models from the
# 'punkt_tab' package instead of the older pickled 'punkt' package, so both are
# requested. nltk.download() only reports an error (it does not raise) when the
# installed version does not know a package id.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Define the paragraph
# The pieces are concatenated exactly as written, so the sentence numbers stay
# fused to the neighbouring words ("well-being.2 The ...").
paragraph = "".join([
    "1 Engaging in sports is crucial for students and children, significantly impacting their overall development ",
    "and well-being.2 The physical health benefits include maintaining a healthy weight, enhancing cardiovascular ",
    "health, and developing motor skills.3 Furthermore, sports contribute to mental and emotional strength by ",
    "reducing stress, boosting self-esteem, and fostering teamwork abilities.4 Additionally, participation in sports ",
    "promotes social skills, enhances academic performance, and builds lifelong competencies such as discipline and ",
    "resilience.5 Overall, sports are vital for cultivating well-rounded individuals.",
])

# Step 1: Convert to lowercase and remove punctuation
# Every character that is neither a word character nor whitespace is deleted
# (not replaced by a space), so "well-being" becomes "wellbeing".
paragraph_lower = re.compile(r'[^\w\s]').sub('', paragraph.lower())
print(paragraph_lower)

# Step 2: Tokenize words and sentences
# Word tokens are taken from the cleaned text produced in Step 1 (lowercase,
# punctuation removed) so that the stopword filter and the frequency count that
# follow operate on normalized words rather than on capitalized words and
# stand-alone punctuation tokens.
tokens = word_tokenize(paragraph_lower)
# Sentence splitting relies on the original punctuation, so it uses the raw text.
sentences = sent_tokenize(paragraph)

print(tokens)
print(sentences)

# Step 3: Split words using regex
# re.split() yields an empty string wherever the text starts or ends with a
# non-word character (the paragraph ends with "."), so empty pieces are dropped.
words_split = [piece for piece in re.split(r'\W+', paragraph) if piece]
print(words_split)

# Step 4: Remove stopwords
# NLTK's English stopword list is lowercase, so each token is lower-cased for
# the membership test only; the tokens kept in the result are left unchanged.
stop_words = set(stopwords.words('english'))
filtered_words = [word for word in tokens if word.lower() not in stop_words]
print(filtered_words)

# Step 5: Calculate word frequency
# Tally the occurrences of each remaining token; keys appear in first-seen order.
word_freq = {}
for word in filtered_words:
    word_freq[word] = word_freq.get(word, 0) + 1
print(word_freq)


In [None]:
# Q2 - Stemming and Lemmatization
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

# Step 1: Find alphabet words
# The pattern matches stand-alone single-character tokens only. \w also accepts
# digits and "_", so the sentence numbers in the paragraph are matched too.
# NOTE(review): if only letters (or whole alphabetic words) are wanted, confirm
# the intent and narrow the pattern accordingly.
single_char_pattern = re.compile(r'\b\w\b')
word_alphabet = single_char_pattern.findall(paragraph)
print(word_alphabet)

# Step 2: Filter words without stopwords
# The NLTK stopword list is lowercase, so tokens are lower-cased for the
# comparison; otherwise capitalized stopwords such as "The" are kept.
filtered_words = [word for word in tokens if word.lower() not in stop_words]
print(filtered_words)

# Step 3: Apply stemming and lemmatization
# WordNetLemmatizer reads the WordNet corpus, which is a separate NLTK download;
# without it lemmatize() raises LookupError on a fresh environment. 'omw-1.4' is
# fetched as well because some NLTK releases look it up when WordNet is loaded.
nltk.download('wordnet')
nltk.download('omw-1.4')

ps = PorterStemmer()
ls = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

# Run the same filtered tokens through each normalizer so the three outputs
# can be compared position by position.
ps_stemmed = [ps.stem(word) for word in filtered_words]
ls_stemmed = [ls.stem(word) for word in filtered_words]
lemmatized = [lemmatizer.lemmatize(word) for word in filtered_words]

print(ps_stemmed)
print(ls_stemmed)
print(lemmatized)


In [None]:
# Q3 - Vectorization and Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Example texts
texts = [
    "Great product, works as expected!",
    "Worst purchase ever, broke after one use.",
    "Fantastic service and very fast delivery."
]

# Step 1: Bag of Words (BoW)
# Learn the vocabulary from the texts, then encode each text as term counts.
vectorizer = CountVectorizer()
bow = vectorizer.fit(texts).transform(texts)
print(vectorizer.get_feature_names_out())
print(bow.toarray())

# Step 2: TF-IDF Vectorization
# Same two stages as above, with TF-IDF weights instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
tfidf = tfidf_vectorizer.fit(texts).transform(texts)
print(tfidf_vectorizer.get_feature_names_out())
print(tfidf.toarray())

# Step 3: Create a dictionary of word occurrence in texts
# Maps each word to the list of indices of the texts it occurs in. Words are
# lower-cased and extracted with a regex so that punctuation does not become
# part of a key ("product," and "product" would otherwise be different words).
top_words = {}
for i, text in enumerate(texts):
    words = re.findall(r'\w+', text.lower())
    for word in words:
        top_words.setdefault(word, []).append(i)
print(top_words)

In [None]:
# Q4 - Text Similarity
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import re

text1 = "Artificial Intelligence (AI) allows machines to mimic human intelligence by interpreting data and making choices. AI is revolutionizing sectors with use in healthcare, finance, and manufacturing."
text2 = "Blockchain is a distributed digital record that keeps track of transactions in a secure and open manner. It is perhaps best recognized for enabling cryptocurrencies such as Bitcoin, but it is also used in supply chain management."

# Step 1: Preprocess and tokenize the texts
# Both texts get the same treatment: lowercase, then delete every character
# that is neither a word character nor whitespace.
preprocess_text1, preprocess_text2 = (
    re.sub(r'[^\w\s]', '', raw.lower()) for raw in (text1, text2)
)

# Split each cleaned text into word tokens.
token1, token2 = map(word_tokenize, (preprocess_text1, preprocess_text2))

print(token1)
print(token2)

# Step 2: Jaccard Similarity
# Size of the shared vocabulary divided by the size of the combined vocabulary,
# both taken over the distinct tokens of each text.
jaccard = len(set(token1) & set(token2)) / len(set(token1) | set(token2))
print("Jaccard Similarity:", jaccard)

# Step 3: Cosine Similarity using sklearn
# cosine_similarity() operates on numeric feature matrices, not raw strings, so
# both texts are first vectorized over a shared TF-IDF vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectors = TfidfVectorizer().fit_transform([preprocess_text1, preprocess_text2])
# Row 0 is text1, row 1 is text2; the result is a 1x1 matrix.
cosine_sim = cosine_similarity(tfidf_vectors[0], tfidf_vectors[1])
print("Cosine Similarity:", cosine_sim[0][0])

In [None]:
# Q5 - Sentiment Analysis
from textblob import TextBlob
from wordcloud import WordCloud
import matplotlib.pyplot as plt

review = "The service was excellent and the staff was friendly."
blob = TextBlob(review)
# Read both sentiment scores from TextBlob and report them.
polarity, subjectivity = blob.sentiment.polarity, blob.sentiment.subjectivity
print("Polarity:", polarity)
print("Subjectivity:", subjectivity)

# Step 1: Determine sentiment
# Polarity within [-0.1, 0.1] is treated as neutral; outside that band the sign
# decides the label.
sentiment = (
    "Positive" if polarity > 0.1
    else "Negative" if polarity < -0.1
    else "Neutral"
)
print("Sentiment:", sentiment)

# Step 2: WordCloud visualization
# Build the cloud from the review text and show it without axes.
wordcloud = WordCloud().generate(review)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Q6 - Text Generation with LSTM
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import numpy as np

text = "Artificial Intelligence is about making machines that can think and learn like people."

# Step 1: Tokenize the paragraph
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# One more than the number of distinct words in the fitted vocabulary.
total_words = 1 + len(tokenizer.word_index)

# Note: this rebinds `tokens` (a list of word strings in the Q1 cell) to a list
# of integer word indices.
tokens = tokenizer.texts_to_sequences([text])[0]
# Every prefix of at least two tokens becomes one training sequence.
input_sequences = [tokens[:end] for end in range(2, len(tokens) + 1)]

# Left-pad all prefixes to the length of the longest one.
max_seq_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Step 2: Create and train the model
# Seed Python, NumPy and TensorFlow so weight initialization and shuffling (and
# therefore the generated text) are repeatable between runs, as far as the
# underlying ops are deterministic.
tf.keras.utils.set_random_seed(42)

# Inputs: every position of a padded sequence except the last.
# Target: the last position, one-hot encoded over the vocabulary.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]
y = np.eye(total_words)[y]

model = tf.keras.Sequential([
    # `input_length` is intentionally not passed: the layer does not need it and
    # current Keras releases flag the argument as deprecated.
    tf.keras.layers.Embedding(total_words, 10),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(total_words, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=200, verbose=1)

# Step 3: Text generation function
def generate_text(seed_text, next_words=15):
    """Extend ``seed_text`` by repeatedly appending the model's top predicted word.

    Relies on the module-level ``tokenizer``, ``model``, ``max_seq_len``,
    ``pad_sequences`` and ``np``.

    Args:
        seed_text: Text to start from. Words the tokenizer does not map to an
            index contribute nothing to the model input.
        next_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
    """
    for _ in range(next_words):
        # Re-encode the text so far and left-pad (or truncate) it to the
        # model's input length.
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
        # Direct index -> word lookup instead of scanning word_index every step.
        word = tokenizer.index_word.get(predicted)
        if word is None:
            # No vocabulary word has this index (e.g. the padding index). The
            # text would stay the same, so every later step would see the same
            # input; stop instead of repeating the prediction.
            break
        seed_text += " " + word
    return seed_text

# Generate a continuation for each seed phrase, in this order.
for seed in ("Artificial Intelligence", "AI"):
    print(generate_text(seed))