## TEXT ANALYSIS

In [None]:
# Import necessary libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer

# Download the NLTK resources used below: the punkt tokenizer models, the
# perceptron POS tagger, WordNet (for the lemmatizer) and the stop word lists.
for resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords'):
    nltk.download(resource)

# Sample document
document = """Data science is an inter-disciplinary field that uses scientific methods,
processes, algorithms, and systems to extract knowledge and insights from structured and
unstructured data. Data science is related to data mining, machine learning, and big data."""

# Tokenization
tokens = word_tokenize(document)

# POS tagging
# Tag the complete token sequence (before any filtering) so the tagger sees
# every word in its original sentence context.
pos_tags = nltk.pos_tag(tokens)

# Stop word removal
# Filter the (token, tag) pairs rather than the bare tokens so that every
# surviving word keeps the POS tag it received in context; the lemmatizer
# needs that tag further down. Tokens without a single alphanumeric character
# (the ',' and '.' produced by the tokenizer) are dropped as well, otherwise
# they would be stemmed and lemmatized as if they were words.
stop_words = set(stopwords.words('english'))
filtered_tagged = [
    (word, tag)
    for word, tag in pos_tags
    if word.lower() not in stop_words and any(ch.isalnum() for ch in word)
]
filtered_tokens = [word for word, _ in filtered_tagged]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

# Lemmatization
# WordNetLemmatizer.lemmatize() treats its input as a noun unless a POS is
# passed, so the Penn Treebank tag of each token is mapped onto the matching
# WordNet POS letter. Only the first two characters of the tag are used, which
# folds e.g. VB/VBZ/VBN into "verb"; every other tag falls back to noun ('n').
PENN_TO_WORDNET_POS = {'JJ': 'a', 'VB': 'v', 'RB': 'r'}
lemmatizer = WordNetLemmatizer()
# Words are lowercased first, as PorterStemmer does by default, so that
# 'Data' and 'data' end up with the same lemma.
lemmatized_tokens = [
    lemmatizer.lemmatize(word.lower(), PENN_TO_WORDNET_POS.get(tag[:2], 'n'))
    for word, tag in filtered_tagged
]

# TF-IDF representation
# NOTE: the corpus holds a single document, so every term has the same
# document frequency and the IDF factor is identical for all of them. The
# scores printed below are therefore just length-normalised term frequencies;
# add more documents to `corpus` to get discriminative IDF weights.
corpus = [document]
vectorizer = TfidfVectorizer()
tfidf_representation = vectorizer.fit_transform(corpus)
tfidf_features = vectorizer.get_feature_names_out()

# Print results
report_sections = [
    ("Original Document:", document),
    ("\nTokenization:", tokens),
    ("\nPOS Tagging:", pos_tags),
    ("\nStop Word Removal:", filtered_tokens),
    ("\nStemming:", stemmed_tokens),
    ("\nLemmatization:", lemmatized_tokens),
    ("\nTF-IDF Representation:", tfidf_representation.toarray()),
    ("\nTF-IDF Features:", tfidf_features),
]
for heading, value in report_sections:
    print(heading)
    print(value)


Original Document:
Data science is an inter-disciplinary field that uses scientific methods, processes, algorithms, and systems to extract knowledge and insights from structured and unstructured data. Data science is related to data mining, machine learning, and big data.

Tokenization:
['Data', 'science', 'is', 'an', 'inter-disciplinary', 'field', 'that', 'uses', 'scientific', 'methods', ',', 'processes', ',', 'algorithms', ',', 'and', 'systems', 'to', 'extract', 'knowledge', 'and', 'insights', 'from', 'structured', 'and', 'unstructured', 'data', '.', 'Data', 'science', 'is', 'related', 'to', 'data', 'mining', ',', 'machine', 'learning', ',', 'and', 'big', 'data', '.']

POS Tagging:
[('Data', 'NNP'), ('science', 'NN'), ('is', 'VBZ'), ('an', 'DT'), ('inter-disciplinary', 'JJ'), ('field', 'NN'), ('that', 'WDT'), ('uses', 'VBZ'), ('scientific', 'JJ'), ('methods', 'NNS'), (',', ','), ('processes', 'NNS'), (',', ','), ('algorithms', 'NN'), (',', ','), ('and', 'CC'), ('systems', 'NNS'), ('t

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
