In [7]:
# Import necessary libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK dependencies
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:

from google.colab import drive
drive.mount('/content/drive')

filepath = '/content/drive/My Drive/cleaned_data.csv'
data = pd.read_csv(filepath)
data


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,Product Name,Cleaned Description
0,Cloud Secure Agent,informatica cloud secure agent lightweight pro...
1,MicroStrategy BI Platform,microstrategy platform supports interactive da...
2,Signature Services,signature services package key services includ...
3,VizLib,vizlib fully supported thirdparty visualisatio...
4,PowerCenter Connector,informatica powercenter powerconnect adapters ...
...,...,...
1489,SQL Toolbelt,essential industrystandard tools sql server de...
1490,SHAREFILE,sharefile secure content collaboration file sh...
1491,GridGain,gridgain middlewareclass inmemory computing pl...
1492,Deploy and Inventory,pdq deploy inventory allow automate patch mana...


In [9]:

# Text Preprocessing: Tokenization, lowercasing, and removing irrelevant stop words
stop_words = set(stopwords.words('english'))

# Additional domain-specific words that should be excluded
custom_stopwords = set(["platform", "services", "solution", "business", "cloud", "data"])

# Preprocess function
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenize the text
    tokens = [word for word in tokens if word not in stop_words and word not in custom_stopwords]  # Remove stopwords
    return " ".join(tokens)

# Apply preprocessing
data['processed_description'] = data['Cleaned Description'].apply(preprocess_text)

# Display the processed data
print(data[['Product Name', 'processed_description']].head())

                Product Name  \
0         Cloud Secure Agent   
1  MicroStrategy BI Platform   
2         Signature Services   
3                     VizLib   
4      PowerCenter Connector   

                               processed_description  
0  informatica secure agent lightweight program r...  
1  microstrategy supports interactive dashboards ...  
2  signature package key including limited consul...  
3  vizlib fully supported thirdparty visualisatio...  
4  informatica powercenter powerconnect adapters ...  


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# N-Gram + TF-IDF (1, 3) captures meaningful phrases and technical terms
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 3), stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(data['processed_description'])

# Display the shape of the TF-IDF matrix and sample of feature names
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)
print("Top Features:", tfidf_vectorizer.get_feature_names_out()[:20])


TF-IDF Matrix Shape: (1494, 1000)
Top Features: ['ability' 'able' 'accelerate' 'access' 'accessing' 'account' 'accounts'
 'accuracy' 'accurate' 'achieve' 'acquired' 'action' 'actionable'
 'activities' 'activity' 'add' 'additional' 'address' 'addresses'
 'administration']


In [11]:
from gensim.models import Word2Vec

# Tokenize each description into words for training Word2Vec
data['tokenized_description'] = data['processed_description'].apply(word_tokenize)

# Train Word2Vec on the tokenized descriptions
w2v_model = Word2Vec(sentences=data['tokenized_description'], vector_size=100, window=5, min_count=2, workers=4)

# Example: Check embeddings for specific product-related words like 'analytics'
print("Word Embeddings for 'analytics':", w2v_model.wv['analytics'])


Word Embeddings for 'analytics': [-0.4761081   0.5958502   0.1832276  -0.08203638  0.06705578 -0.85160244
  0.21753314  1.0176791  -0.2952499  -0.1753881  -0.24361968 -0.78349
  0.086216    0.09888961  0.03706609 -0.36820158  0.15415414 -0.7676885
  0.15587625 -0.986524    0.30084443  0.16017541  0.34534752 -0.07428773
 -0.21183078 -0.1232057  -0.4649     -0.28937036 -0.47609714  0.16065511
  0.5864829   0.10412958  0.13270596 -0.34322745 -0.28229636  0.5511374
  0.09483065 -0.46241528 -0.2847271  -0.97729945 -0.05040683 -0.3370872
 -0.28508118  0.15992494  0.4125564  -0.14550075 -0.5302411   0.06752918
  0.4391401   0.36427152  0.33561248 -0.42112073 -0.0487316  -0.203861
 -0.44011328  0.39641657  0.3243997   0.02523267 -0.4705747   0.10375152
  0.08144609  0.04631816 -0.3673479  -0.00826789 -0.5177695   0.32741952
  0.1383204   0.29053482 -0.6006676   0.6488828  -0.33393657  0.5000058
  0.55412596 -0.26536283  0.34902397  0.22356127 -0.05765345 -0.1598339
 -0.35120767  0.14320122 -0.

In [12]:
from textblob import TextBlob

# Function to get sentiment polarity
def get_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

data['sentiment_score'] = data['processed_description'].apply(get_sentiment)

# Display sentiment scores
print(data[['Product Name', 'sentiment_score']].head())


                Product Name  sentiment_score
0         Cloud Secure Agent         0.533333
1  MicroStrategy BI Platform         0.065000
2         Signature Services         0.244643
3                     VizLib         0.259091
4      PowerCenter Connector         0.000000


In [13]:
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Fit LDA model with 5 topics
lda = LDA(n_components=5, random_state=42)
lda_topics = lda.fit_transform(tfidf_matrix)

# Display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
display_topics(lda, tfidf_feature_names, 10)


Topic 0:
database oracle service applications language software source management open distributed
Topic 1:
remote virtual machine file windows enterprise premium zos security ibm
Topic 2:
network management performance software monitoring support provides application automation storage
Topic 3:
customer content experience management sales salesforce marketing digital analytics intelligence
Topic 4:
management software server sap microsoft applications application enterprise address provides


In [14]:
# Define domain-specific keywords (e.g., AI-related, analytics-related)
keywords = ['ai', 'machine learning', 'big data', 'analytics', 'iot', 'security', 'cloud']

# Count occurrences of keywords in each description
def count_keywords(text):
    return sum([text.count(word) for word in keywords])

data['keyword_count'] = data['processed_description'].apply(count_keywords)

# Display keyword counts
print(data[['Product Name', 'keyword_count']].head())


                Product Name  keyword_count
0         Cloud Secure Agent              0
1  MicroStrategy BI Platform              1
2         Signature Services              0
3                     VizLib              0
4      PowerCenter Connector              0


In [15]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# Named Entity Recognition
def extract_entities(text):
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.ne_chunk(tagged)
    return entities

data['named_entities'] = data['processed_description'].apply(extract_entities)

# Part of Speech (POS) Tagging
data['pos_tags'] = data['processed_description'].apply(lambda x: nltk.pos_tag(nltk.word_tokenize(x)))

# Display NER and POS tags
print(data[['Product Name', 'named_entities', 'pos_tags']].head())


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


                Product Name  \
0         Cloud Secure Agent   
1  MicroStrategy BI Platform   
2         Signature Services   
3                     VizLib   
4      PowerCenter Connector   

                                      named_entities  \
0  [(informatica, JJ), (secure, NN), (agent, NN),...   
1  [(microstrategy, NN), (supports, NNS), (intera...   
2  [(signature, NN), (package, NN), (key, JJ), (i...   
3  [(vizlib, NN), (fully, RB), (supported, VBD), ...   
4  [(informatica, NN), (powercenter, NN), (powerc...   

                                            pos_tags  
0  [(informatica, JJ), (secure, NN), (agent, NN),...  
1  [(microstrategy, NN), (supports, NNS), (intera...  
2  [(signature, NN), (package, NN), (key, JJ), (i...  
3  [(vizlib, NN), (fully, RB), (supported, VBD), ...  
4  [(informatica, NN), (powercenter, NN), (powerc...  


In [18]:
data

Unnamed: 0,Product Name,Cleaned Description,processed_description,tokenized_description,sentiment_score,keyword_count,named_entities,pos_tags
0,Cloud Secure Agent,informatica cloud secure agent lightweight pro...,informatica secure agent lightweight program r...,"[informatica, secure, agent, lightweight, prog...",0.533333,0,"[(informatica, JJ), (secure, NN), (agent, NN),...","[(informatica, JJ), (secure, NN), (agent, NN),..."
1,MicroStrategy BI Platform,microstrategy platform supports interactive da...,microstrategy supports interactive dashboards ...,"[microstrategy, supports, interactive, dashboa...",0.065000,1,"[(microstrategy, NN), (supports, NNS), (intera...","[(microstrategy, NN), (supports, NNS), (intera..."
2,Signature Services,signature services package key services includ...,signature package key including limited consul...,"[signature, package, key, including, limited, ...",0.244643,0,"[(signature, NN), (package, NN), (key, JJ), (i...","[(signature, NN), (package, NN), (key, JJ), (i..."
3,VizLib,vizlib fully supported thirdparty visualisatio...,vizlib fully supported thirdparty visualisatio...,"[vizlib, fully, supported, thirdparty, visuali...",0.259091,0,"[(vizlib, NN), (fully, RB), (supported, VBD), ...","[(vizlib, NN), (fully, RB), (supported, VBD), ..."
4,PowerCenter Connector,informatica powercenter powerconnect adapters ...,informatica powercenter powerconnect adapters ...,"[informatica, powercenter, powerconnect, adapt...",0.000000,0,"[(informatica, NN), (powercenter, NN), (powerc...","[(informatica, NN), (powercenter, NN), (powerc..."
...,...,...,...,...,...,...,...,...
1489,SQL Toolbelt,essential industrystandard tools sql server de...,essential industrystandard tools sql server de...,"[essential, industrystandard, tools, sql, serv...",0.000000,0,"[(essential, JJ), (industrystandard, NN), (too...","[(essential, JJ), (industrystandard, NN), (too..."
1490,SHAREFILE,sharefile secure content collaboration file sh...,sharefile secure content collaboration file sh...,"[sharefile, secure, content, collaboration, fi...",0.059524,1,"[(sharefile, JJ), (secure, NN), (content, JJ),...","[(sharefile, JJ), (secure, NN), (content, JJ),..."
1491,GridGain,gridgain middlewareclass inmemory computing pl...,gridgain middlewareclass inmemory computing he...,"[gridgain, middlewareclass, inmemory, computin...",0.148667,3,"[(gridgain, NN), (middlewareclass, NN), (inmem...","[(gridgain, NN), (middlewareclass, NN), (inmem..."
1492,Deploy and Inventory,pdq deploy inventory allow automate patch mana...,pdq deploy inventory allow automate patch mana...,"[pdq, deploy, inventory, allow, automate, patc...",0.000000,0,"[(pdq, JJ), (deploy, NN), (inventory, NN), (al...","[(pdq, JJ), (deploy, NN), (inventory, NN), (al..."
