In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

dataset = pd.read_csv('../../data/processed/processed_data.csv')


skills_text = dataset['Skills Required'].str.replace(', ', ' ')

# Vectorize skills using CountVectorizer
vectorizer = CountVectorizer()
skills_matrix = vectorizer.fit_transform(skills_text)

# Apply Latent Dirichlet Allocation (LDA) for topic modeling
num_topics = 3  # You can adjust the number of topics based on your analysis
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(skills_matrix)

# Extract the most important words (skills) for each topic
feature_names = vectorizer.get_feature_names_out()
topic_top_words = {}
for topic_idx, topic in enumerate(lda.components_):
    top_words_idx = topic.argsort()[:-10:-1]  # Extract top 10 words for each topic
    top_words = [feature_names[i] for i in top_words_idx]
    topic_top_words[f'Topic {topic_idx + 1}'] = top_words

# Display the most important words (skills) for each topic
for topic, top_words in topic_top_words.items():
    print(f"{topic}: {', '.join(top_words)}")


Topic 1: data, learning, analysis, python, visualization, machine, statistical, deep, sql
Topic 2: data, modeling, sql, cloud, computing, etl, database, hadoop, design
Topic 3: data, spark, hadoop, sql, python, etl, aws, nosql, warehousing
