In [1]:
from pprint import pformat
import os
import numpy as np
import pandas as pd
from datetime import datetime
import util

In [2]:
def init_data():
    """Load the course and roadmap CSVs and publish the derived frames as globals.

    Reads ``udemy_courses_final.csv`` and ``roadmap_nodes_final.csv`` from the
    embedding-generation data directory, splits the roadmap nodes into a
    concept frame and a topic frame using their ``type`` column, builds the
    id-indexed career-role lookup table, and prints a short summary.

    Nothing is returned. The results are exposed through the module-level
    names ``udemy_courses_df``, ``roadmap_concepts_df``, ``roadmap_topics_df``
    and ``roadmaps_df``.
    """
    global udemy_courses_df, roadmap_concepts_df, roadmaps_df, roadmap_topics_df

    data_dir = "../../embedding-generation/data/"
    courses_csv = "udemy_courses_final.csv"
    nodes_csv = "roadmap_nodes_final.csv"

    udemy_courses_df = pd.read_csv(data_dir + courses_csv)
    nodes_df = pd.read_csv(data_dir + nodes_csv)

    # reset_index() without drop=True keeps the original row label as an
    # "index" column in both derived frames.
    node_type = nodes_df["type"]
    roadmap_concepts_df = nodes_df[node_type == "concept"].reset_index()
    roadmap_topics_df = nodes_df[node_type == "topic"].reset_index()

    roles = [
        "AI Data Scientist",
        "Android Developer",
        "Backend Developer",
        "Blockchain Developer",
        "Devops Engineer",
        "Frontend Developer",
        "Full Stack Developer",
        "Game Developer",
        "QA Engineer",
        "UX Designer",
    ]

    # Role ids are 1-based and follow the order of the list above.
    role_ids = np.arange(1, len(roles) + 1)
    roadmaps_df = pd.DataFrame({"id": role_ids, "name": roles}).set_index("id")

    print(f"Data is initialized using {courses_csv} and {nodes_csv}")
    print(f"Total number of roadmap concepts: {len(roadmap_concepts_df)}")
    print(f"Total number of courses: {len(udemy_courses_df)}")
    print("Career Roles: \n" + pformat(list(zip(role_ids, roles))))

In [3]:
# Load the CSVs and populate the module-level DataFrames; prints a summary.
init_data()


Data is initialized using udemy_courses_final.csv and roadmap_nodes_final.csv
Total number of roadmap concepts: 869
Total number of courses: 453
Career Roles: 
[(1, 'AI Data Scientist'),
 (2, 'Android Developer'),
 (3, 'Backend Developer'),
 (4, 'Blockchain Developer'),
 (5, 'Devops Engineer'),
 (6, 'Frontend Developer'),
 (7, 'Full Stack Developer'),
 (8, 'Game Developer'),
 (9, 'QA Engineer'),
 (10, 'UX Designer')]


In [4]:
# Preview the first rows of the concept nodes.
roadmap_concepts_df.head()

Unnamed: 0,index,id,name,content,type
0,1,10000,linear algebra calculus mathematical analysis,- [Mathematics for Machine Learning Specializa...,concept
1,2,10001,differential calculus,- [Algebra and Differential Calculus for Data ...,concept
2,4,10100,statistics clt,- [Introduction to Statistics](https://imp.i38...,concept
3,5,10101,hypothesis testing,- [Introduction to Statistical Analysis: Hypot...,concept
4,6,10102,probability sampling,- [Probability and Statistics: To p or not to ...,concept


In [26]:
def get_parent_topics(concept_id):
    """Return the ids of every ancestor of ``concept_id`` that is >= 100.

    The parent of an id is obtained by dropping its last two decimal digits
    (``id // 100``). The walk continues upward until the next ancestor would
    fall below 100; such ids are excluded (presumably they identify the
    roadmap itself rather than a topic -- confirm against the data).

    Args:
        concept_id: Integer node id (a Python or NumPy integer).

    Returns:
        list[int]: Ancestor ids ordered from the nearest parent to the most
        distant one, e.g. ``401090200 -> [4010902, 40109, 401]``. Empty when
        the id is below 10000, since its parent would be below 100.
    """
    # Work on a plain Python int so the arithmetic is exact for any magnitude.
    current = int(concept_id)
    parent_topic_list = []
    # A parent is only kept when it is >= 100, i.e. when the child is >= 10000.
    while current >= 10000:
        # Floor division, not int(x / 100): true division goes through a
        # float and silently rounds ids larger than 2**53.
        current //= 100
        parent_topic_list.append(current)
    return parent_topic_list

In [27]:
# Tally, for every topic id, how many concepts have it among their ancestors.
topic_concept_count = dict.fromkeys(roadmap_topics_df['id'], 0)

for concept_id in roadmap_concepts_df['id']:
    parent_topics = get_parent_topics(concept_id)
    for topic_id in parent_topics:
        # Ancestors that are not listed as topics are ignored.
        if topic_id not in topic_concept_count:
            continue
        topic_concept_count[topic_id] += 1

In [33]:
# One row per topic id, holding the number of concepts counted beneath it.
topic_concept_count_df = pd.DataFrame.from_dict(
    topic_concept_count,
    orient='index',
    columns=['Total Concepts'],
)

# Largest topics first.
sorted_topic_concept_count_df = topic_concept_count_df.sort_values(
    'Total Concepts',
    ascending=False,
)

In [35]:
# Show the per-topic concept counts, sorted from largest to smallest.
sorted_topic_concept_count_df

Unnamed: 0,Total Concepts
903,31
804,29
900,28
409,27
1005,22
...,...
80501,1
503,1
100103,1
501,1


In [32]:
# Raw per-topic counts keyed by topic id (long output).
topic_concept_count

{100: 2,
 101: 6,
 102: 2,
 103: 3,
 104: 1,
 105: 1,
 106: 1,
 107: 1,
 200: 2,
 201: 6,
 202: 4,
 203: 9,
 20300: 3,
 20304: 3,
 204: 3,
 205: 15,
 20502: 4,
 20503: 4,
 20505: 4,
 206: 4,
 207: 3,
 208: 5,
 209: 8,
 20900: 5,
 210: 2,
 211: 4,
 212: 2,
 213: 3,
 300: 6,
 301: 3,
 302: 10,
 303: 9,
 304: 2,
 305: 3,
 306: 5,
 307: 5,
 308: 7,
 309: 14,
 30907: 7,
 310: 4,
 311: 4,
 31101: 2,
 312: 10,
 313: 3,
 314: 5,
 316: 6,
 317: 2,
 318: 2,
 319: 3,
 320: 2,
 321: 1,
 322: 4,
 325: 9,
 32500: 5,
 400: 6,
 401: 21,
 40109: 12,
 4010902: 8,
 4010903: 2,
 402: 3,
 403: 14,
 40300: 3,
 40301: 3,
 404: 4,
 405: 7,
 40500: 3,
 40501: 4,
 406: 1,
 407: 1,
 408: 3,
 409: 27,
 40900: 3,
 40906: 5,
 40907: 4,
 40908: 3,
 40909: 3,
 40910: 4,
 410: 8,
 500: 5,
 501: 1,
 502: 5,
 503: 1,
 504: 3,
 505: 9,
 507: 8,
 508: 14,
 50807: 7,
 509: 6,
 510: 4,
 511: 3,
 512: 8,
 513: 4,
 514: 4,
 515: 5,
 516: 5,
 517: 5,
 518: 3,
 519: 2,
 520: 4,
 521: 4,
 600: 6,
 601: 6,
 602: 3,
 603: 5,
 604:

In [25]:
# Spot-check get_parent_topics on a single concept id (10100 -> [101]).
get_parent_topics(10100)

[101]

In [42]:
# NumPy arrays of all concept ids and all topic ids, used as inputs to
# calculate_topic_coverage.
roadmap_concepts_id_list = roadmap_concepts_df["id"].to_numpy()
roadmap_topics_id_list = roadmap_topics_df["id"].to_numpy()

In [43]:
def calculate_topic_coverage(concept_id_list, roadmap_topics_id_list, roadmap_concepts_id_list,
                             min_coverage_pct=40):
    """Return the topics that a selection of concepts covers sufficiently.

    For each topic, coverage is the number of distinct selected concept ids
    that have the topic among their ancestors (per ``get_parent_topics``),
    divided by the number of roadmap concept ids that do, times 100.

    Args:
        concept_id_list: Iterable of selected concept ids. Duplicates are
            counted once, so repeating an id cannot inflate coverage.
        roadmap_topics_id_list: Iterable of the topic ids to score.
        roadmap_concepts_id_list: Iterable of all roadmap concept ids; these
            define each topic's total.
        min_coverage_pct: Minimum coverage percentage (inclusive) a topic
            needs in order to be returned. Defaults to 40.

    Returns:
        list: Topic ids whose coverage is >= ``min_coverage_pct``, ordered by
        descending coverage (ties keep the order of
        ``roadmap_topics_id_list``). Topics with no roadmap concepts beneath
        them are never returned.
    """
    # Materialise once so the ids can be reused for both tallies, even if the
    # caller passed a one-shot iterator.
    topic_ids = list(roadmap_topics_id_list)

    def _count_under_topics(concept_ids):
        """Count, per known topic, how many of ``concept_ids`` lie beneath it."""
        counts = dict.fromkeys(topic_ids, 0)
        for concept_id in concept_ids:
            for topic_id in get_parent_topics(concept_id):
                # Ancestors that are not listed as topics are ignored.
                if topic_id in counts:
                    counts[topic_id] += 1
        return counts

    topic_concept_count = _count_under_topics(roadmap_concepts_id_list)
    # set(): a concept selected twice still covers its topics only once.
    topic_coverage_count = _count_under_topics(set(concept_id_list))

    # Coverage percentage; topics without any concepts are skipped to avoid
    # dividing by zero.
    coverage_percentage = {
        topic_id: (covered / topic_concept_count[topic_id]) * 100
        for topic_id, covered in topic_coverage_count.items()
        if topic_concept_count[topic_id] > 0
    }

    # Keep topics that reach the threshold, best-covered first (stable sort).
    covered_topics = [
        (topic_id, coverage)
        for topic_id, coverage in coverage_percentage.items()
        if coverage >= min_coverage_pct
    ]
    covered_topics.sort(key=lambda item: item[1], reverse=True)

    return [topic_id for topic_id, _ in covered_topics]

In [44]:
# Example selection of concept ids to score against the roadmap topics.
concept_id_list = [
    10100,
    1000000,
    1000101,
    10201,
    10302,
]

selected_topic_id_list = calculate_topic_coverage(
    concept_id_list=concept_id_list,
    roadmap_topics_id_list=roadmap_topics_id_list,
    roadmap_concepts_id_list=roadmap_concepts_id_list,
)

In [45]:
# Topic ids that reached at least 40% coverage, highest coverage first.
selected_topic_id_list

[100, 102]