In [None]:
import ast
import json
import os
import pathlib
import random
import re
import textwrap
import time

import cv2
import google.generativeai as genai
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import display
from IPython.display import Markdown
from PIL import Image
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


!pip install -q -U google-generativeai


# Configure the Gemini client with the key stored in the API_KEY environment
# variable (the lookup yields None when the variable is unset).
GOOGLE_API_KEY = os.getenv('API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)
# Model handle used by the prompting cells further down.
model = genai.GenerativeModel('gemini-pro')



# Cleaning and Processing Skills:

In [None]:
### processing the first batch of users using the Gemini API

# Scraped user records; `like_name` holds the text that is sent to the model.
df = pd.read_csv('web_scraping_results.csv')
df1 = pd.DataFrame(columns=['id', 'url', 'skills'])

prompt_base = "return a list of 3 skills that the following courses might teach, each should be 3 words or less, the response should be in this form: 1: the first skill, 2: the second skill, 3: the third skill"
# The first batch covers rows 0..2199; clamp to the frame length so a shorter
# input file does not raise a KeyError.
for i in range(0, min(2200, len(df))):
    like_name = df.loc[i, 'like_name']
    if pd.isna(like_name):  # nothing to send for this user
        continue
    s = str(like_name).replace('[', '').replace(']', '').replace(':', '')
    if len(s) == 0:
        continue
    # Keep a separator between the instruction and the course list so the two
    # do not run together in the prompt.
    prompt = prompt_base + ' ' + s
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            candidate_count=1,
            temperature=0.55)
    )
    try:
        skills_text = response.text
    except ValueError:
        # The `text` accessor raises when the reply carries no usable text
        # part (e.g. a blocked prompt); record the row without skills instead
        # of aborting the whole batch.
        skills_text = None
    # Write the row in one step via .loc; chained `df1['skills'][i] = ...`
    # assignment is not guaranteed to reach the frame.
    df1.loc[i] = [df.loc[i, 'id'], df.loc[i, 'url'], skills_text]
    time.sleep(1.5)  # pause between consecutive API requests

In [None]:
### processing the second batch of users using the Gemini API
df2 = pd.DataFrame(columns=['id', 'url', 'skills'])

prompt_base = "return a list of 3 skills that the following courses might teach, each should be 3 words or less, the response should be in this form: 1: the first skill, 2: the second skill, 3: the third skill"
# The first batch ends at row 2199 (inclusive), so this batch starts at 2200;
# starting at 2199 would query and store that row a second time.
for i in range(2200, len(df)):
    like_name = df.loc[i, 'like_name']
    if pd.isna(like_name):  # nothing to send for this user
        continue
    s = str(like_name).replace('[', '').replace(']', '').replace(':', '')
    if len(s) == 0:
        continue
    # Keep a separator between the instruction and the course list so the two
    # do not run together in the prompt.
    prompt = prompt_base + ' ' + s
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            candidate_count=1,
            temperature=0.55)
    )
    try:
        skills_text = response.text
    except ValueError:
        # The `text` accessor raises when the reply carries no usable text
        # part (e.g. a blocked prompt); record the row without skills instead
        # of aborting the whole batch.
        skills_text = None
    # Write the row in one step via .loc; chained `df2['skills'][i] = ...`
    # assignment is not guaranteed to reach the frame.
    df2.loc[i] = [df.loc[i, 'id'], df.loc[i, 'url'], skills_text]
    time.sleep(1.5)  # pause between consecutive API requests

In [None]:
### printing an example
# `None` removes the column-width limit. The former `-1` sentinel is deprecated
# (see the FutureWarning recorded below) and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
df2.head(30)

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,id,url,skills
0,hungwenc,https://www.linkedin.com/in/hungwenc,"1: Java Development, 2: Cloud Development, 3: Eclipse MicroProfile\n1: Python Programming, 2: Efficiency, 3: Problem Solving\n1: Cloud Computing, 2: Hadoop, 3: Apache Spark"
1,andremagni,https://www.linkedin.com/in/andremagni,1: Data management\n2: Data analysis\n3: Machine learning
2,adithya-ramesh-949496137,https://www.linkedin.com/in/adithya-ramesh-949496137,1: Data analysis\n2: Data visualization\n3: Statistical modeling
3,grant-neuman,https://www.linkedin.com/in/grant-neuman,1: Machine learning algorithms\n2: Data modeling\n3: Recommender systems
5,bertrand-chauvaux-1a562164,https://www.linkedin.com/in/bertrand-chauvaux-1a562164,"1: Identity Management, 2: Access Control, 3: Authentication\n1: Software Development, 2: Design Patterns, 3: Agile Methodologies\n1: Embedded Programming, 2: C Language, 3: Real-Time Systems"
7,shivanispv,https://www.linkedin.com/in/shivanispv,1: Exception handling\n2: Spring MVC framework\n3: Functional programming
8,berdikhan,https://www.linkedin.com/in/berdikhan,1: Mobile app design\n2: WatchOS app development\n3: Xcode development
9,nitish-kumar-vaja-aaa095b0,https://www.linkedin.com/in/nitishkumarvaja,1: Finite element analysis\n2: Productivity management\n3: Injection molding design
10,wonjunjang,https://www.linkedin.com/in/wonjunjang,1: Python optimization\n2: .NET UI development\n3: Java concurrency
11,sachi-sharma,https://www.linkedin.com/in/sachi-sharma,1: Java web development\n2: Angular development\n3: Servlet development


In [None]:
#### converting to excel file to save
# Keep only rows that carry an id. The result is assigned back to df1 (the
# original bound it to an unused `d1`), and `.copy()` makes both frames
# independent so the column assignments below act on real frames, not slices.
df1 = df1[df1['id'] != ''].copy()
df2 = df2[df2['id'] != ''].copy()

# Flatten multi-line answers onto one line, each batch from its OWN column
# (the original overwrote df1['skills'] with df2's values).
df1['skills'] = df1['skills'].str.replace('\n', ' ')
df2['skills'] = df2['skills'].str.replace('\n', ' ')

# Function to extract skills based on the number
def extract_skill(row, num):
    """Return the text that follows the first ``"<num>:"`` marker in ``row``.

    The skill ends at the next ``"<digits>:"`` marker or at the end of the
    string, so both comma-separated answers (``"1: a, 2: b, 3: c"``) and
    answers whose items were separated by newlines/spaces
    (``"1: a 2: b 3: c"``) are handled.

    Parameters
    ----------
    row : str or missing value
        One model answer. Anything that is not a string yields ``None``.
    num : int
        Marker number to look for (1, 2 or 3 in this notebook).

    Returns
    -------
    str or None
        The skill text without surrounding whitespace or a trailing comma, or
        ``None`` when ``row`` is missing or contains no such marker.
    """
    if not isinstance(row, str):
        return None
    # (?<!\d) keeps "1:" from matching inside "11:"; the lazy group stops at
    # the next numbered marker (optionally preceded by a comma) or at the end.
    pattern = rf'(?<!\d){num}:\s*(.*?)\s*,?\s*(?=\d+:|$)'
    match = re.search(pattern, row, flags=re.S)
    return match.group(1) if match else None

# Create new columns skill_1, skill_2, skill_3 on both batches
for i in range(1, 4):
    df1[f'skill_{i}'] = df1['skills'].apply(lambda x: extract_skill(x, i))
    df2[f'skill_{i}'] = df2['skills'].apply(lambda x: extract_skill(x, i))
# Both frames keep their skill_* columns so the two exported files share one
# schema (the original dropped them from df1 right after computing them, which
# contradicts the dtypes output recorded below).

print(df1.dtypes)
df1.to_excel('batch1.xlsx', index=False)
df2.to_excel('batch2.xlsx', index=False)


id         object
url        object
skills     object
skill_1    object
skill_2    object
skill_3    object
dtype: object


In [None]:
### when converting to excel some columns had mistakes:
skills1 = pd.read_excel('batch1.xlsx')
skills2 = pd.read_excel('batch2.xlsx')

# Stack both batches, keep the rows that carry an id, and flatten newlines and
# commas in the raw answers to spaces.
skills_df = (
    pd.concat([skills1, skills2])
    .loc[lambda frame: frame['id'] != '']
    .assign(skills=lambda frame: frame['skills'].str.replace('\n', ' ').str.replace(',', ' '))
)

# Define function to identify incorrect rows
def identify_incorrect_rows(row):
    """Return True when any of the markers "1:", "2:" or "3:" occurs more than once in ``row``."""
    return any(row.count(marker) > 1 for marker in ('1:', '2:', '3:'))

# Flag answers with repeated numbering and keep those rows aside for inspection.
repeated_marker_flags = skills_df['skills'].apply(identify_incorrect_rows)
skills_df['incorrect'] = repeated_marker_flags
incorrect_rows = skills_df.loc[skills_df['incorrect'] == True]

# Define function to fix incorrect rows
def fix_incorrect_rows(row):
    """Collapse an answer that contains several numbered lists into one list.

    Every segment that starts with "1:" (up to the next "<digits>:" marker or
    the end of the string) is collected; the first keeps its "1:" label and the
    k-th segment has "1:" rewritten to "k:". The segments are joined with a
    single space.
    """
    segments = re.findall(r'(1:[^\d]*?)(?=\d+:|$)', row)
    renumbered = [
        segment if position == 1 else segment.replace('1:', f'{position}:')
        for position, segment in enumerate(segments, start=1)
    ]
    return ' '.join(renumbered)

# Fix incorrect rows: rewrite only the answers flagged in the 'incorrect' column.
flagged_rows = skills_df['incorrect']
skills_df.loc[flagged_rows, 'skills'] = skills_df.loc[flagged_rows, 'skills'].apply(fix_incorrect_rows)



# Function to extract skills
def extract_skills(row, num):
    """Return the text between the first "<num>:" marker and the following "<num+1>:" marker.

    The result is stripped of surrounding whitespace. When ``row`` does not
    contain "<num>:" the function returns None.
    """
    pieces = row.split(f"{num}:")
    if len(pieces) <= 1:
        return None
    after_marker = pieces[1]
    return after_marker.split(f"{num+1}:")[0].strip()

# Create new columns for skills: lower-cased text, or None when the marker is
# missing or the extracted text is empty.
for i in range(1, 4):
    skills_df[f'skill_{i}'] = skills_df['skills'].apply(
        lambda x: (extract_skills(x, i) or '').lower() or None
    )
skills_df = skills_df.drop(columns=['incorrect'])
# Display the updated DataFrame
print(skills_df)
skills_df.to_csv('skills_df.csv', index=False)

# Preparing Other Attributes:

In [None]:
df = pd.read_csv("scraping_users.csv")
df.columns

# Parse each JSON-encoded course list and keep only the "title" of every entry.
# NOTE(review): the column key below is reproduced exactly as written in the
# original cell; confirm it matches the CSV header character for character.
df['courses_title'] = df['сourses'].apply(
    lambda raw_courses: [course['title'] for course in json.loads(raw_courses)]
)

def extract_field(education):
    """Return the 'field' values found in a JSON-encoded education list.

    Parameters
    ----------
    education : str or missing value
        JSON text holding a list of dicts. Missing cells read from CSV arrive
        as float NaN, which is truthy, so a plain truthiness check would pass
        them to ``json.loads`` and raise TypeError; they are treated as empty.

    Returns
    -------
    list or None
        ``[]`` for missing/blank input, the list of 'field' values when at
        least one entry has that key, otherwise ``None`` (unchanged from the
        original contract).
    """
    if not isinstance(education, str) or not education.strip():
        return []
    education_list = json.loads(education)
    fields = [item['field'] for item in education_list if 'field' in item]
    return fields if fields else None

# Function to extract duration_short from experience
def extract_duration_short(experience):
    """Return the 'duration_short' values found in a JSON-encoded experience list.

    Parameters
    ----------
    experience : str or missing value
        JSON text holding a list of dicts. Missing cells read from CSV arrive
        as float NaN, which is truthy, so a plain truthiness check would pass
        them to ``json.loads`` and raise TypeError; they are treated as empty.

    Returns
    -------
    list or None
        ``[]`` for missing/blank input, the list of 'duration_short' values
        when at least one entry has that key, otherwise ``None`` (unchanged
        from the original contract).
    """
    if not isinstance(experience, str) or not experience.strip():
        return []
    experience_list = json.loads(experience)
    durations = [item['duration_short'] for item in experience_list if 'duration_short' in item]
    return durations if durations else None

# Derive one column per extractor and preview each result right after it is built.
derived_columns = (
    ('degree_field', 'education', extract_field),
    ('exp_duration', 'experience', extract_duration_short),
)
for target, source, extractor in derived_columns:
    df[target] = df[source].apply(extractor)
    print(df[target].head())

# Keep the id together with the derived attributes only.
selected_columns = ['id', 'courses_title', 'degree_field', 'exp_duration']
df_selected = df[selected_columns]


# Function to translate duration to months
def translate_to_months(duration):
    """Sum a list of duration strings into a total number of months.

    Recognised shapes: "<y> year(s) <m> month(s)", "less than a year" (counted
    as 6 months), "<y> year(s)" and "<m> month(s)". Entries matching none of
    these contribute nothing. ``None`` and ``[]`` give 0.
    """
    if duration is None or duration == []:
        return 0

    total_months = 0
    for dur in duration:
        mentions_year = "year" in dur
        mentions_month = "month" in dur
        if mentions_year and mentions_month:
            # Tokens alternate number/unit, so every second token is a number.
            years, months = map(int, dur.split()[::2])
            total_months += years * 12 + months
        elif "less than a year" in dur:
            total_months += 6
        elif mentions_year:
            total_months += int(dur.split()[0]) * 12
        elif mentions_month:
            total_months += int(dur.split()[0])
    return total_months

# Convert each user's list of experience durations into a total number of months.
df_selected = df_selected.assign(
    exp_duration=df_selected['exp_duration'].apply(translate_to_months)
)
# Replace a missing degree list (None) by an empty list.
final_df = df_selected[['id','courses_title','degree_field','exp_duration']].assign(
    degree_field=lambda frame: frame['degree_field'].apply(lambda x: [] if x is None else x)
)
# Save the selected columns to a new CSV file
final_df.to_csv('clustering_df.csv', index=False)

In [None]:

# Reload the saved attributes and drop rows where id is empty.
clustering_df = pd.read_csv('clustering_df.csv').loc[lambda frame: frame['id'] != '']

# Inner-join with the per-user skills on 'id'; the raw answer text is no longer needed.
merged_df = clustering_df.merge(skills_df, on='id', how='inner').drop(columns=['skills'])

# Certifications come from the original scrape.
users = pd.read_csv('scraping_users.csv')[['id', 'certifications']]

# Define function to extract titles
def extract_titles(cert_list):
    """Return the 'title' of every certification in ``cert_list``.

    Parameters
    ----------
    cert_list : str, list, or missing value
        Either JSON text encoding a list of dicts or an already-parsed list.
        Missing values (None, float NaN) and blank strings give ``[]`` without
        going through the error path, so no message is printed for them.

    Returns
    -------
    list
        The titles, or ``[]`` when the input is missing or cannot be processed
        (malformed JSON, entries that are not dicts, entries without 'title').
        Processing failures are reported with a printed message, as before.
    """
    if isinstance(cert_list, str):
        if not cert_list.strip():
            return []
        try:
            cert_list = json.loads(cert_list)
        except ValueError as e:  # json.JSONDecodeError is a ValueError
            print(f"Error processing row: {e}")
            return []
    if not isinstance(cert_list, (list, tuple)):
        # NaN, None or any other non-list payload carries no certifications.
        return []
    try:
        return [cert['title'] for cert in cert_list]
    except (KeyError, TypeError) as e:
        print(f"Error processing row: {e}")
        return []

# Extract titles into 'cert_titles' and discard the raw certifications column.
users = users.assign(
    cert_titles=users['certifications'].apply(extract_titles)
).drop(columns=['certifications'])

model_df = merged_df.merge(users, on='id', how='inner')

# Gather the three skill columns into one list per user, then drop the originals.
skill_columns = ['skill_1', 'skill_2', 'skill_3']
model_df['skills'] = model_df.apply(lambda row: [row[name] for name in skill_columns], axis=1)
model_df = model_df.drop(columns=skill_columns)


# Performing Clustering of Skills:

In [None]:
random.seed(42)
skill1 = model_df['skills'].tolist()
flat_skill1 = [item for sublist in skill1 for item in sublist]
print(len(flat_skill1))
# Keep real strings only (a missing skill can be None or NaN, and the
# vectorizer cannot process either) and sort them so the input order, and with
# it the clustering, is the same on every run.
unique_values = sorted({value for value in flat_skill1 if isinstance(value, str)})

vectorizer = TfidfVectorizer(max_df=0.4, min_df=2, stop_words='english', lowercase=True)
X = vectorizer.fit_transform(unique_values)

# Step 3: Clustering
num_clusters = 150  # Adjust the number of clusters as needed
# random_state pins the k-means initialisation; random.seed() above does not
# influence scikit-learn's random number generation.
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=200, n_init=20, random_state=42)
kmeans.fit(X)
# Step 4: Interpretation and printing clusters
cluster_labels = kmeans.labels_  # Get cluster labels for each data point

clustered_skills = [[] for _ in range(num_clusters)]
for skill, label in zip(unique_values, cluster_labels):
    clustered_skills[label].append(skill)

# Print each cluster separately
for cluster_id, skills_in_cluster in enumerate(clustered_skills):
    print(f"Cluster {cluster_id}:")
    for skill in skills_in_cluster:
        print(skill)

    print()

11325


KeyboardInterrupt: 

Exception ignored in: 'sklearn.cluster._k_means_common._relocate_empty_clusters_sparse'
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/numpy/core/multiarray.py", line 346, in where
    @array_function_from_c_func_and_dispatcher(_multiarray_umath.where)
KeyboardInterrupt: 


Cluster 0:
ngrx state management
redux architecture
redux state management

Cluster 1:
hybrid networking infrastructure
networking fundamentals
linux networking configuration
networking skills
linux networking
azure networking design
networking design
networking
networking concepts
networking configuration
azure networking
cloud networking
wireless networking
networking  communication  job search
enterprise networking
storage networking
low-power wireless networking
cisco networking
azure networking solutions design

Cluster 2:
accounting analysis
semantic analysis
ai-assisted data analysis
fea analysis
log analysis
finite element analysis (fea)
roi analysis
construction productivity analysis
finite element analysis
time series analysis
python data analysis
rpa use case analysis
data analysis expertise
material analysis
exploratory data analysis
vulnerability analysis
biometric analysis
clustering  association  data analysis
data analysis with r
data analysis and visualization
threat a

We noticed that one cluster is very large and uninformative; this problem could not be fixed with TF-IDF clustering alone:

In [None]:
# Locate the cluster that holds the most skills (the first one wins on ties).
cluster_sizes = [len(cluster) for cluster in clustered_skills]
max_index = cluster_sizes.index(max(cluster_sizes))

# Print the index of the list with the maximum length
print("Index of cluster with maximum length:", max_index)

Index of list with maximum length: 10


We will perform reclustering of this large, general cluster:

In [None]:

# numpy, the fitted `vectorizer` and cosine_similarity come from earlier cells.

# Identify the cluster you want to redistribute
cluster_to_redistribute = max_index  # Adjust the cluster ID as needed

# Extract skills from the identified cluster
skills_to_redistribute = clustered_skills[cluster_to_redistribute]

# Candidate targets: every other cluster that has at least one member. Empty
# clusters are skipped because a similarity against zero rows is undefined.
candidate_ids = [
    index for index, cluster in enumerate(clustered_skills)
    if index != cluster_to_redistribute and cluster
]
# Vectorize the members of each candidate cluster once.
cluster_vectorized = {
    index: vectorizer.transform(clustered_skills[index]) for index in candidate_ids
}

# Maps each redistributed skill to the id of the cluster it is most similar to.
skill_transformed_dict = {}
if skills_to_redistribute and candidate_ids:
    # One transform for all skills, then one similarity matrix per cluster,
    # instead of one similarity call per (skill, cluster) pair.
    skill_vectors = vectorizer.transform(skills_to_redistribute)
    mean_similarity = np.column_stack([
        cosine_similarity(skill_vectors, cluster_vectorized[index]).mean(axis=1)
        for index in candidate_ids
    ])
    # argmax returns the first maximum, matching the strict `>` comparison of
    # the per-skill loop this replaces. A skill that scores 0 against every
    # cluster therefore lands in the first candidate cluster.
    best_columns = mean_similarity.argmax(axis=1)
    skill_transformed_dict = {
        skill: candidate_ids[column]
        for skill, column in zip(skills_to_redistribute, best_columns)
    }




In [None]:
# Copy every cluster, not just the outer list: a shallow .copy() shares the
# inner lists, so the appends below would also grow `clustered_skills` and
# re-running this cell would add each skill again.
copy_list = [list(cluster) for cluster in clustered_skills]
for skill, cluster in skill_transformed_dict.items():
    copy_list[cluster].append(skill)

# Drop the oversized cluster now that its members have been reassigned.
copy_list = [cluster for i, cluster in enumerate(copy_list) if i != max_index]


print(copy_list)

[['parsing protobuf messages', 'sending messages', 'rigging', 'programmatic advertising', 'unlearn silence', 'servlets', 'facial retouching', 'rpm distribution', 'merge conflict resolution', 'conflict resolution', 'bias mitigation', 'logical reasoning', 'attributes', 'asking questions', 'organizational alignment', 'foster inclusivity', 'uncovering unconscious bias', 'generating ideas', 'mindset', 'bookkeeping', 'laser cutting', 'messaging with jms', 'focus', 'lombok annotations', 'linear algebra', 'enhanced concentration', 'trauma-informed care', 'recognize hazards', 'cinema 4d', 'vba', 'argumentation', 'e-commerce', 'mental fortitude', 'procurement', 'geometric dimensioning', 'sourcing candidates', 'dimensioning and tolerancing', 'identify root causes', 'camera settings', 'ssh tunneling', 'communicating over rpc', 'sketching', 'values alignment', 'schematic capture', 'goal achievement', 'photoshop rendering', 'exploit vulnerabilities', 'websockets', 'authorization', 'habit formation',

The new clusters:

In [None]:
# From here on `clustered_skills` refers to the redistributed clusters.
clustered_skills = list(copy_list)
for cluster_id, skills_in_cluster in enumerate(clustered_skills):
    print(f"Cluster {cluster_id}:")
    if skills_in_cluster:
        print('\n'.join(map(str, skills_in_cluster)))

    print()

# Find the index of the list with the maximum length (first one wins on ties)
new_cluster_sizes = [len(cluster) for cluster in clustered_skills]
new_max_index = new_cluster_sizes.index(max(new_cluster_sizes))
print(new_max_index)
print(len(clustered_skills[new_max_index]))

Cluster 0:
parsing protobuf messages
sending messages
rigging
programmatic advertising
unlearn silence
servlets
facial retouching
rpm distribution
merge conflict resolution
conflict resolution
bias mitigation
logical reasoning
attributes
asking questions
organizational alignment
foster inclusivity
uncovering unconscious bias
generating ideas
mindset
bookkeeping
laser cutting
messaging with jms
focus
lombok annotations
linear algebra
enhanced concentration
trauma-informed care
recognize hazards
cinema 4d
vba
argumentation
e-commerce
mental fortitude
procurement
geometric dimensioning
sourcing candidates
dimensioning and tolerancing
identify root causes
camera settings
ssh tunneling
communicating over rpc
sketching
values alignment
schematic capture
goal achievement
photoshop rendering
exploit vulnerabilities
websockets
authorization
habit formation
pulse width modulation
camera mounting
enhance c#
workplace inclusivity
xml parsing
pitch effectively
guitar scales
requirements elicitation

# Giving a Name to each Cluster with Gemini API:

In [None]:

# `time` and `genai` are provided by the notebook's import cell.
model = genai.GenerativeModel('gemini-pro')
# cluster id -> name proposed by the model
clusters = {}
for cluster_id, skills_in_cluster in enumerate(clustered_skills):
    prompt = "name this skill cluster, give unique name:" + str(skills_in_cluster)
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            # Only one candidate for now.
            candidate_count=1,
            temperature=0.7
        )
    )
    time.sleep(1.5)  # pause between consecutive API requests
    try:
        clusters[cluster_id] = response.parts[0].text
    except (ValueError, IndexError):
        # No usable part in the reply (e.g. blocked prompt): keep a placeholder
        # name so one bad answer does not abort the whole naming loop.
        # NOTE(review): confirm the exception raised by the installed
        # google-generativeai version for empty/blocked replies.
        clusters[cluster_id] = f"Cluster {cluster_id}"


Handling Duplicate Names (if any exist):

In [None]:
def print_duplicates(lst):
    """Request a new name for every skill cluster whose name repeats an earlier one.

    Parameters
    ----------
    lst : list
        Cluster names; position ``i`` is taken to be the name of cluster ``i``.

    Side effects
    ------------
    For each repeated name the model is asked once for a different name and
    the answer is stored in the module-level ``clusters`` dict under that
    position. Reads the module-level ``clustered_skills`` and ``model``.
    Nothing happens when there are no duplicates.
    """
    seen = set()
    duplicates = []  # (name, index) of every repeat, in order of appearance

    for index, item in enumerate(lst):
        if item in seen:
            duplicates.append((item, index))
        else:
            seen.add(item)

    # The request and the assignment belong INSIDE the loop: previously they
    # ran once after it, so only one duplicate was renamed and an input
    # without duplicates raised NameError.
    for name, index in duplicates:
        prompt = f"give this skill cluster: {clustered_skills[index]}, a different name than:" + str(name)
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                # Only one candidate for now.
                candidate_count=1,
                temperature=0.7
            )
        )
        time.sleep(1.5)  # pause between consecutive API requests
        clusters[index] = response.parts[0].text



# Rename skill clusters whose generated name repeats an earlier one; the list
# position of each name is used as the cluster id inside print_duplicates.

print_duplicates(list(clusters.values()))




In [None]:
# Remove every asterisk from the generated cluster names.
clusters = {key: val.replace('*', '') for key, val in clusters.items()}


149


In [None]:
# Inspect the redistributed clusters and the cluster-id -> name mapping.
print(copy_list)
print(clusters)

[['parsing protobuf messages', 'sending messages', 'rigging', 'programmatic advertising', 'unlearn silence', 'servlets', 'facial retouching', 'rpm distribution', 'merge conflict resolution', 'conflict resolution', 'bias mitigation', 'logical reasoning', 'attributes', 'asking questions', 'organizational alignment', 'foster inclusivity', 'uncovering unconscious bias', 'generating ideas', 'mindset', 'bookkeeping', 'laser cutting', 'messaging with jms', 'focus', 'lombok annotations', 'linear algebra', 'enhanced concentration', 'trauma-informed care', 'recognize hazards', 'cinema 4d', 'vba', 'argumentation', 'e-commerce', 'mental fortitude', 'procurement', 'geometric dimensioning', 'sourcing candidates', 'dimensioning and tolerancing', 'identify root causes', 'camera settings', 'ssh tunneling', 'communicating over rpc', 'sketching', 'values alignment', 'schematic capture', 'goal achievement', 'photoshop rendering', 'exploit vulnerabilities', 'websockets', 'authorization', 'habit formation',

In [None]:
# Map each generated cluster name to the skills of that cluster.
cluster_dict = {}
for index, cluster in enumerate(copy_list):
    name = clusters[index]
    # Generated names can still collide; suffix the cluster id in that case so
    # a later cluster does not silently overwrite an earlier one.
    if name in cluster_dict:
        name = f"{name} ({index})"
    cluster_dict[name] = cluster

print(cluster_dict)

{'Technical Proficiency': ['parsing protobuf messages', 'sending messages', 'rigging', 'programmatic advertising', 'unlearn silence', 'servlets', 'facial retouching', 'rpm distribution', 'merge conflict resolution', 'conflict resolution', 'bias mitigation', 'logical reasoning', 'attributes', 'asking questions', 'organizational alignment', 'foster inclusivity', 'uncovering unconscious bias', 'generating ideas', 'mindset', 'bookkeeping', 'laser cutting', 'messaging with jms', 'focus', 'lombok annotations', 'linear algebra', 'enhanced concentration', 'trauma-informed care', 'recognize hazards', 'cinema 4d', 'vba', 'argumentation', 'e-commerce', 'mental fortitude', 'procurement', 'geometric dimensioning', 'sourcing candidates', 'dimensioning and tolerancing', 'identify root causes', 'camera settings', 'ssh tunneling', 'communicating over rpc', 'sketching', 'values alignment', 'schematic capture', 'goal achievement', 'photoshop rendering', 'exploit vulnerabilities', 'websockets', 'authoriza

Saving Clusters to CSV:

In [None]:
def dict_to_csv(dictionary, filename):
    """Write a mapping of column name -> list of values to ``filename`` as CSV.

    Each key becomes one column. Lists of different lengths are allowed:
    wrapping every list in a ``pd.Series`` lets pandas pad the shorter columns
    with missing values, so no manual length handling is needed. An empty
    mapping produces an empty frame instead of raising.
    """
    df = pd.DataFrame({k: pd.Series(v) for k, v in dictionary.items()})
    df.to_csv(filename, index=False)

# Persist the named skill clusters: one CSV column per cluster name.
dict_to_csv(cluster_dict,'all_clustered_skills.csv')

Downloading Clusters from CSV:

In [None]:

# Load CSV file into DataFrame
df = pd.read_csv('all_clustered_skills.csv')

# One [cluster name, members] pair per CSV column; missing cells (the padding
# of shorter columns) are dropped from the member lists.
num_clusters = len(df.columns)
clustered_skills = [
    [column_name, [x for x in df[column_name].tolist() if not pd.isna(x)]]
    for column_name in df.columns
]
# Create a dictionary to store clusters
clustered_dict = dict((name, members) for name, members in clustered_skills)
print(len(clustered_dict))

In [None]:

# Invert the cluster mapping: each skill points to the name of its cluster
# (a skill listed under several clusters keeps the last one seen).
skill_dict = {value: key for key, values in clustered_dict.items() for value in values}

print(skill_dict)
print(skill_dict["data strategy"])

{'parsing protobuf messages': 'Technical Proficiency', 'sending messages': 'Technical Proficiency', 'rigging': 'Technical Proficiency', 'programmatic advertising': 'Technical Proficiency', 'unlearn silence': 'Technical Proficiency', 'servlets': 'Technical Proficiency', 'facial retouching': 'Technical Proficiency', 'rpm distribution': 'Technical Proficiency', 'merge conflict resolution': 'Technical Proficiency', 'conflict resolution': 'Technical Proficiency', 'bias mitigation': 'Technical Proficiency', 'logical reasoning': 'Technical Proficiency', 'attributes': 'Technical Proficiency', 'asking questions': 'Technical Proficiency', 'organizational alignment': 'Technical Proficiency', 'foster inclusivity': 'Technical Proficiency', 'uncovering unconscious bias': 'Technical Proficiency', 'generating ideas': 'Technical Proficiency', 'mindset': 'Technical Proficiency', 'bookkeeping': 'Technical Proficiency', 'laser cutting': 'Technical Proficiency', 'messaging with jms': 'Technical Proficiency

# Cleaning the Degree Field

In [None]:

def split_and_clean_string(string):
    """Clean the text of a stringified list and parse it back into a Python object.

    The input is lower-cased, all digits are removed, and the substrings
    'gpa', '%', backslash, '.', '/', ':', '*' and '&' are deleted. The cleaned
    text is then parsed as a Python literal.

    Parameters
    ----------
    string : str
        Text of a Python literal, e.g. ``"['Computer Science', 'EE 101']"``.

    Returns
    -------
    object
        The parsed literal (a list of strings for the input shape above).

    Raises
    ------
    ValueError, SyntaxError
        If the cleaned text is not a valid Python literal.
    """
    string = string.lower()
    string = re.sub(r'\d', '', string)
    for token in ('gpa', '%', '\\', '.', '/', ':', '*', '&'):
        string = string.replace(token, '')
    # SECURITY: the text originates from scraped profiles, so it must not be
    # executed. literal_eval accepts literals only, whereas eval would run any
    # expression embedded in the data.
    return ast.literal_eval(string)

# Clean and parse every stringified degree list in the column.
model_df['degree_field'] = model_df['degree_field'].apply(split_and_clean_string)

In [None]:
# NOTE(review): this file is written by a later cell of this notebook; confirm
# it exists before running this cell on a fresh kernel.
df = pd.read_csv('all_clustered_degree.csv')

# One [cluster name, members] pair per CSV column; missing cells (the padding
# of shorter columns) are dropped from the member lists.
num_clusters = len(df.columns)
clustered_degree = [
    [column_name, [x for x in df[column_name].tolist() if not pd.isna(x)]]
    for column_name in df.columns
]
# Create a dictionary to store clusters
clustered_degree_dict = dict((name, members) for name, members in clustered_degree)
print(len(clustered_degree_dict))

In [None]:
# Invert the cluster mapping: each degree field points to its cluster name.
degree_dict = {value: key for key, values in clustered_degree_dict.items() for value in values}

print(degree_dict["electrical and electronics engineering"])

def transform_list(lst):
    """Replace each entry of ``lst`` by its cluster name from ``degree_dict``; unknown entries pass through unchanged."""
    mapped = []
    for item in lst:
        mapped.append(degree_dict.get(item, item))
    return mapped

# Apply the function to create the new column
model_df['degree_field'] = model_df['degree_field'].apply(transform_list)

# Performing Clustering on Degree Field:

In [None]:

random.seed(42)

deg1 = model_df['degree_field'].tolist()
flat_deg1 = [item for sublist in deg1 for item in sublist]
print(flat_deg1)
# Keep real strings only (the vectorizer cannot process None/NaN) and sort
# them so the input order, and with it the clustering, is the same on every run.
unique_values = sorted({value for value in flat_deg1 if isinstance(value, str)})
print(unique_values)
vectorizer = TfidfVectorizer(max_df=0.4, min_df=2, lowercase=True)
X = vectorizer.fit_transform(unique_values)

# Step 3: Clustering
num_clusters = 150  # Adjust the number of clusters as needed
# random_state pins the k-means initialisation; random.seed() above does not
# influence scikit-learn's random number generation.
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=200, n_init=20, random_state=42)
kmeans.fit(X)
# Step 4: Interpretation and printing clusters
cluster_labels = kmeans.labels_  # Get cluster labels for each data point

clustered_degree = [[] for _ in range(num_clusters)]
for deg, label in zip(unique_values, cluster_labels):
    clustered_degree[label].append(deg)

# Print each cluster separately
for cluster_id, deg_in_cluster in enumerate(clustered_degree):
    print(f"Cluster {cluster_id}:")
    for deg in deg_in_cluster:
        print(deg)

    print()

['business analytics', 'electrical and electronics engineering', 'electrical and electronics engineering', 'mechatronics, robotics, and automation engineering', 'computer science, minor in mathematics', 'management information systems, general', 'computer science', 'information science and engineering', 'electrical and computer engineering', 'mathematics', 'statistics', 'industrial engineering', 'electrical and computer engineering', 'computer science', 'attended as part of the congress-bundestag youth exchange', 'computer speech and language', 'computer science', 'computer science', 'computer science', 'computer engineering', 'electrical enginneering', 'tele-communication engineering', 'theoretical physics', 'theoretical physics', 'e-commerce engineering', 'electrical and electronics engineering', 'electrical engineering', 'logistics technology', 'computer engineering', 'computer science', 'information technology engineering', 'computer science', 'computer science', 'computer science'

Re-Clustering Largest Cluster:

In [None]:
# Find the index of the list with the maximum length
max_index = max(range(len(clustered_degree)), key=lambda i: len(clustered_degree[i]))

# Print the index of the list with the maximum length
print("Index of cluster with maximum length:", max_index)

# numpy, the fitted `vectorizer` and cosine_similarity come from earlier cells.

# Identify the cluster you want to redistribute
cluster_to_redistribute = max_index  # Adjust the cluster ID as needed

# Extract degree fields from the identified cluster
degree_to_redistribute = clustered_degree[cluster_to_redistribute]

# Candidate targets: every other cluster that has at least one member. Empty
# clusters are skipped because a similarity against zero rows is undefined.
candidate_ids = [
    index for index, cluster in enumerate(clustered_degree)
    if index != cluster_to_redistribute and cluster
]
# Vectorize the members of each candidate cluster once.
cluster_vectorized = {
    index: vectorizer.transform(clustered_degree[index]) for index in candidate_ids
}

# Maps each redistributed degree field to the id of its most similar cluster.
degree_transformed_dict = {}
if degree_to_redistribute and candidate_ids:
    # One transform for all entries, then one similarity matrix per cluster,
    # instead of one similarity call per (entry, cluster) pair.
    degree_vectors = vectorizer.transform(degree_to_redistribute)
    mean_similarity = np.column_stack([
        cosine_similarity(degree_vectors, cluster_vectorized[index]).mean(axis=1)
        for index in candidate_ids
    ])
    # argmax returns the first maximum, matching the strict `>` comparison of
    # the per-entry loop this replaces. An entry that scores 0 against every
    # cluster therefore lands in the first candidate cluster.
    best_columns = mean_similarity.argmax(axis=1)
    degree_transformed_dict = {
        deg: candidate_ids[column]
        for deg, column in zip(degree_to_redistribute, best_columns)
    }




Index of cluster with maximum length: 0


In [None]:
# Copy every cluster, not just the outer list: a shallow .copy() shares the
# inner lists, so the appends below would also modify the lists that
# `clustered_degree` held before this cell ran.
copy_list = [list(cluster) for cluster in clustered_degree]
for deg, cluster in degree_transformed_dict.items():
    copy_list[cluster].append(deg)

# Drop the oversized cluster now that its members have been reassigned.
copy_list = [cluster for i, cluster in enumerate(copy_list) if i != max_index]


print(copy_list)

# From here on `clustered_degree` refers to the redistributed clusters.
clustered_degree = copy_list.copy()
for cluster_id, deg_in_cluster in enumerate(clustered_degree):
    print(f"Cluster {cluster_id}:")
    for deg in deg_in_cluster:
        print(deg)

    print()

# Find the index of the list with the maximum length
new_max_index = max(range(len(clustered_degree)), key=lambda i: len(clustered_degree[i]))
print(new_max_index)
print(len(clustered_degree[new_max_index]))

[['electrical and communications engineering', 'broadcast communications', 'strategic marketing communications', 'cisco unified communications', 'electrical, electronic and communications engineering technologytechnician', 'literature and communications', 'network communications and management', 'electrical, electronic and communications engineering', '', 'na', 'spécialité physique', 'physik', 'radiologic technologyscience - radiographer', 'infantry', 'hvacr', 'a+', 'ib', ' ( courses)', 'classics', 'realtime d reconstruction', 'ingenieria en sistemas de informacion', 'certified installer  tuner', 'emergency care attendant (emt ambulance)', 'informatyka', '-', 'visiting student', 'hypokhâgne, khâgne', 'mathsscience', 'cinema', ' ', 'prawo', 'eie', 'apprentissage statistique', 'sociology', 'hs', 'pcmb', 'a', 'criminology', 'medieninformatik', 'paralegal', 'eee', 'registered nursingregistered nurse', 'fire sciencefire-fighting', '電機工程學系', 'private pilot', 'nsw', 'nanomicrofabrication, nan

Naming Degree Clusters with API:

In [None]:
model = genai.GenerativeModel('gemini-pro')
# cluster id -> name proposed by the model
clusters_degree = {}
for cluster_id, deg_in_cluster in enumerate(clustered_degree):
    prompt = "name this degree cluster, give unique name:" + str(deg_in_cluster)
    response = model.generate_content(
        prompt,
        generation_config=genai.types.GenerationConfig(
            # Only one candidate for now.
            candidate_count=1,
            temperature=0.7
        )
    )
    time.sleep(1.5)  # pause between consecutive API requests
    try:
        clusters_degree[cluster_id] = response.parts[0].text
    except (ValueError, IndexError):
        # No usable part in the reply (e.g. blocked prompt): keep a placeholder
        # name so one bad answer does not abort the whole naming loop.
        # NOTE(review): confirm the exception raised by the installed
        # google-generativeai version for empty/blocked replies.
        clusters_degree[cluster_id] = f"Cluster {cluster_id}"


Re-Naming if there are duplicate names:

In [None]:
def print_duplicates(lst):
    """Request a new name for every degree cluster whose name repeats an earlier one.

    Parameters
    ----------
    lst : list
        Cluster names; position ``i`` is taken to be the name of cluster ``i``.

    Side effects
    ------------
    For each repeated name the model is asked once for a different name and
    the answer is stored in the module-level ``clusters_degree`` dict under
    that position. Reads the module-level ``clustered_degree`` and ``model``.
    Nothing happens when there are no duplicates.
    """
    seen = set()
    duplicates = []  # (name, index) of every repeat, in order of appearance

    for index, item in enumerate(lst):
        if item in seen:
            duplicates.append((item, index))
        else:
            seen.add(item)

    # The request and the assignment belong INSIDE the loop: previously they
    # ran once after it, so only one duplicate was renamed and an input
    # without duplicates raised NameError.
    for name, index in duplicates:
        prompt = f"give this degree cluster: {clustered_degree[index]}, a different name than:" + str(name)
        response = model.generate_content(
            prompt,
            generation_config=genai.types.GenerationConfig(
                # Only one candidate for now.
                candidate_count=1,
                temperature=0.7
            )
        )
        time.sleep(1.5)  # pause between consecutive API requests
        clusters_degree[index] = response.parts[0].text





# Rename degree clusters whose generated name repeats an earlier one.
print_duplicates(list(clusters_degree.values()))


# Remove every asterisk from the generated names.
for key,val in clusters_degree.items():
    clusters_degree[key]=val.replace('*', '')

# Map each cluster name to its degree fields.
clustered_degree_dict = {}
for index, cluster in enumerate(copy_list):
    name = clusters_degree[index]
    # Generated names can still collide; suffix the cluster id in that case so
    # a later cluster does not silently overwrite an earlier one.
    if name in clustered_degree_dict:
        name = f"{name} ({index})"
    clustered_degree_dict[name] = cluster


# Reverse lookup: degree field -> cluster name.
degree_dict = {}
for key, values in clustered_degree_dict.items():
    for value in values:
        degree_dict[value] = key

Saving Degree Clusters to CSV:

In [None]:
def dict_to_csv(dictionary, filename):
    """Write a mapping of column name -> list of values to ``filename`` as CSV.

    Each key becomes one column. Lists of different lengths are allowed:
    wrapping every list in a ``pd.Series`` lets pandas pad the shorter columns
    with missing values, so no manual length handling is needed. An empty
    mapping produces an empty frame instead of raising.
    """
    df = pd.DataFrame({k: pd.Series(v) for k, v in dictionary.items()})
    df.to_csv(filename, index=False)

# Persist the named degree clusters: one CSV column per cluster name.
dict_to_csv(clustered_degree_dict,'all_clustered_degree.csv')


Loading CSV to Dict:

In [None]:

# Load CSV file into DataFrame
df = pd.read_csv('all_clustered_degree.csv')

# One [cluster name, members] pair per CSV column; missing cells (the padding
# of shorter columns) are dropped from the member lists.
num_clusters = len(df.columns)
clustered_degree = [
    [column_name, [x for x in df[column_name].tolist() if not pd.isna(x)]]
    for column_name in df.columns
]
# Create a dictionary to store clusters
clustered_degree_dict = dict((name, members) for name, members in clustered_degree)
print(len(clustered_degree_dict))

Mapping Degree Fields:

In [None]:
# Invert the cluster mapping: each degree field points to its cluster name.
degree_dict = {value: key for key, values in clustered_degree_dict.items() for value in values}

def transform_list(lst):
    """Replace each entry of ``lst`` by its cluster name from ``degree_dict``; unknown entries pass through unchanged."""
    mapped = []
    for item in lst:
        mapped.append(degree_dict.get(item, item))
    return mapped

# Apply the function to create the new column
model_df['degree_field'] = model_df['degree_field'].apply(transform_list)

Mapping skills to their skill-cluster names:

In [None]:
def map_skills(skill_string):
    """Map a user's skills to the names of their skill clusters.

    Parameters
    ----------
    skill_string : list, tuple, str, or missing value
        * list/tuple of skills (the shape stored in ``model_df['skills']``):
          every string entry is looked up; ``None``/NaN entries are skipped.
        * str: legacy behaviour, the text is split on single spaces and every
          second token (positions 1, 3, ...) is treated as a skill.
        * anything else (e.g. a missing value): treated as no skills.

    Returns
    -------
    list
        For each skill, its cluster name from the module-level ``skill_dict``
        or the skill itself when it is not in the dictionary. Empty results
        and results containing ':' are left out.
    """
    if isinstance(skill_string, str):
        skill_list = skill_string.split(' ')[1::2]
    elif isinstance(skill_string, (list, tuple)):
        # Previously a list argument raised AttributeError on `.split`.
        skill_list = list(skill_string)
    else:
        skill_list = []

    mapped_skills = []
    for skill in skill_list:
        if not isinstance(skill, str):  # None or NaN placeholder
            continue
        mapped_skill = skill_dict.get(skill, skill)
        if mapped_skill and ":" not in mapped_skill:
            mapped_skills.append(mapped_skill)
    return mapped_skills

# Replace the "skills" column with the cluster names returned by map_skills
# (the column is overwritten in place, no new column is created).
model_df['skills'] = model_df['skills'].apply(map_skills)


cleaning courses_title

In [None]:
# Words removed from course titles (compared case-insensitively).
stop_words = ["the","of",'to',"and","for","in","i","ii","iii","ap","&","advanced"]

def remove_stop_words(sentence):
    """Return ``sentence`` without the words listed in ``stop_words``; remaining words are joined by single spaces."""
    kept_words = [word for word in sentence.split() if word.lower() not in stop_words]
    return ' '.join(kept_words)


def preprocess_string(string):
    """Turn a (stringified) list of course titles into cleaned, lower-cased titles.

    Parameters
    ----------
    string : str, list, or tuple
        Usually the text of a Python list as it comes back from CSV, e.g.
        ``"['Intro to CS', 'Advanced Math II']"``. An actual list/tuple is
        accepted as well.

    Returns
    -------
    list of str
        One entry per title: lower-cased, apostrophes removed, surrounding
        whitespace stripped and stop words dropped. An empty list gives ``[]``.

    Notes
    -----
    The text is parsed as a Python literal, so a comma inside a title no
    longer splits it and quote characters around titles are not left in the
    result. If the text is not a valid literal, the previous comma-split
    behaviour is used as a fallback.
    """
    if isinstance(string, (list, tuple)):
        titles = list(string)
    else:
        try:
            parsed = ast.literal_eval(string)
            titles = list(parsed) if isinstance(parsed, (list, tuple)) else [parsed]
        except (ValueError, SyntaxError):
            # Not a valid list literal: fall back to the plain comma split.
            titles = string.replace("'", "").strip("[]").split(",")

    processed_list = []
    for title in titles:
        text = str(title).lower().replace("'", "").strip()
        processed_list.append(remove_stop_words(text))
    return processed_list

# Replace every stringified course list in "courses_title" with its cleaned
# list of titles.

model_df['courses_title'] = model_df['courses_title'].apply(preprocess_string)
