### SMA Lab 5: Clustering the job titles of LinkedIn Connections using a Greedy Heuristic Algorithm

In [1]:
pip install scikit-learn pandas

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd

In [4]:
# Load the LinkedIn connections export; the relative path is resolved against
# the kernel's working directory.
# NOTE(review): `df` is not used by the clustering cells visible in this
# notebook, which run on the hard-coded `linkedin_connections` sample instead.
df = pd.read_csv('JConnections.csv')

In [8]:
# Preview the first rows of the loaded connections.
# NOTE(review): the rendered preview shows real names, profile URLs and an
# email address; consider clearing this output before sharing the notebook.
df.head()

Unnamed: 0,First Name,Last Name,URL,Unnamed: 3,Company,Position,Connected On
0,ABISHEK,N,https://www.linkedin.com/in/abishek-n-687602218,,,,27-Jul-23
1,quini,inisha,https://www.linkedin.com/in/quini-inisha-98156...,,,,27-Jul-23
2,Annapoornima,S,https://www.linkedin.com/in/annapoornima-s-5b5...,,,,24-Jul-23
3,Raj,Ga,https://www.linkedin.com/in/raj-ga-326979137,,IBM,Systems Engineer,22-Jul-23
4,Alagar,raja,https://www.linkedin.com/in/alagar-raja001,alagarraja9894@gmail.com,KRISH COMPUTER,System Operator,20-Jul-23


In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Hard-coded sample of LinkedIn connections' job titles used as the clustering
# input. Duplicates are kept on purpose: greedy_clustering visits titles in
# order of how often they occur in this list.
linkedin_connections = [
    "Software Engineer",
    "Data Analyst",
    "Product Manager",
    "Software Developer",
    "Data Scientist",
    "Software Engineer",
    "Data Engineer",
    "Product Manager",
    "Data Analyst",
    "Data Scientist",
    "Product Manager",
    "Software Engineer",
    "Data Engineer",
    "Data Scientist"
]

def calculate_cosine_similarity(titles):
    """Return the pairwise cosine similarity of the TF-IDF vectors of ``titles``.

    Each title is treated as one document. Entry ``[i, j]`` of the result is
    the similarity between ``titles[i]`` and ``titles[j]``.
    """
    # Fit the vocabulary and vectorise the titles in one step.
    title_vectors = TfidfVectorizer().fit_transform(titles)
    # Compare every title vector against every other one (and itself).
    return cosine_similarity(title_vectors, title_vectors)



#### Greedy Heuristic Algorithm for clustering job titles

In [10]:

def greedy_clustering(titles, threshold=0.5):
    """Greedily group job titles by TF-IDF cosine similarity.

    Distinct titles are visited from most to least frequent. Each title joins
    the first existing cluster whose mean similarity to it is at least
    ``threshold``; if no cluster qualifies, the title starts a new cluster.

    Parameters
    ----------
    titles : iterable of str
        Job titles, duplicates allowed. Missing entries (None/NaN) are skipped.
    threshold : float, default 0.5
        Minimum mean cosine similarity required to join an existing cluster.

    Returns
    -------
    list of list of str
        Clusters of distinct titles, in the order the clusters were created.
        Empty when there are no usable titles.
    """
    # Materialise the input so any iterable works (e.g. a DataFrame column),
    # and drop missing values, which the vectorizer rejects.
    titles = [title for title in titles if pd.notna(title)]
    if not titles:
        # Nothing to vectorise; fitting TF-IDF on no documents would raise.
        return []

    similarity_matrix = calculate_cosine_similarity(titles)

    # Map each distinct title to the position of its first occurrence: the
    # same row/column that list.index() would pick, without rescanning the
    # list for every comparison.
    first_position = {}
    for position, title in enumerate(titles):
        first_position.setdefault(title, position)

    # Sort titles by popularity (number of occurrences), most frequent first.
    sorted_titles = pd.Series(titles).value_counts().index.tolist()

    clusters = []
    for title in sorted_titles:
        row = first_position[title]

        for cluster in clusters:
            # Mean similarity between this title and the cluster's members.
            columns = [first_position[member] for member in cluster]
            if similarity_matrix[row, columns].mean() >= threshold:
                cluster.append(title)
                break
        else:
            # Not similar enough to any existing cluster: start a new one.
            clusters.append([title])

    return clusters

# Example usage: cluster the sample titles with a looser threshold (0.4) than
# the function default (0.5), then print each cluster with a 1-based number.
clusters = greedy_clustering(linkedin_connections, threshold=0.4)
for i, cluster in enumerate(clusters):
    print(f"Cluster {i + 1}: {cluster}")


Cluster 1: ['Software Engineer', 'Data Engineer']
Cluster 2: ['Product Manager']
Cluster 3: ['Data Scientist']
Cluster 4: ['Data Analyst']
Cluster 5: ['Software Developer']
