In [24]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [25]:
# Read the job-role table from the raw-data folder (path is relative to this notebook).
dataset_csv_path = '../../data/raw/revisioned_first_set.csv'
dataset = pd.read_csv(dataset_csv_path)

In [26]:
def _parse_skills(raw_skills):
    """Turn one 'Skills Required' cell into a clean list of skill names.

    Accepts the raw comma-separated string read from the CSV, or an
    already-parsed list/tuple so that re-running this cell leaves the column
    intact. Any other value (e.g. a missing entry) yields an empty list, which
    keeps the later ``', '.join(...)`` calls from failing.
    """
    if isinstance(raw_skills, str):
        parts = raw_skills.split(',')
    elif isinstance(raw_skills, (list, tuple)):
        parts = raw_skills
    else:
        return []
    # Strip the padding around each skill and drop the empty entries that a
    # trailing comma or doubled separator would otherwise leave behind.
    return [part.strip() for part in parts if isinstance(part, str) and part.strip()]


# Replace the raw string column with a list of cleaned skill names per row.
dataset['Skills Required'] = dataset['Skills Required'].apply(_parse_skills)

In [27]:
# Learn a TF-IDF vocabulary from the job titles and vectorise every title;
# the fitted vectoriser is reused later to project the user's query.
job_role_titles = dataset['Job Role Title']
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(job_role_titles)

In [28]:
# Strip surrounding whitespace so a stray space typed before or after the role
# does not defeat the exact-match comparison below.
user_input = input("Enter a job role: ").strip()

# Case-insensitive comparison against the titles, ignoring padding around them.
normalized_titles = dataset['Job Role Title'].str.strip().str.lower()
exact_match = dataset[normalized_titles == user_input.lower()]

In [29]:
# Report the skills for the requested role: exact title matches first,
# otherwise fall back to TF-IDF cosine similarity against every title.
if not exact_match.empty:
    print("Exact match found!")
    # Titles are not unique in the dataset (the recorded run lists
    # "Data Analyst" twice with different skills), so show every matching row
    # rather than only the first one.
    for _, matched_row in exact_match.iterrows():
        print(f"Job Role: {matched_row['Job Role Title']}")
        print(f"Skills: {', '.join(matched_row['Skills Required'])}")
else:
    # Project the query into the vocabulary learned from the job titles.
    user_input_vector = tfidf_vectorizer.transform([user_input])
    # Similarity of the single query vector to each row of the title matrix.
    similarities = cosine_similarity(user_input_vector, tfidf_matrix)
    # Minimum cosine similarity for a title to be reported as similar.
    threshold = 0.2

    similar_roles = dataset[similarities[0] > threshold]

    if not similar_roles.empty:
        print("Similar job roles found:")
        for index, row in similar_roles.iterrows():
            print(f"Job Role: {row['Job Role Title']}")
            print(f"Skills: {', '.join(row['Skills Required'])} \n")
    else:
        print("No matching job roles found.")

Similar job roles found:
Job Role: Data Analyst
Skills: SQL,  Excel,  Data Visualization,  Statistics,  Data Cleaning,  Tableau,  Power BI 

Job Role: Business Intelligence Analyst
Skills: SQL,  Tableau,  Power BI,  Data Visualization,  Business Analysis,  Data Reporting 

Job Role: Senior Data Analyst
Skills: SQL,  Excel,  Tableau,  Data Visualization,  Data Cleaning,  Data Analysis 

Job Role: Big Data Analyst
Skills: Hadoop,  Spark,  Python,  SQL,  Data Visualization,  ETL 

Job Role: Data Analyst II
Skills: SQL,  Excel,  Tableau,  Power BI,  Data Visualization,  Data Cleaning 

Job Role: Data Analyst
Skills: SQL,  Excel,  Data Visualization,  Statistical Analysis,  Python,  R,  Tableau,  Power BI,  Data Cleaning 

Job Role: Big Data Analyst
Skills: Data Analysis,  Hadoop,  Spark,  Python,  SQL,  Data Visualization,  Data Cleaning,  Statistical Analysis 

Job Role: Data Science Intern
Skills: Python,  R,  Data Analysis,  Statistical Analysis,  Data Visualization,  Machine Learning, 

In the above code, we used several concepts and libraries to find job roles that match the user's input and to display the skills required for each matching role. Here are the key concepts and libraries used:

1. **Pandas:** We used the Pandas library to load and manipulate the dataset stored in a CSV file. Pandas provides data structures and functions for data analysis and manipulation, making it easy to work with tabular data.

2. **NLP (Natural Language Processing):** NLP concepts are used for tokenizing and processing text data. In particular, `TfidfVectorizer` tokenizes the job role titles and the user input into words, and the comma-separated skills string is split into individual skills.

3. **TF-IDF (Term Frequency-Inverse Document Frequency):** We used TF-IDF vectorization to represent the job roles as numerical vectors. This technique is commonly used in text mining and information retrieval to convert text data into a numerical format for similarity calculations.

4. **Cosine Similarity:** Cosine similarity is a metric used to determine how similar two vectors are in a high-dimensional space. We calculated the cosine similarity between the TF-IDF vector of the user input and the vector of each job role title to identify similar job roles based on their textual content.

5. **Data Preprocessing:** We preprocessed the dataset by splitting the skills in each job role into a list. This allowed us to work with individual skills for each role.

6. **Matching and Filtering:** We first look for an exact, case-insensitive match between the user input and a job role title. Only if no exact match exists do we fall back to similarity-based matching, in which job roles that share words with the user input receive a higher similarity score.

7. **Thresholding:** We applied a similarity threshold (0.2) and kept only the job roles whose cosine similarity to the user input exceeds it.

8. **Data Structures:** We used a Pandas DataFrame to hold the job roles and stored each role's skills as a Python list. The earlier prototype (kept commented out in the last cell) used dictionaries to map job roles to their corresponding skills, and sets to combine and store unique skills.

9. **Looping and Iteration:** We used a loop to iterate through the similar job roles and print each one. The earlier prototype also looped over job roles and their similarities, and over the tokens in the user input.

10. **Conditional Statements:** We used conditional statements to choose between the exact-match and similarity-based paths, and to report when no matching job roles are found.

These concepts and libraries were combined to create a program that takes user input, finds an exactly matching job role or, failing that, similar roles, and displays the skills for those roles.

In [30]:
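# NOTE: Earlier prototype, kept commented out for reference only; nothing below runs.
# It matched roles by shared tokens and merged the skills of similar roles into one set.
# import pandas as pd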
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import linear_kernel

# dataset = pd.read_csv('../../data/raw/revisioned_first_set.csv')

# job_roles = dataset['Job Role Title'].tolist()
# skills = dataset['Skills Required'].tolist()

# job_skill_dict = {role: skill.split(", ") for role, skill in zip(job_roles, skills)}

# tfidf_vectorizer = TfidfVectorizer()
# tfidf_matrix = tfidf_vectorizer.fit_transform(job_roles)
# cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# similarity_threshold = 0.6

# similar_roles = {}
# for i, role in enumerate(job_roles):
#     similar_roles[role] = [job_roles[j] for j, sim in enumerate(cosine_sim[i]) if i != j and sim >= similarity_threshold]

# combined_skills = {}
# for role, similar in similar_roles.items():
#     combined_skills[role] = set(job_skill_dict[role])
#     for similar_role in similar:
#         combined_skills[role].update(job_skill_dict[similar_role])

# # user_input = "Data Scientist"
# user_input = input("Enter your job role: ")

# user_tokens = user_input.split()

# matched_roles = []

# for role in job_roles:
#     role_tokens = role.split()
#     for token in user_tokens:
#         if token in role_tokens:
#             matched_roles.append(role)
#             break

# if matched_roles:
#     for role in matched_roles:
#         print(f"Job Role: {role}")
#         print(f"Combined Skills: {', '.join(combined_skills[role])} \n")
# else:
#     print("No matching job roles found.")
