In [16]:
import pandas as pd

In [17]:
df = pd.read_csv('job_descriptions.csv')

In [18]:
df.head(5)

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


In [19]:
df.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [20]:
df.isnull().sum()

Job Id                 0
Experience             0
Qualifications         0
Salary Range           0
location               0
Country                0
latitude               0
longitude              0
Work Type              0
Company Size           0
Job Posting Date       0
Preference             0
Contact Person         0
Contact                0
Job Title              0
Role                   0
Job Portal             0
Job Description        0
Benefits               0
skills                 0
Responsibilities       0
Company                0
Company Profile     5478
dtype: int64

In [21]:
df.drop(columns=['Company Profile'],inplace=True)
df.drop(columns=['Contact'],inplace=True)
df.drop(columns=['Contact Person'],inplace=True)
df.drop(columns=['Job Id'],inplace=True)
df.drop(columns=['Job Posting Date'],inplace=True)
columns_to_drop = ['latitude', 'longitude', 'Job Portal']
df.drop(columns=columns_to_drop, inplace=True)

In [7]:
df.columns

Index(['Experience', 'Qualifications', 'Salary Range', 'location', 'Country',
       'Work Type', 'Company Size', 'Preference', 'Job Title', 'Role',
       'Job Description', 'Benefits', 'skills', 'Responsibilities', 'Company'],
      dtype='object')

In [22]:
df[['Min Salary', 'Max Salary']] = df['Salary Range'].str.extract(r'\$?(\d+)[kK]-\$?(\d+)[kK]')

# Convert extracted values to numeric and multiply by 1000
df['Min Salary'] = pd.to_numeric(df['Min Salary']) * 1000
df['Max Salary'] = pd.to_numeric(df['Max Salary']) * 1000

# Optionally, you can create an average salary column
df['Average Salary'] = (df['Min Salary'] + df['Max Salary']) / 2

In [23]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
col = ['Experience', 'Qualifications', 'location', 'Country',
       'Work Type', 'Company Size', 'Preference', 'Job Title', 
       'Role', 'Benefits', 'skills', 
       'Responsibilities', 'Company']

encoding_dict={}

# Loop through each categorical column and apply LabelEncoder
for c in col:
    df[c] = le.fit_transform(df[c].astype(str))  
    encoding_dict[c] = {index: label for index, label in enumerate(le.classes_)}

In [24]:
encoding_dict.keys()

dict_keys(['Experience', 'Qualifications', 'location', 'Country', 'Work Type', 'Company Size', 'Preference', 'Job Title', 'Role', 'Benefits', 'skills', 'Responsibilities', 'Company'])

In [26]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Sample job descriptions
job_descriptions = df['Job Description'].tolist()

# Print original job descriptions
# print("Original Job Descriptions:")
# print(job_descriptions)

def preprocess_text(text):
    # Ensure that the input is a string
    if not isinstance(text, str):
        return ""
    
    # Lowercasing
    text = text.lower()
    
    # Removing non-alphabetical characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization
    tokens = text.split()
    
    # Removing stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Debugging: Print tokens after removing stopwords
    # print(f"Tokens after stopword removal: {tokens}")  # Debugging line
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Return the processed text
    processed_text = ' '.join(tokens)

    # Debugging: Print processed text for each job description
    # print(f"Processed text: '{processed_text}'")  # Debugging line

    return processed_text

# Apply the preprocessing to the 'Job Description' column
job_descriptions = df['Job Description'].apply(preprocess_text).tolist()

# Filter out empty descriptions
job_descriptions = [desc for desc in job_descriptions if desc]  # Keep only non-empty descriptions

# Print the final job descriptions after preprocessing
print("Final Job Descriptions after Preprocessing:")
print(job_descriptions)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



NameError: name 'TfidfVectorizer' is not defined

In [30]:
len(job_descriptions)

1615940

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the job descriptions
if job_descriptions:  # Check if there are any valid job descriptions
    vectorizer = TfidfVectorizer()
    job_vectors = vectorizer.fit_transform(job_descriptions)
else:
    print("No valid job descriptions to vectorize.")

Experience            int64
Qualifications        int64
Salary Range         object
location              int64
Country               int64
Work Type             int64
Company Size          int64
Preference            int64
Job Title             int64
Role                  int64
Job Description       int64
Benefits              int64
skills                int64
Responsibilities      int64
Company               int64
Min Salary            int64
Max Salary            int64
Average Salary      float64
dtype: object

In [67]:
# vectorise all job descriptions
vectorizer = TfidfVectorizer()
vectorized_data = vectorizer.fit_transform(job_descriptions)

In [74]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def match_job_description(user_input):
    print(user_input)
    # Preprocess the user input
    user_input_processed = preprocess_text(user_input)
    
    # Vectorize the user input
    user_vector = vectorizer.transform([user_input_processed])
    # Compute cosine similarity with job descriptions
    similarities = cosine_similarity(user_vector, vectorized_data)
    
    # Get the index of the most similar job description
    most_similar_idx = similarities.argmax()
    print("most_similar_idx", most_similar_idx)
    
    # Return the corresponding job title
    return df['Job Title'].iloc[most_similar_idx]

# Example usage
user_input = "software development"
matched_job = match_job_description(user_input)

print(f"Best matched job: {matched_job}")
print('Job title',encoding_dict['Job Title'][matched_job])

software development
most_similar_idx 180
Best matched job: 110
Job title Quality Assurance Analyst


In [63]:
df['Job Title'][49]

61

In [66]:
encoding_dict['Job Title'][49]

'Front-End Engineer'

In [50]:
encoding_dict.keys()

dict_keys(['Experience', 'Qualifications', 'location', 'Country', 'Work Type', 'Company Size', 'Preference', 'Job Title', 'Role', 'Benefits', 'skills', 'Responsibilities', 'Company'])

In [75]:
df_test = pd.read_csv('job_descriptions.csv')

In [56]:
df_test.head(5)

Unnamed: 0,Job Id,Experience,Qualifications,Salary Range,location,Country,latitude,longitude,Work Type,Company Size,...,Contact,Job Title,Role,Job Portal,Job Description,Benefits,skills,Responsibilities,Company,Company Profile
0,1089843540111562,5 to 15 Years,M.Tech,$59K-$99K,Douglas,Isle of Man,54.2361,-4.5481,Intern,26801,...,001-381-930-7517x737,Digital Marketing Specialist,Social Media Manager,Snagajob,Social Media Managers oversee an organizations...,"{'Flexible Spending Accounts (FSAs), Relocatio...","Social media platforms (e.g., Facebook, Twitte...","Manage and grow social media accounts, create ...",Icahn Enterprises,"{""Sector"":""Diversified"",""Industry"":""Diversifie..."
1,398454096642776,2 to 12 Years,BCA,$56K-$116K,Ashgabat,Turkmenistan,38.9697,59.5563,Intern,100340,...,461-509-4216,Web Developer,Frontend Web Developer,Idealist,Frontend Web Developers design and implement u...,"{'Health Insurance, Retirement Plans, Paid Tim...","HTML, CSS, JavaScript Frontend frameworks (e.g...","Design and code user interfaces for websites, ...",PNC Financial Services Group,"{""Sector"":""Financial Services"",""Industry"":""Com..."
2,481640072963533,0 to 12 Years,PhD,$61K-$104K,Macao,"Macao SAR, China",22.1987,113.5439,Temporary,84525,...,9687619505,Operations Manager,Quality Control Manager,Jobs2Careers,Quality Control Managers establish and enforce...,"{'Legal Assistance, Bonuses and Incentive Prog...",Quality control processes and methodologies St...,Establish and enforce quality control standard...,United Services Automobile Assn.,"{""Sector"":""Insurance"",""Industry"":""Insurance: P..."
3,688192671473044,4 to 11 Years,PhD,$65K-$91K,Porto-Novo,Benin,9.3077,2.3158,Full-Time,129896,...,+1-820-643-5431x47576,Network Engineer,Wireless Network Engineer,FlexJobs,"Wireless Network Engineers design, implement, ...","{'Transportation Benefits, Professional Develo...",Wireless network design and architecture Wi-Fi...,"Design, configure, and optimize wireless netwo...",Hess,"{""Sector"":""Energy"",""Industry"":""Mining, Crude-O..."
4,117057806156508,1 to 12 Years,MBA,$64K-$87K,Santiago,Chile,-35.6751,-71.5429,Intern,53944,...,343.975.4702x9340,Event Manager,Conference Manager,Jobs2Careers,A Conference Manager coordinates and manages c...,"{'Flexible Spending Accounts (FSAs), Relocatio...",Event planning Conference logistics Budget man...,Specialize in conference and convention planni...,Cairn Energy,"{""Sector"":""Energy"",""Industry"":""Energy - Oil & ..."


In [78]:
unique_names = df_test['Job Title'].unique()

In [79]:
len(unique_names)

147

In [82]:
df_test.shape

(1615940, 23)

In [83]:
len(df_test['skills'].unique())

376

In [85]:
# skills = df_test['skills'].tolist()

# Apply the preprocessing to the 'Job Description' column
skills_processed = df_test['skills'].apply(preprocess_text).tolist()

# Filter out empty descriptions
skills_processed = [desc for desc in skills_processed if desc]  # Keep only non-empty descriptions


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [87]:
len(skills_processed)

1615940

In [91]:
vectorizer_skills = TfidfVectorizer()
vectorized_data = vectorizer_skills.fit_transform(skills_processed)

In [95]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def match_skills(user_input):
    print(user_input)
    # Preprocess the user input
    user_input_processed = preprocess_text(user_input)
    
    # Vectorize the user input
    user_vector = vectorizer_skills.transform([user_input_processed])
    # Compute cosine similarity with job descriptions
    similarities = cosine_similarity(user_vector, vectorized_data)
    
    # Get the index of the most similar job description
    most_similar_idx = similarities.argmax()
    print("most_similar_idx", most_similar_idx)
    
    # Return the corresponding job title
    return df['Job Title'].iloc[most_similar_idx]

# Example usage
user_input = "Testing"
matched_job = match_skills(user_input)

print(f"Best matched job: {matched_job}")
print('Job title: ',encoding_dict['Job Title'][matched_job])

Testing
most_similar_idx 10
Best matched job: 108
Job title:  QA Analyst


In [86]:
df_test['skills'].unique()

array(['Social media platforms (e.g., Facebook, Twitter, Instagram) Content creation and scheduling Social media analytics and insights Community engagement Paid social advertising',
       'HTML, CSS, JavaScript Frontend frameworks (e.g., React, Angular) User experience (UX)',
       'Quality control processes and methodologies Statistical process control (SPC) Root cause analysis and corrective action Quality management systems (e.g., ISO 9001) Compliance and regulatory knowledge',
       'Wireless network design and architecture Wi-Fi standards and protocols RF (Radio Frequency) planning and optimization Wireless security protocols Troubleshooting wireless network issues',
       'Event planning Conference logistics Budget management Vendor coordination Marketing and promotion Client relations',
       'Quality assurance processes Testing methodologies (e.g., manual, automated) Bug tracking and reporting Test case development Regression testing',
       'Teaching pedagogy Classroom 