In [9]:
import pandas as pd

In [10]:
# Load the job-postings dataset from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='job_descriptions.csv')

In [11]:
# Display the column labels of the freshly loaded frame (before any columns are dropped).
df.columns

Index(['Job Id', 'Experience', 'Qualifications', 'Salary Range', 'location',
       'Country', 'latitude', 'longitude', 'Work Type', 'Company Size',
       'Job Posting Date', 'Preference', 'Contact Person', 'Contact',
       'Job Title', 'Role', 'Job Portal', 'Job Description', 'Benefits',
       'skills', 'Responsibilities', 'Company', 'Company Profile'],
      dtype='object')

In [12]:
# Columns that are not used by the matching logic below; removed in a single
# in-place drop instead of one call per column.
columns_to_drop = ['latitude', 'longitude', 'Job Portal']
df.drop(
    columns=[
        'Company Profile',
        'Contact',
        'Contact Person',
        'Job Id',
        'Job Posting Date',
        *columns_to_drop,
    ],
    inplace=True,
)

In [13]:
# Pull the two numeric bounds out of strings shaped like "$59K-$99K".
# Rows that do not match the pattern yield NaN in both captured groups.
salary_bounds = df['Salary Range'].str.extract(r'\$?(\d+)[kK]-\$?(\d+)[kK]')

# The captured groups are strings expressed in thousands; convert to numbers
# and scale by 1000.
df['Min Salary'] = pd.to_numeric(salary_bounds[0]) * 1000
df['Max Salary'] = pd.to_numeric(salary_bounds[1]) * 1000

# Midpoint of the range.
df['Average Salary'] = (df['Min Salary'] + df['Max Salary']) / 2

In [49]:
# from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# col = ['Experience', 'Qualifications', 'location', 'Country',
#        'Work Type', 'Company Size', 'Preference', 'Job Title', 
#        'Role', 'Benefits', 
#        'Responsibilities', 'Company']

# encoding_dict={}

# # Loop through each categorical column and apply LabelEncoder
# for c in col:
#     df[c] = le.fit_transform(df[c].astype(str))  
#     encoding_dict[c] = {index: label for index, label in enumerate(le.classes_)}

In [14]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Raw (unprocessed) job descriptions as a plain Python list.
# NOTE: this name is reassigned to the preprocessed list further down in this cell.
job_descriptions = df['Job Description'].to_list()

# Print original job descriptions
# print("Original Job Descriptions:")
# print(job_descriptions)

# Matches every character that is neither an ASCII letter nor whitespace.
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')

# Lazily populated holder for the NLTK resources shared by all calls.
_nlp_cache = {}


def _get_nlp_resources():
    """Return ``(stop_words, lemmatizer)``, building both on first use only."""
    if not _nlp_cache:
        # Build both before storing either, so a failure cannot leave the
        # cache half-populated.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _nlp_cache.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _nlp_cache['stop_words'], _nlp_cache['lemmatizer']


def preprocess_text(text):
    """Normalise free text into a space-joined string of lemmatised tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace (so "front-end" becomes "frontend"), split on whitespace,
    drop English stop words, lemmatise each remaining token.

    The stop-word set and the lemmatizer are created once and reused; the
    function is applied row-by-row to whole DataFrame columns, so rebuilding
    them on every call is pure overhead.

    Args:
        text: The text to clean. Any non-``str`` value (e.g. NaN from a
            missing cell) is treated as empty.

    Returns:
        The cleaned text, or ``""`` when the input is not a string or no
        tokens survive.
    """
    # Non-string input short-circuits before any NLTK resource is touched.
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_nlp_resources()

    cleaned = _NON_LETTER_PATTERN.sub('', text.lower())

    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in cleaned.split()
        if token not in stop_words
    )

# Apply the preprocessing to the 'Job Description' column.
# The list is deliberately kept at one entry per df row, in row order: the
# position of a description in this list (and of its row in any matrix built
# from the list) is later used with df.iloc to look up the job title.
# Removing empty entries here would shift those positions and make the lookup
# return the wrong row, so descriptions that end up empty are kept as "".
job_descriptions = df['Job Description'].apply(preprocess_text).tolist()

# Print how many descriptions still contain text after preprocessing
print("Final Job Descriptions after Preprocessing:")
print(sum(1 for desc in job_descriptions if desc))

Final Job Descriptions after Preprocessing:
1615940


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Learn a TF-IDF vocabulary from the preprocessed job descriptions and encode
# them as a document-term matrix (one row per entry of job_descriptions).
vectorizer = TfidfVectorizer()
vectorized_data = vectorizer.fit_transform(raw_documents=job_descriptions)

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def match_job_description(user_input):
    """Return the job title whose description is most similar to ``user_input``.

    The query is cleaned with ``preprocess_text``, encoded with the
    module-level ``vectorizer`` and compared by cosine similarity against the
    module-level ``vectorized_data`` matrix. The winning row position is then
    looked up in ``df['Job Title']``.

    NOTE: ``vectorized_data`` is read at call time and must be the matrix
    produced by ``vectorizer``. The same name is rebound to the skills matrix
    later in this notebook; calling this function after that cell has run
    compares against the wrong matrix.

    Args:
        user_input: Free-text query describing the desired job.

    Returns:
        The matching job title, or ``None`` when the query shares no
        vocabulary with any description (every similarity is 0).
    """
    print(user_input)
    # Preprocess the user input the same way the corpus was preprocessed
    user_input_processed = preprocess_text(user_input)

    # Vectorize the user input
    user_vector = vectorizer.transform([user_input_processed])
    # Compute cosine similarity with job descriptions
    similarities = cosine_similarity(user_vector, vectorized_data)

    # Get the index of the most similar job description
    most_similar_idx = similarities.argmax()

    # When nothing overlaps, every score is 0 and argmax falls back to
    # position 0; reporting that row as the "best match" would be misleading.
    if similarities.max() <= 0:
        return None

    print("most_similar_idx", most_similar_idx)

    # Return the corresponding job title
    return df['Job Title'].iloc[most_similar_idx]

# Demo: look up the closest job title for a free-text description query.
user_input = "front-end react developer"
matched_job = match_job_description(user_input=user_input)

print(f"Best matched job: {matched_job}")

front-end react developer
most_similar_idx 301
Best matched job: Front-End Engineer


In [34]:
# Number of distinct raw values in the 'skills' column.
skills = df['skills'].unique()
print(len(skills))

# Apply the preprocessing to the 'skills' column.
# One entry per df row is kept, in row order: positions in this list (and in
# any matrix built from it) are later used with df.iloc to fetch the job
# title. Filtering out empty strings would shift those positions and make the
# lookup return the wrong row, so entries that end up empty stay as "".
skills_processed = df['skills'].apply(preprocess_text).tolist()


376


In [35]:
# Display how many preprocessed skill strings there are.
len(skills_processed)

1615940

In [36]:
# Fit a separate TF-IDF vocabulary on the preprocessed skills text.
# NOTE(review): this rebinds `vectorized_data`, which previously held the
# job-description matrix; code that pairs `vectorizer` with `vectorized_data`
# will see the skills matrix after this cell runs.
vectorizer_skills = TfidfVectorizer()
vectorized_data = vectorizer_skills.fit_transform(raw_documents=skills_processed)

In [42]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

def match_skills(user_input):
    """Return the job title whose skills text is most similar to ``user_input``.

    The query is cleaned with ``preprocess_text``, encoded with the
    module-level ``vectorizer_skills`` and compared by cosine similarity
    against the module-level ``vectorized_data`` matrix. The winning row
    position is then looked up in ``df['Job Title']``.

    Args:
        user_input: Free-text query describing skills.

    Returns:
        The matching job title, or ``None`` when the query shares no
        vocabulary with any skills entry (every similarity is 0).
    """
    print(user_input)
    # Preprocess the user input the same way the skills text was preprocessed
    user_input_processed = preprocess_text(user_input)

    # Vectorize the user input
    user_vector = vectorizer_skills.transform([user_input_processed])
    # Compute cosine similarity with the skills entries
    similarities = cosine_similarity(user_vector, vectorized_data)

    # Get the index of the most similar skills entry
    most_similar_idx = similarities.argmax()

    # When nothing overlaps, every score is 0 and argmax falls back to
    # position 0; reporting that row as the "best match" would be misleading.
    if similarities.max() <= 0:
        return None

    print("most_similar_idx", most_similar_idx)

    # Return the corresponding job title
    return df['Job Title'].iloc[most_similar_idx]

# Demo: look up the closest job title for a free-text skills query.
user_input = "web development"
matched_job = match_skills(user_input=user_input)

# The result is reported twice, in two slightly different formats.
print(f"Best matched job: {matched_job}")
print('Job title: ', matched_job)

web development
most_similar_idx 29
Best matched job: UI Developer
Job title:  UI Developer
