# Job Classifiction

In [63]:
import pandas as pd
import numpy as np
import re

In [None]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.multiclass import unique_labels
from sklearn.svm import LinearSVC
import nltk
from nltk.corpus import stopwords
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity


In [65]:
jobs = pd.read_csv("./Job_Posts.csv") 


nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    if pd.isna(text):
        return ""
    text = re.sub(r'<.*?>', ' ', text) 
    text = re.sub(r'[^a-zA-Z]', ' ', text) 
    text = text.lower()
    text = " ".join([word for word in text.split() if word not in STOPWORDS])
    return text.strip()

jobs['job_description_clean'] = jobs['job_description'].apply(clean_text)
jobs = jobs.dropna(subset=['job_category1', 'job_description_clean'])


jobs.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zishan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,city,job_title,job_category1,job_category2,job_category3,job_industry1,job_industry2,job_industry3,salary_minimum,...,num_vacancies,career_level,experience_years,post_date,views,job_description,job_requirements,payment_period,currency,job_description_clean
0,516e4ed,Ciro,Sales & Marketing Agent,Sales/Retail/Business Development,Marketing,Select,Telecommunications Services,Select,Select,2000,...,8,Entry Level,0-1,2014-01-01 06:01:41,2602,<p><strong>Qualifications</strong>:<br /><br /...,,Per Month,Egyptian Pound,qualifications bull graduates undergraduates e...
1,a361ef59,Cairo,German Training Coordinator,Customer Service/Support,Administration,Human Resources,Translation and Localization,Business Services - Other,Education,1000,...,8,Entry Level,0-2,2014-01-01 20:01:18,2213,<p>&bull;Placing jobs' ads on various websites...,,Per Month,Egyptian Pound,bull placing jobs ads various websites purpose...
2,7226ce78,Cairo,Junior Software Developer,IT/Software Development,Select,Select,Computer Software,Select,Select,2000,...,1,Entry Level,2,2014-01-02 11:01:03,2940,"<span style=""text-decoration: underline;""><str...",,Per Month,Egyptian Pound,job summary bull software developers brains be...
3,f4b2bcd6,Cairo,Application Support Engineer,IT/Software Development,Select,Select,Telecommunications Services,Select,Select,2000,...,1,Entry Level,1-2,2014-01-02 12:01:23,2042,"<strong><span style=""text-decoration: underlin...",,Per Month,Egyptian Pound,position experience skills required one two ye...
4,3fee6f73,Alexandria,Electrical Maintenance Engineer,Engineering,Select,Select,Food and Beverage Production,Select,Select,5000,...,1,Experienced (Non-Manager),1-3,2014-01-21 13:45:56,5684,Job Title: Electrical Maintenance Engineer<br ...,,Per Month,Egyptian Pound,job title electrical maintenance engineer loca...


In [66]:
label_encoder = LabelEncoder()
jobs['job_category_encoded'] = label_encoder.fit_transform(jobs['job_category1'])

X_train, X_test, y_train, y_test = train_test_split(jobs['job_description_clean'], jobs['job_category_encoded'], test_size=0.2, random_state=42)
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [67]:
model = LinearSVC()
model.fit(X_train_vec, y_train)

In [68]:
y_pred = model.predict(X_test_vec)

labels_used = unique_labels(y_test, y_pred)
print(classification_report(y_test, y_pred, labels=labels_used, target_names=label_encoder.inverse_transform(labels_used)))

                                      precision    recall  f1-score   support

        Accounting/Finance/Insurance       0.25      0.14      0.18         7
                      Administration       0.64      0.55      0.59        93
                             Banking       1.00      0.36      0.53        11
                 Biotech/R&D/Science       0.78      0.35      0.48        20
Building Construction/Skilled Trades       0.00      0.00      0.00        13
                           Business        0.60      0.25      0.35        12
                     Creative/Design       0.82      0.81      0.82       358
            Customer Service/Support       0.83      0.82      0.83       564
                   Editorial/Writing       0.87      0.78      0.82        92
                  Education/Training       0.58      0.55      0.57        38
                         Engineering       0.65      0.80      0.72       650
                             Fashion       1.00      0.33      

### Predicts the job category by reading the job description

In [69]:
def predict_category(text):
    text_clean = clean_text(text)
    vec = vectorizer.transform([text_clean])
    pred = model.predict(vec)
    return label_encoder.inverse_transform(pred)[0]

# Example
print(predict_category("Looking for someone with strong python skills."))

IT/Software Development


# Job recommendations

In [None]:
jobs['combined_features'] = jobs['job_title'].fillna('') + ' ' + jobs['job_description'].fillna('')
jobs['combined_features_clean'] = jobs['combined_features'].apply(clean_text)

job_vectorizer = TfidfVectorizer(max_features=5000)
job_vectors = job_vectorizer.fit_transform(jobs['combined_features_clean'])

In [None]:
def recommend_jobs(user_profile_text, top_n=5):
    user_clean = clean_text(user_profile_text)
    user_vec = job_vectorizer.transform([user_clean])

    similarity_scores = cosine_similarity(user_vec, job_vectors).flatten()

    top_indices = similarity_scores.argsort()[::-1][:top_n]
    recommended_jobs = jobs.iloc[top_indices][['job_title', 'job_description', 'job_category1']]
    recommended_jobs['similarity'] = similarity_scores[top_indices]
    
    return recommended_jobs

### Recommends 5 jobs

In [None]:
# Example
user_input = """
Skilled in Python, SQL, and data analysis. Experience in building machine learning models. 
Looking for roles in software development or data science.
"""

recommendations = recommend_jobs(user_input, top_n=5)
print(recommendations[['job_title', 'job_description', 'similarity']].to_string(index=False))