In [43]:
import pandas as pd 
import numpy as np 

In [44]:
df = pd.read_csv(r'C:\Users\hp\Downloads\NLP-JobPostings\data\data_jobs.csv')
df.head()

Unnamed: 0,text,label
0,BAGGER: This person's primary duties involve t...,Bagger
1,The Institute for Human Neuroscience at Boys T...,Agronomist
2,About the Company: District Medical Group (DMG...,Autopsy Assistant
3,The Butcher's Table is looking for a part-time...,Butcher
4,Job Details Job Location : Torrance GSM - TORR...,Autopsy Assistant


In [45]:
df["label"].unique()

array(['Bagger ', 'Agronomist ', 'Autopsy Assistant ', ...,
       'Seasonal Warehouse Associate', 'Ross', 'Histology'], dtype=object)

In [46]:
unique_labels_count = df['label'].nunique()
unique_labels_count

4431

In [47]:
# Strip whitespace from the labels
df['label'] = df['label'].str.strip()

# Recalculate the number of unique labels after cleaning
unique_labels_cleaned = df['label'].nunique()
print("Number of unique labels after cleaning:", unique_labels_cleaned)

Number of unique labels after cleaning: 4323


In [48]:
# Get the sorted list of unique labels
unique_labels_list = sorted(df['label'].unique())

# Display the list
for label in unique_labels_list:
    print(label)


2D animation
3D Animator
3D CAD
3D modeling
3PAR
4G
911 Dispatcher
911 Operator
A+
A/B testing
ABA Therapist
ACF2
ACS
ACT
ADA
ADP Payroll
AIM
ALM
AME
AML
ANSI C
AP Stylebook
APEX
API
API QA
APICS
APL
ARM
ARM Assembler
ARM architecture
AS/400 Control Language
ASA
ASM
ASP
ASP.NET AJAX
ASP.NET MVC
ATG Commerce
ATLAS
Aba
Able Seaman
Abm
Academic Advisor
Academic Staff Recruiter
Accompanist
Account Development Manager Inside Sales
Account Executive
Account Executive Asset Management
Account Executive Hospice
Account Executive Key Account
Account Executive Strategic
Account Manager
Account Manager Security Industry
Account Receivable Coordinator
Account Senior Analyst
Account Specialist Senior
Accountant
Accountant Ii, Financial Accounting
Accounting
Accounting Analyst
Accounting Assistant
Accounting Audit Manager
Accounting Clerk
Accounting Instructor
Accounting Intern
Accounting Manager
Accounting Operation Representative
Accounts Payable
Accounts Payable Clerk
Accounts Payable Specialist


In [49]:
import nltk

nltk.download('punkt_tab')      # Tokenizer
nltk.download('stopwords')  # Stopwords
nltk.download('wordnet')    # WordNet for lemmatization


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [50]:
from nltk.data import find

resources = ['tokenizers/punkt.zip', 'corpora/stopwords.zip', 'corpora/wordnet.zip']

for resource in resources:
    try:
        find(resource)
        print(f"{resource} is available.")
    except LookupError:
        print(f"{resource} is missing.")


tokenizers/punkt.zip is available.
corpora/stopwords.zip is available.
corpora/wordnet.zip is available.


In [51]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

df['tokens'] = df['text'].apply(preprocess_text)

print(df[['text', 'tokens']].head())


                                                text  \
0  BAGGER: This person's primary duties involve t...   
1  The Institute for Human Neuroscience at Boys T...   
2  About the Company: District Medical Group (DMG...   
3  The Butcher's Table is looking for a part-time...   
4  Job Details Job Location : Torrance GSM - TORR...   

                                              tokens  
0  [bagger, :, person, 's, primary, duty, involve...  
1  [institute, human, neuroscience, boy, town, na...  
2  [company, :, district, medical, group, (, dmg,...  
3  [butcher, 's, table, looking, part-time, food,...  
4  [job, detail, job, location, :, torrance, gsm,...  


In [52]:
df.head()

Unnamed: 0,text,label,tokens
0,BAGGER: This person's primary duties involve t...,Bagger,"[bagger, :, person, 's, primary, duty, involve..."
1,The Institute for Human Neuroscience at Boys T...,Agronomist,"[institute, human, neuroscience, boy, town, na..."
2,About the Company: District Medical Group (DMG...,Autopsy Assistant,"[company, :, district, medical, group, (, dmg,..."
3,The Butcher's Table is looking for a part-time...,Butcher,"[butcher, 's, table, looking, part-time, food,..."
4,Job Details Job Location : Torrance GSM - TORR...,Autopsy Assistant,"[job, detail, job, location, :, torrance, gsm,..."


In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib 

# Join tokens back into a single string for each row
df['processed_text'] = df['tokens'].apply(lambda tokens: ' '.join(tokens))

tfidf_vectorizer = TfidfVectorizer(max_features=100)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")

# TF-IDF matrix to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,000,ability,able,and,apply,based,benefit,business,care,career,...,system,team,technology,time,training,well,within,work,working,year
0,0.0,0.0,0.411824,0.0,0.0,0.204127,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.198815,0.0,0.473508,0.0,0.0
1,0.051575,0.177722,0.0,0.175974,0.140131,0.0,0.035516,0.0,0.141575,0.131074,...,0.091758,0.0,0.0,0.030658,0.156617,0.088125,0.131569,0.104941,0.080868,0.057546
2,0.0,0.0,0.0,0.0,0.071391,0.0,0.0,0.0,0.0,0.0,...,0.140241,0.0,0.0,0.046856,0.119685,0.067344,0.0,0.040098,0.0,0.043976
3,0.0,0.098841,0.0,0.0,0.0,0.0,0.098761,0.0,0.0,0.0,...,0.0,0.165647,0.0,0.170504,0.0,0.0,0.0,0.145909,0.0,0.080012
4,0.0,0.108027,0.023116,0.044568,0.0,0.0,0.0,0.045518,0.047809,0.0,...,0.023239,0.015087,0.0,0.023294,0.019833,0.022319,0.022215,0.093024,0.143369,0.0


In [54]:
from sklearn.model_selection import train_test_split

y = df['label']  

X_train, X_test, y_train, y_test = train_test_split(tfidf_df, y, test_size=0.2, random_state=42)

# shapes of the splits
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)


Training set shape: (18148, 100)
Test set shape: (4538, 100)
Training labels shape: (18148,)
Test labels shape: (4538,)


In [55]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib 

model = LogisticRegression()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

joblib.dump(model, 'logistic_regression_model.joblib')

loaded_model = joblib.load('logistic_regression_model.joblib')


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                                          precision    recall  f1-score   support

                                                             3D Animator       0.00      0.00      0.00         1
                                                                    3PAR       0.00      0.00      0.00         1
                                                          911 Dispatcher       0.00      0.00      0.00         3
                                                            911 Operator       0.00      0.00      0.00         2
                                                             A/B testing       0.00      0.00      0.00         1
                                                           ABA Therapist       0.00      0.00      0.00         5
                                                                     ADA       0.00      0.00      0.00         3
                                                             ADP Payroll       0.00    