In [1]:
import pandas as pd 
import numpy as np

In [2]:
# empty list to hold the chunks
chunks = []

# to load the file in smaller chunks to avoid MemoryError
chunk_size = 1000  

# loading data in chunks
for chunk in pd.read_json(r'C:\Users\hp\Downloads\datasets\US Job Postings/techmap-jobs_us_2023-05-05.json', lines=True, chunksize=chunk_size):
    # only the needed columns only
    chunk = chunk[['name', 'text']].rename(columns={'name': 'label', 'text': 'job_description'})
    chunks.append(chunk)
# conc in df
df = pd.concat(chunks)
df.head()

Unnamed: 0,label,job_description
0,Account Manager,"At Legacy Risk, an Alera Group Company, our bu..."
1,"Music Teacher, Band Director",DescriptionWe are looking for music educators ...
2,Floral Clerk FT Chapel Hill NC,To provide excellent guest service by performi...
3,Audio and Video Transcription - Flexible Sched...,What exactly is a search engine evaluation job...
4,Sticker Production Assistant,We are a high-quality printing company operati...


In [3]:
df

Unnamed: 0,label,job_description
0,Account Manager,"At Legacy Risk, an Alera Group Company, our bu..."
1,"Music Teacher, Band Director",DescriptionWe are looking for music educators ...
2,Floral Clerk FT Chapel Hill NC,To provide excellent guest service by performi...
3,Audio and Video Transcription - Flexible Sched...,What exactly is a search engine evaluation job...
4,Sticker Production Assistant,We are a high-quality printing company operati...
...,...,...
33059,Westview Group Fitness Instructor,POSITION SUMMARY: This position supports the w...
33060,Groundskeeper and Maintenance,"Looking for an experienced groundskeeper, land..."
33061,Data Analyst for Marketing Performance & Desig...,Job Summary The Apps Engineering team is looki...
33062,Server/Banquet Server- Private Club,Tam-O-Shanter Country Club Job Description Pos...


In [4]:
unique_labels_count = df["label"].nunique()
print("Number of unique labels:", unique_labels_count)

Number of unique labels: 20349


In [5]:
import nltk

nltk.download('punkt_tab')      # Tokenizer
nltk.download('stopwords')  # Stopwords
nltk.download('wordnet')    # WordNet for lemmatization

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.data import find

resources = ['tokenizers/punkt.zip', 'corpora/stopwords.zip', 'corpora/wordnet.zip']

for resource in resources:
    try:
        find(resource)
        print(f"{resource} is available.")
    except LookupError:
        print(f"{resource} is missing.")

tokenizers/punkt.zip is available.
corpora/stopwords.zip is available.
corpora/wordnet.zip is available.


In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

df['tokens'] = df['job_description'].apply(preprocess_text)

print(df[['job_description', 'tokens']].head())


                                     job_description  \
0  At Legacy Risk, an Alera Group Company, our bu...   
1  DescriptionWe are looking for music educators ...   
2  To provide excellent guest service by performi...   
3  What exactly is a search engine evaluation job...   
4  We are a high-quality printing company operati...   

                                              tokens  
0  [legacy, risk, ,, alera, group, company, ,, bu...  
1  [descriptionwe, looking, music, educator, join...  
2  [provide, excellent, guest, service, performin...  
3  [exactly, search, engine, evaluation, job, ?, ...  
4  [high-quality, printing, company, operating, m...  


In [8]:
df.head()

Unnamed: 0,label,job_description,tokens
0,Account Manager,"At Legacy Risk, an Alera Group Company, our bu...","[legacy, risk, ,, alera, group, company, ,, bu..."
1,"Music Teacher, Band Director",DescriptionWe are looking for music educators ...,"[descriptionwe, looking, music, educator, join..."
2,Floral Clerk FT Chapel Hill NC,To provide excellent guest service by performi...,"[provide, excellent, guest, service, performin..."
3,Audio and Video Transcription - Flexible Sched...,What exactly is a search engine evaluation job...,"[exactly, search, engine, evaluation, job, ?, ..."
4,Sticker Production Assistant,We are a high-quality printing company operati...,"[high-quality, printing, company, operating, m..."


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib 

# Join tokens back into a single string for each row
df['processed_text'] = df['tokens'].apply(lambda tokens: ' '.join(tokens))

tfidf_vectorizer = TfidfVectorizer(max_features=20000)

tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

joblib.dump(tfidf_vectorizer, "tfidf_vectorizer_better.joblib")

# TF-IDF matrix to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,00,000,0000,00000,001,002,008,00a,00am,00p,...,öffentlichen,öpnv,única,über,übermittlung,übernahme,übernehmen,übernimmst,überwachung,üblichen
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.068489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
from sklearn.model_selection import train_test_split

y = df['label']  

X_train, X_test, y_train, y_test = train_test_split(tfidf_df, y, test_size=0.2, random_state=42)

# shapes of the splits
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Test labels shape:", y_test.shape)

Training set shape: (26451, 20000)
Test set shape: (6613, 20000)
Training labels shape: (26451,)
Test labels shape: (6613,)


In [12]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
import numpy as np
import joblib

# Define parameters
svd = TruncatedSVD(n_components=10000)
batch_size = 1000  # Choose an appropriate batch size

# svd on the full training data (or on a subset to avoid memory issues & difference in fitting)
svd.fit(X_train.astype(np.float32))

# Incrementally fit the classifier
clf = SGDClassifier(loss='log_loss', max_iter=1000)

# Process in batches
n_batches = int(np.ceil(X_train.shape[0] / batch_size))

for i in range(n_batches):
    start = i * batch_size
    end = (i + 1) * batch_size
    X_batch = X_train[start:end].astype(np.float32)
    y_batch = y_train[start:end]
    
    # Apply the already fitted SVD transformation on the batch
    X_svd = svd.transform(X_batch).astype(np.float32)
    
    # Incremental learning
    clf.partial_fit(X_svd, y_batch, classes=np.unique(y_train))

# Apply the same transformation and prediction on the test set
X_test_svd = svd.transform(X_test.astype(np.float32))
y_pred = clf.predict(X_test_svd)

# Classification report
print(classification_report(y_test, y_pred))

# Save the trained SVD and classifier
joblib.dump(svd, 'svd_model_better.joblib')
joblib.dump(clf, 'sgd_classifier_better.joblib')


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                                                                                                                                                           precision    recall  f1-score   support

                                                                                                                                 $17.25/hr | Healthcare Aide | START ASAP       0.00      0.00      0.00         2
                                                                                             ($.50 - $.80+ CPM) CDL Class A Truck Drivers Needed: Apply Now! - Now Hiring       0.00      0.00      0.00         1
                                                                                                          ($.50 - $.80+ CPM) CDL-A Tanker Truck Driving Jobs - Apply Now!       0.00      0.00      0.00         1
                                                                                              ($.50 - $.80+ CPM) Regional CDL-A Dry Van Truck Driving Jobs 

['sgd_classifier_better.joblib']