In [2]:
import numpy as np 
import pandas as pd

In [3]:
df = pd.read_csv(r'C:\Users\hp\Downloads\NLP-JobPostings\data\data_jobs.csv')
df.head()

Unnamed: 0,text,label
0,BAGGER: This person's primary duties involve t...,Bagger
1,The Institute for Human Neuroscience at Boys T...,Agronomist
2,About the Company: District Medical Group (DMG...,Autopsy Assistant
3,The Butcher's Table is looking for a part-time...,Butcher
4,Job Details Job Location : Torrance GSM - TORR...,Autopsy Assistant


In [4]:
df['label'] = df['label'].str.strip()

In [5]:
import nltk

nltk.download('punkt_tab')      # Tokenizer
nltk.download('stopwords')  # Stopwords
nltk.download('wordnet')    # WordNet for lemmatization

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
from nltk.data import find

resources = ['tokenizers/punkt.zip', 'corpora/stopwords.zip', 'corpora/wordnet.zip']

for resource in resources:
    try:
        find(resource)
        print(f"{resource} is available.")
    except LookupError:
        print(f"{resource} is missing.")


tokenizers/punkt.zip is available.
corpora/stopwords.zip is available.
corpora/wordnet.zip is available.


In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    # Apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

df['tokens'] = df['text'].apply(preprocess_text)

print(df[['text', 'tokens']].head())


                                                text  \
0  BAGGER: This person's primary duties involve t...   
1  The Institute for Human Neuroscience at Boys T...   
2  About the Company: District Medical Group (DMG...   
3  The Butcher's Table is looking for a part-time...   
4  Job Details Job Location : Torrance GSM - TORR...   

                                              tokens  
0  [bagger, :, person, 's, primary, duty, involve...  
1  [institute, human, neuroscience, boy, town, na...  
2  [company, :, district, medical, group, (, dmg,...  
3  [butcher, 's, table, looking, part-time, food,...  
4  [job, detail, job, location, :, torrance, gsm,...  


In [8]:
df.head()

Unnamed: 0,text,label,tokens
0,BAGGER: This person's primary duties involve t...,Bagger,"[bagger, :, person, 's, primary, duty, involve..."
1,The Institute for Human Neuroscience at Boys T...,Agronomist,"[institute, human, neuroscience, boy, town, na..."
2,About the Company: District Medical Group (DMG...,Autopsy Assistant,"[company, :, district, medical, group, (, dmg,..."
3,The Butcher's Table is looking for a part-time...,Butcher,"[butcher, 's, table, looking, part-time, food,..."
4,Job Details Job Location : Torrance GSM - TORR...,Autopsy Assistant,"[job, detail, job, location, :, torrance, gsm,..."


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib 

# Join tokens back into a single string for each row
df['processed_text'] = df['tokens'].apply(lambda tokens: ' '.join(tokens))

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_text'])

joblib.dump(tfidf_vectorizer, "tfidf_vectorizer_better.joblib")

# TF-IDF matrix to DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.head()

Unnamed: 0,00,000,0000,00000,000000,0000017479,00001,00001491,00005013,00006262,...,zzzinactive,área,áreas,élan,élevage,équivalent,éxito,ía,óptimas,ﬂexibility
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.010968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.017037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
tfidf_df.nunique()

00            2667
000           3382
0000             3
00000            6
000000           2
              ... 
équivalent       2
éxito            2
ía               2
óptimas          2
ﬂexibility       2
Length: 56705, dtype: int64

In [10]:
tfidf_df["label"]

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
22681    0.0
22682    0.0
22683    0.0
22684    0.0
22685    0.0
Name: label, Length: 22686, dtype: float64

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

y = tfidf_df["label"]

# Define the pipeline
svd = TruncatedSVD()  # Leave n_components unspecified
clf = LogisticRegression()

pipeline = Pipeline([
    ('svd', svd),
    ('clf', clf)
])

# Define the parameter grid: Try different numbers of components
param_grid = {
    'svd__n_components': [10000, 20000, 30000, 40000]
}

# Use GridSearchCV to find the best number of components
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', verbose=3)

grid_search.fit(tfidf_matrix, y)  

# The best number of components
print("Best number of components:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END .............svd__n_components=10000;, score=nan total time=77.0min


MemoryError: Unable to allocate 4.23 GiB for an array with shape (56705, 10010) and data type float64

Exception ignored in: 'scipy.linalg._decomp_lu_cython.lu_decompose'
Traceback (most recent call last):
  File "c:\Users\hp\AppData\Local\Programs\Python\Python311\Lib\site-packages\scipy\linalg\_decomp_lu.py", line 344, in lu
    lu_dispatcher(a1, u, p, permute_l)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 4.23 GiB for an array with shape (56705, 10010) and data type float64


In [11]:
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# SVD with n_components=10000
svd = TruncatedSVD(n_components=10000)
X_reduced = svd.fit_transform(tfidf_matrix)

clf = LogisticRegression()
clf.fit(X_reduced, y)

scores = cross_val_score(clf, X_reduced, y, cv=5, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())

# Save the model and the SVD transformer
joblib.dump(clf, 'logistic_regression_model_10000.joblib')
joblib.dump(svd, 'svd_transformer_10000.joblib')

print("Model and SVD transformer saved successfully.")

NameError: name 'y' is not defined