In [1]:
import pandas as pd
import numpy as np

# Load the dataset
file_path = "/content/resume_dataset_git(RR).csv"
df = pd.read_csv(file_path)

# Display basic info and first few rows
df.info(), df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1219 entries, 0 to 1218
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   ID        1219 non-null   int64 
 1   Category  1219 non-null   object
 2   Resume    1219 non-null   object
dtypes: int64(1), object(2)
memory usage: 28.7+ KB


(None,
    ID Category                                             Resume
 0   1       HR  b'John H. Smith, P.H.R.\n800-991-5187 | PO Box...
 1   2       HR  b'Name Surname\nAddress\nMobile No/Email\nPERS...
 2   3       HR  b'Anthony Brown\nHR Assistant\nAREAS OF EXPERT...
 3   4       HR  b'www.downloadmela.com\nSatheesh\nEMAIL ID:\nC...
 4   5       HR  b"HUMAN RESOURCES DIRECTOR\n\xef\x82\xb7Expert...)

In [2]:
#remove all except 50

#df = df.head(50)

In [3]:
# Cleaning the 'Resume' column
df["Resume"] = (
    df["Resume"]
    .astype(str)
    .str.replace(r"^b'|b\"", "", regex=True)  # Remove byte string prefixes
    .str.replace(r"\\x\w{2}", "", regex=True)  # Remove hex escape sequences
    .str.replace(r"\s+", " ", regex=True)  # Replace newlines and multiple spaces with a single space
    .str.replace(r"\\n", " ", regex=True)
    .str.strip()  # Trim leading/trailing spaces
)

# Display cleaned sample resumes
df["Resume"] = df["Resume"].str.lower()
print(df["Category"].head(2))


0    HR
1    HR
Name: Category, dtype: object


In [4]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already present
nltk.download("stopwords")

# Get English stopwords
stop_words = set(stopwords.words("english"))

# Function to remove stopwords from text
def remove_stopwords(text):
    words = text.split()  # Tokenize by spaces
    filtered_words = [word for word in words if word.lower() not in stop_words]  # Remove stopwords
    return " ".join(filtered_words)  # Reconstruct text

# Apply function to 'Resume' column
df["Resume"] = df["Resume"].astype(str).apply(remove_stopwords)

# Display cleaned sample resumes
df["Resume"].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,Resume
0,"john h. smith, p.h.r. 800-991-5187 | po box 16..."
1,name surname address mobile no/email personal ...
2,anthony brown hr assistant areas expertise per...
3,www.downloadmela.com satheesh email id: career...
4,human resources director expert organizational...


In [5]:
import re

# Function to remove URLs from text
def remove_links(text):
    return re.sub(r"https?://\S+|www\.\S+", "", text)  # Remove URLs

# Apply function to 'Resume' column
df["Resume"] = df["Resume"].astype(str).apply(remove_links)

# Display cleaned sample resumes
df["Resume"].head()

Unnamed: 0,Resume
0,"john h. smith, p.h.r. 800-991-5187 | po box 16..."
1,name surname address mobile no/email personal ...
2,anthony brown hr assistant areas expertise per...
3,satheesh email id: career objective pursue gr...
4,human resources director expert organizational...


In [6]:
print(df["Resume"][1])

name surname address mobile no/email personal profile self motivated individual confident approach people. communicate well levels personnel feel good listening ability allows resolve problems quickly. enthusiastic role enjoy working hr, like fast paced environment always changing like adapt changes quickly allowing others also adapt quickly. organized nature like ensure date work. enjoy new challenges always keen learn new skills. employment history date date date hr consultant where? role hr consultant, visit clients provide hr advice help resolve issues. responsibilities include: provide employment law advice help writing issuing contracts employment employee handbooks advise maternity/paternity rights help payroll holidays, sickness etc building relationships new existing clients training new staff coming business managers deal staff regards personnel conduct disciplinary hearings appeals intermediary person devise staff benefits incentives learning development opportunities compan

In [7]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

# Download WordNet if not already present
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("punkt")  # Required for tokenization
nltk.download('punkt_tab')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
from nltk.tokenize import word_tokenize
def lemmatize_text(text):
    words = word_tokenize(str(text))  # Tokenize
    lemmatized_words = [lemmatizer.lemmatize(word, pos="v") for word in words]  # Lemmatize as verbs
    return " ".join(lemmatized_words)  # Reconstruct sentence

# Apply to the 'Resume' column
df["Resume"] = df["Resume"].apply(lemmatize_text)
df["Resume"].head()

Unnamed: 0,Resume
0,"john h. smith , p.h.r . 800-991-5187 | po box ..."
1,name surname address mobile no/email personal ...
2,anthony brown hr assistant areas expertise per...
3,satheesh email id : career objective pursue gr...
4,human resources director expert organizational...


In [10]:
print(df["Resume"][0])

john h. smith , p.h.r . 800-991-5187 | po box 1673 | callahan , fl 32011 | info @ greatresumesfast.com approachable innovator passion human resources . senior human resources professional personable , analytical , flexible senior hr professional multifaceted expertise . season benefit administrator extensive experience work highly pay professionals client-relationship-based settings . dynamic team leader capable analyze alternatives identify tough choices communicate total value benefit compensation package senior level executives employees . core competencies benefit administration customer service cost control recruit acquisition management compliance report retention professional service domestic & international benefit collaboration adaptability change management define contribution plan audit negotiation corporate hr policies full lifecycle train 401 ( k ) form 5500 confidential file eeo-1 aap fmla std ltd h1-b visa vet 100 eap process payroll hr technology hris data management & 

In [11]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")
# Convert each resume into an embedding vector
df["bert_embedding"] = df["Resume"].apply(lambda x: model.encode(x))
print(df["bert_embedding"][0])
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

[-6.22978508e-02  2.90413983e-02 -8.29615444e-02  3.00440937e-02
  1.02988370e-02  4.14510369e-02  9.01495013e-03  4.90115136e-02
 -3.84691246e-02 -1.23207448e-02 -4.84754844e-03  1.40532739e-02
 -3.13100591e-02 -1.30394809e-02  2.08116006e-02  2.79346872e-02
  5.83925918e-02  6.68392051e-04  3.05012967e-02 -3.98479104e-02
 -4.20144089e-02 -1.03048813e-02 -5.45234866e-02 -6.47173673e-02
 -7.82293305e-02  1.45465685e-02 -5.38475141e-02  8.44685435e-02
 -5.12357727e-02 -7.59736970e-02  1.15262009e-02  1.43171260e-02
  6.46140426e-02  2.55033672e-02  6.61053285e-02  4.48149443e-02
 -5.90144545e-02  3.46769672e-03 -1.33602237e-02  2.19666138e-02
 -6.29604310e-02 -3.71701345e-02 -3.59395817e-02  1.83615368e-02
 -1.03837233e-02 -1.30979970e-01 -3.98740731e-02 -9.49764065e-03
 -3.63049866e-03  3.31343450e-02 -3.30623798e-02  3.98153439e-02
  4.36807163e-02  3.51634324e-02 -2.34371386e-02  2.75305938e-02
  2.36380156e-02 -1.59295350e-02 -6.49031475e-02 -6.21103458e-02
 -8.60917047e-02 -2.71835

Unnamed: 0,ID,Category,Resume,bert_embedding
0,1,HR,"john h. smith , p.h.r . 800-991-5187 | po box ...","[-0.06229785, 0.029041398, -0.082961544, 0.030..."
1,2,HR,name surname address mobile no/email personal ...,"[-0.064118676, 0.009727871, -0.0061487085, 0.0..."
2,3,HR,anthony brown hr assistant areas expertise per...,"[-0.043815237, 0.013882138, -0.07608884, 0.045..."
3,4,HR,satheesh email id : career objective pursue gr...,"[-0.02217126, -0.03456004, -0.051463455, 0.042..."
4,5,HR,human resources director expert organizational...,"[-0.04154574, -0.016407888, -0.020731276, 0.00..."


In [12]:
print(df["bert_embedding"][0])

[-6.22978508e-02  2.90413983e-02 -8.29615444e-02  3.00440937e-02
  1.02988370e-02  4.14510369e-02  9.01495013e-03  4.90115136e-02
 -3.84691246e-02 -1.23207448e-02 -4.84754844e-03  1.40532739e-02
 -3.13100591e-02 -1.30394809e-02  2.08116006e-02  2.79346872e-02
  5.83925918e-02  6.68392051e-04  3.05012967e-02 -3.98479104e-02
 -4.20144089e-02 -1.03048813e-02 -5.45234866e-02 -6.47173673e-02
 -7.82293305e-02  1.45465685e-02 -5.38475141e-02  8.44685435e-02
 -5.12357727e-02 -7.59736970e-02  1.15262009e-02  1.43171260e-02
  6.46140426e-02  2.55033672e-02  6.61053285e-02  4.48149443e-02
 -5.90144545e-02  3.46769672e-03 -1.33602237e-02  2.19666138e-02
 -6.29604310e-02 -3.71701345e-02 -3.59395817e-02  1.83615368e-02
 -1.03837233e-02 -1.30979970e-01 -3.98740731e-02 -9.49764065e-03
 -3.63049866e-03  3.31343450e-02 -3.30623798e-02  3.98153439e-02
  4.36807163e-02  3.51634324e-02 -2.34371386e-02  2.75305938e-02
  2.36380156e-02 -1.59295350e-02 -6.49031475e-02 -6.21103458e-02
 -8.60917047e-02 -2.71835

In [13]:
X_np = np.vstack(df["bert_embedding"].values)
print(np.shape(X_np))
print(X_np.T)

(1219, 384)
[[-0.06229785 -0.06411868 -0.04381524 ... -0.03648912 -0.05202227
  -0.0998427 ]
 [ 0.0290414   0.00972787  0.01388214 ... -0.04718732 -0.01626554
  -0.0184462 ]
 [-0.08296154 -0.00614871 -0.07608884 ... -0.06346656 -0.06820641
   0.0126791 ]
 ...
 [-0.08884183  0.01232267 -0.02292597 ... -0.03507937 -0.05126786
   0.00164326]
 [ 0.01802645 -0.00023794 -0.0472453  ... -0.08468697 -0.10429535
  -0.06334233]
 [ 0.05054363  0.03612588  0.03140382 ... -0.0478464  -0.02173669
   0.0026759 ]]


In [14]:
from sklearn.decomposition import PCA
pca = PCA(n_components=30)
X_pca = pca.fit_transform(X_np)

In [15]:
print(X_pca)
print(np.shape(X_pca))

[[ 0.06409433  0.16611125  0.00982554 ...  0.01155952 -0.02645434
   0.03945215]
 [ 0.19205984 -0.05199929 -0.27415133 ...  0.00138139 -0.04791559
  -0.0378284 ]
 [ 0.18889123  0.03528832 -0.13378061 ...  0.0349296  -0.08024587
   0.11119286]
 ...
 [-0.26341993  0.00151055  0.00146082 ... -0.01735138  0.1898643
   0.02130146]
 [-0.15133038  0.03799248  0.18932073 ...  0.07120549  0.20401286
   0.07275383]
 [-0.00337227 -0.08553436 -0.5031019  ...  0.02437333 -0.04150155
   0.01151074]]
(1219, 30)


In [16]:
from sklearn.cluster import KMeans
k = 25
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_pca)

In [17]:
df["Cluster"] = clusters

In [18]:
df[["Resume","Cluster"]].head()

Unnamed: 0,Resume,Cluster
0,"john h. smith , p.h.r . 800-991-5187 | po box ...",3
1,name surname address mobile no/email personal ...,3
2,anthony brown hr assistant areas expertise per...,3
3,satheesh email id : career objective pursue gr...,3
4,human resources director expert organizational...,3


In [19]:
from IPython.display import display
df_x = df.drop("bert_embedding",axis = 1)
df_x = df_x.drop("Resume",axis = 1)
display(df_x)

Unnamed: 0,ID,Category,Cluster
0,1,HR,3
1,2,HR,3
2,3,HR,3
3,4,HR,3
4,5,HR,3
...,...,...,...
1214,1215,Aviation,22
1215,1216,Aviation,22
1216,1217,Aviation,0
1217,1218,Aviation,0


In [20]:
from sklearn.metrics import accuracy_score
from scipy.optimize import linear_sum_assignment
from sklearn.preprocessing import LabelEncoder

import numpy as np

def cluster_accuracy(y_true, y_pred):
    """
    Computes accuracy after optimal label matching.
    """
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Build confusion matrix
    D = max(y_pred.max(), y_true.max()) + 1
    confusion_matrix = np.zeros((D, D), dtype=int)
    for i in range(len(y_true)):
        confusion_matrix[y_pred[i], y_true[i]] += 1

    # Use Hungarian algorithm to find best match
    from scipy.optimize import linear_sum_assignment
    row_ind, col_ind = linear_sum_assignment(-confusion_matrix)
    accuracy = confusion_matrix[row_ind, col_ind].sum() / y_true.size

    return accuracy

le = LabelEncoder()
y_encoded = le.fit_transform(df_x["Category"])
print(y_encoded)
acc = cluster_accuracy(y_encoded,df["Cluster"])
print(acc)

[19 19 19 ...  7  7  7]
0.4495488105004102


In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y_encoded, test_size=0.3, random_state=42)

In [23]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

linear_svm = SVC(kernel="linear")
linear_svm.fit(X_train,y_train)
poly_svm = SVC(kernel="poly", degree=3)
poly_svm.fit(X_train,y_train)
rbf_svm = SVC(kernel="rbf",gamma="scale")
rbf_svm.fit(X_train, y_train)

linear_acc = accuracy_score(y_test,linear_svm.predict(X_test))
poly_acc = accuracy_score(y_test,poly_svm.predict(X_test))
rbf_acc = accuracy_score(y_test,rbf_svm.predict(X_test))

print("Linear SVM Accuracy:", linear_acc)
print("Polynomial SVM Accuracy:", poly_acc)
print("RBF SVM Accuracy:", rbf_acc)



Linear SVM Accuracy: 0.6639344262295082
Polynomial SVM Accuracy: 0.6174863387978142
RBF SVM Accuracy: 0.6666666666666666


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report



model = MLPClassifier(hidden_layer_sizes=(10, 10),
                      activation='logistic',
                      solver='adam',
                      max_iter=1000,
                      random_state=42)

# Train the model
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5273224043715847

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.48      0.52        23
           1       0.56      0.86      0.68        21
           2       0.25      0.50      0.33         4
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         4
           5       0.27      0.70      0.39        10
           6       0.00      0.00      0.00         7
           7       0.00      0.00      0.00         5
           8       0.00      0.00      0.00        14
           9       0.50      0.67      0.57         9
          10       0.00      0.00      0.00        10
          11       0.00      0.00      0.00        11
          12       0.00      0.00      0.00         8
          13       0.40      0.44      0.42        18
          14       0.12      0.13      0.13        15
          15       0.70      0.75      0.72        40
          16       0.68    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Define the parameter grid
param_grid = {
    'hidden_layer_sizes': [(10,), (10, 10), (50,), (50, 50), (100,), (100, 100)],
    'activation': ['relu', 'tanh', 'logistic'],
    'solver': ['adam', 'sgd'],
    'alpha': [0.0001, 0.001, 0.01],  # L2 penalty (regularization)
    'learning_rate': ['constant', 'adaptive'],
}

# Initialize base model
mlp = MLPClassifier(max_iter=1000, random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=mlp, param_grid=param_grid,
                           scoring='accuracy', cv=5, verbose=2, n_jobs=-1)

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Hyperparameters:\n", grid_search.best_params_)

# Evaluate best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Fitting 5 folds for each of 216 candidates, totalling 1080 fits
