In [54]:
import spacy

nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    doc = nlp(text)
    # Extract lemmatized tokens
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    print(tokens)
    return tokens


In [55]:
import numpy as np

def build_co_occurrence_matrix(texts):
    vocabulary = set()
    for text in texts:
        tokens = preprocess_text(text)
        vocabulary.update(tokens)

    vocabulary = list(vocabulary)
    vocab_size = len(vocabulary)
    co_occurrence_matrix = np.zeros((vocab_size, vocab_size))

    for text in texts:
        tokens = preprocess_text(text)
        for i, token1 in enumerate(tokens):
            for j, token2 in enumerate(tokens):
                if i != j:
                    index1, index2 = vocabulary.index(token1), vocabulary.index(token2)
                    co_occurrence_matrix[index1][index2] += 1

    return co_occurrence_matrix, vocabulary


In [101]:
def train_glove(co_occurrence_matrix, vector_size=100, learning_rate=0.05, epochs=100):
#     np.random.seed(42)
    vocab_size = co_occurrence_matrix.shape[0]
    W = np.random.rand(vocab_size, vector_size)
    b = np.random.rand(vocab_size)
    total_loss = 0

    for epoch in range(epochs):
        for i in range(vocab_size):
            for j in range(vocab_size):
                if co_occurrence_matrix[i][j] > 0:
                    f_ij = np.dot(W[i], W[j]) + b[i] + b[j]
                    loss_ij = (f_ij - np.log(co_occurrence_matrix[i][j])) ** 2
                    total_loss += loss_ij

                    gradient_W_i = 2 * (f_ij - np.log(co_occurrence_matrix[i][j])) * W[j]
                    gradient_W_j = 2 * (f_ij - np.log(co_occurrence_matrix[i][j])) * W[i]
                    gradient_b_i = 2 * (f_ij - np.log(co_occurrence_matrix[i][j]))
                    gradient_b_j = 2 * (f_ij - np.log(co_occurrence_matrix[i][j]))

                    W[i] -= learning_rate * gradient_W_i
                    W[j] -= learning_rate * gradient_W_j
                    b[i] -= learning_rate * gradient_b_i
                    b[j] -= learning_rate * gradient_b_j

        avg_loss = total_loss / (vocab_size ** 2)
        print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss}")

    return W, b


In [102]:
def rank_texts(texts, keyword, W, vocabulary):
    keyword_vector = W[vocabulary.index(keyword)]
    text_scores = []

    for text in texts:
        tokens = preprocess_text(text)
        text_vector = np.mean([W[vocabulary.index(token)] for token in tokens if token in vocabulary], axis=0)
        similarity = np.dot(text_vector, keyword_vector) / (np.linalg.norm(text_vector) * np.linalg.norm(keyword_vector))
        text_scores.append((text, similarity))

    ranked_texts = sorted(text_scores, key=lambda x: x[1], reverse=True)
    return ranked_texts


In [103]:
import pandas as pd

df = pd.read_csv("../data/external/potential-talents.csv").set_index('id')
df.head()

Unnamed: 0_level_0,job_title,location,connection,fit
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
4,People Development Coordinator at Ryan,"Denton, Texas",500+,
5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [104]:
df_no_duplicates = df.drop_duplicates()
print('shape of the dataframe without duplicates: ', df_no_duplicates.shape)

shape of the dataframe without duplicates:  (53, 4)


In [105]:
df_no_duplicates_fitFeature = df_no_duplicates.drop('fit', axis=1)
df_v1 = df_no_duplicates_fitFeature.copy()

In [108]:
# Example usage
texts = df_v1['job_title'].tolist()  # Your list of texts
co_occurrence_matrix, vocabulary = build_co_occurrence_matrix(texts)

print('Matrix: ', co_occurrence_matrix)
print('Vocabulary: ', vocabulary)

W, b = train_glove(co_occurrence_matrix)
keyword = "Human"
ranked_texts = rank_texts(texts, keyword, W, vocabulary)

# Display the ranked texts
for i, (text, score) in enumerate(ranked_texts):
    print(f"{i+1}. {text} - Score: {score}")

# # Test Data
# texts = [
#     "Natural language processing is a subfield of artificial intelligence.",
#     "Machine learning algorithms can analyze large sets of data.",
#     "Word embeddings capture semantic relationships between words.",
#     "Deep learning models have achieved state-of-the-art results in various NLP tasks.",
#     "Text classification is a common natural language processing application.",
# ]

# # Keyword for Ranking
# keyword = "learning"

# # Test GloVe Implementation
# co_occurrence_matrix, vocabulary = build_co_occurrence_matrix(texts)
# W, b = train_glove(co_occurrence_matrix)
# ranked_texts = rank_texts(texts, keyword, W, vocabulary)

# # Display the ranked texts
# for i, (text, score) in enumerate(ranked_texts):
#     print(f"{i+1}. {text} - Score: {score}")



['2019', 'C.T.', 'Bauer', 'College', 'Business', 'Graduate', 'Magna', 'Cum', 'Laude', 'aspire', 'Human', 'Resources', 'professional']
['native', 'English', 'teacher', 'EPIK', 'English', 'Program', 'Korea']
['aspire', 'Human', 'Resources', 'Professional']
['People', 'Development', 'Coordinator', 'Ryan']
['Advisory', 'Board', 'Member', 'Celal', 'Bayar', 'University']
['aspire', 'Human', 'Resources', 'Specialist']
['student', 'Humber', 'College', 'Aspiring', 'Human', 'Resources', 'Generalist']
['HR', 'Senior', 'Specialist']
['seek', 'Human', 'Resources', 'HRIS', 'Generalist', 'Positions']
['student', 'Chapman', 'University']
['SVP', 'CHRO', 'Marketing', 'Communications', 'CSR', 'Officer', '|', 'ENGIE', '|', 'Houston', '|', 'Woodlands', '|', 'Energy', '|', 'GPHR', '|', 'SPHR']
['Human', 'Resources', 'Coordinator', 'InterContinental', 'Buckhead', 'Atlanta']
['aspire', 'Human', 'Resources', 'Management', 'student', 'seek', 'internship']
['seek', 'Human', 'Resources', 'opportunity']
['experie

  loss_ij = (f_ij - np.log(co_occurrence_matrix[i][j])) ** 2
  gradient_W_j = 2 * (f_ij - np.log(co_occurrence_matrix[i][j])) * W[i]
  gradient_W_i = 2 * (f_ij - np.log(co_occurrence_matrix[i][j])) * W[j]


Epoch 3/100, Average Loss: nan
Epoch 4/100, Average Loss: nan
Epoch 5/100, Average Loss: nan
Epoch 6/100, Average Loss: nan
Epoch 7/100, Average Loss: nan
Epoch 8/100, Average Loss: nan
Epoch 9/100, Average Loss: nan
Epoch 10/100, Average Loss: nan
Epoch 11/100, Average Loss: nan
Epoch 12/100, Average Loss: nan
Epoch 13/100, Average Loss: nan
Epoch 14/100, Average Loss: nan
Epoch 15/100, Average Loss: nan
Epoch 16/100, Average Loss: nan
Epoch 17/100, Average Loss: nan
Epoch 18/100, Average Loss: nan
Epoch 19/100, Average Loss: nan
Epoch 20/100, Average Loss: nan
Epoch 21/100, Average Loss: nan
Epoch 22/100, Average Loss: nan
Epoch 23/100, Average Loss: nan
Epoch 24/100, Average Loss: nan
Epoch 25/100, Average Loss: nan
Epoch 26/100, Average Loss: nan
Epoch 27/100, Average Loss: nan
Epoch 28/100, Average Loss: nan
Epoch 29/100, Average Loss: nan
Epoch 30/100, Average Loss: nan
Epoch 31/100, Average Loss: nan
Epoch 32/100, Average Loss: nan
Epoch 33/100, Average Loss: nan
Epoch 34/100, A

In [98]:
my_list = df_v1['job_title'].tolist()
my_list

['2019 C.T. Bauer College of Business Graduate (Magna Cum Laude) and aspiring Human Resources professional',
 'Native English Teacher at EPIK (English Program in Korea)',
 'Aspiring Human Resources Professional',
 'People Development Coordinator at Ryan',
 'Advisory Board Member at Celal Bayar University',
 'Aspiring Human Resources Specialist',
 'Student at Humber College and Aspiring Human Resources Generalist',
 'HR Senior Specialist',
 'Seeking Human Resources HRIS and Generalist Positions',
 'Student at Chapman University',
 'SVP, CHRO, Marketing & Communications, CSR Officer | ENGIE | Houston | The Woodlands | Energy | GPHR | SPHR',
 'Human Resources Coordinator at InterContinental Buckhead Atlanta',
 'Aspiring Human Resources Management student seeking an internship',
 'Seeking Human Resources Opportunities',
 'Experienced Retail Manager and aspiring Human Resources Professional',
 'Human Resources, Staffing and Recruiting Professional',
 'Human Resources Specialist at Luxottica