In [260]:
###############################################################
# Author: walterwhites
# Hackathon: CrackedDevs Hackathon Jan 2024
# This code is subject Devpost Hackathon and restrictions.
###############################################################

In [261]:
import pandas as pd

df = pd.read_csv('jobs_data.csv')


In [262]:
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk import sent_tokenize, WordNetLemmatizer
import re
import numpy as np

def search(query, model, descriptions):
    query = preprocess_text(query)
    query_vector = np.mean([model.wv[word] for word in query.split() if word in model.wv], axis=0)

    description_vectors = [np.mean([model.wv[word] for word in desc.split() if word in model.wv], axis=0) for desc in descriptions]

    cosine_similarities = [linear_kernel([query_vector], [desc_vector]).flatten()[0] for desc_vector in description_vectors]

    results = pd.DataFrame({'id': df['id'], 'Description': df['cleaned_description'], 'Similarity': cosine_similarities})
    results = results.sort_values(by='Similarity', ascending=False)
    
    return results

def preprocess_text(text):
    text = lowercase_text(text)
    text = remove_urls(text)
    # Tokenization
    word_tokens = tokenization(text)
    word_tokens = clear_ponctuation(word_tokens)
    word_tokens = custom_clean(word_tokens)
    # Lemmatisation
    lemmatized_words = lemmatization(word_tokens)
    # Suppression des stopwords
    cleaned_text = remove_stopwords(lemmatized_words)
    cleaned_text_str = ' '.join(cleaned_text)  # Convert the list to a string
    return cleaned_text_str

def remove_urls(text):
    regex = r'https?://\S+|www\.\S+'
    text = re.sub(regex, '', text)
    return text

def lowercase_text(text):
    return text.lower()

def lemmatization(word_tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in word_tokens]

def tokenization(text):
    return sent_tokenize(text)

def custom_clean(text):
    # Remplacer les sauts de lignes par des espaces
    word_tokens = [word_token.replace('\n', ' ') for word_token in text]
    # Remplacer les non-breaking space (nbsp) par des espaces
    word_tokens = [re.sub(r'\xa0', ' ', phrase) for phrase in word_tokens]
    # Supprimer les espaces en trop
    word_tokens = [re.sub(r'\s+', ' ', phrase) for phrase in word_tokens]
    return word_tokens

def remove_stopwords(sentences):
    stop_words = set(stopwords.words('english'))
    result = []
    for sentence in sentences:
        words = sentence.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        result.append(' '.join(filtered_words))
    return result

def clear_ponctuation(text):
    return [re.sub(r'[^a-zA-Z0-9\s]', '', phrase) for phrase in text]

def extract_text_from_body(html_body):
    soup = BeautifulSoup(html_body, 'html.parser')
    return soup.get_text()


In [263]:
# Clean HTML tags
df['description'] = df['description'].apply(extract_text_from_body)

# Clean Text 
df['cleaned_description'] = df['description'].apply(preprocess_text)
df[['id', 'cleaned_description']].to_csv('cleaned_jobs_data.csv', index=False)
df.to_pickle("api/cleaned_jobs_data.pkl")

In [264]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import linear_kernel

model = Word2Vec(sentences=df['cleaned_description'].apply(lambda x: x.split()), vector_size=200, window=10, min_count=1, workers=4, epochs=20)

query = "blockchain"
search_results = search(query, model, df['cleaned_description'])
print(search_results.head(10))


      id                                        Description  Similarity
29  3082  we3 leading professional network women nonbina...   31.929531
27  3084  building talent pool highly skilled backend en...   31.259947
26  3095  rampcom est compensation 150kyr series 300 emp...   30.442137
31  3077  building talent pool highly skilled frontend e...   30.432049
30  3080  series c combinator wellness company est compe...   30.056515
4   3970  seeking highly motivated individual join team ...   28.836357
25  3098  rampcom est compensation 150kyr series 300 emp...   28.411894
24  3100  job description remote position moonward capit...   27.862480
37  3006  coingecko global leader tracking cryptocurrenc...   25.193859
0   4042  search passionate organized individual take ro...   24.488400


In [265]:
from joblib import dump

dump(model, 'model_Word2Vec.joblib')

['model_Word2Vec.joblib']