# **Crawling the text from the targeted sites**



In [17]:
from bs4 import BeautifulSoup
import requests
from itertools import zip_longest
def _text_or_none(tag):
    """Return the stripped text of a parsed tag, or None when the tag is missing."""
    return tag.text.strip() if tag else None


def extract_job_data(page_num, timeout=30):
    """Scrape one page of Wuzzuf search results for the query "python".

    Args:
        page_num: Value substituted into the ``start`` query parameter of the
            search URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on a stalled connection.

    Returns:
        A list with one dict per job, each holding the keys "title",
        "description", "company" and "location" (a value is None when the
        matching element was not found). An empty list is returned when the
        page contains no matching elements or when any error occurs; errors
        are printed, not raised.
    """
    try:
        result = requests.get(
            f"https://wuzzuf.net/search/jobs/?a=spbg&q=python&start={page_num}",
            timeout=timeout,
        )
        # Treat HTTP error responses as failures instead of parsing an error
        # page as if it were a result listing.
        result.raise_for_status()
        soup = BeautifulSoup(result.content, "lxml")

        job_titles = soup.find_all("h2", {"class": "css-m604qf"})
        job_descriptions = soup.find_all("div", {"class": "css-y4udm8"})
        company_names = soup.find_all("a", {"class": "css-17s97q8"})
        location_names = soup.find_all("span", {"class": "css-5wys0k"})

        # NOTE(review): the four lists are collected independently and paired
        # by position. If a posting lacks one of these elements, the fields of
        # later postings shift by one -- confirm against the page markup.
        return [
            {
                "title": _text_or_none(title),
                "description": _text_or_none(desc),
                "company": _text_or_none(company),
                "location": _text_or_none(location),
            }
            for title, desc, company, location in zip_longest(
                job_titles, job_descriptions, company_names, location_names
            )
        ]

    except Exception as e:
        # Best-effort scraping: report the problem and let the caller stop.
        print("An error occurred:", e)
        return []

def save_to_file(job_data, file_name):
    """Write the scraped jobs to a UTF-8 text file, one four-line record each.

    Args:
        job_data: Iterable of dicts providing the keys "title", "company",
            "description" and "location".
        file_name: Path of the file to create (an existing file is overwritten).

    Any exception raised while writing is caught and reported on stdout; the
    function never raises and always returns None.
    """
    def record_lines():
        # Yield the lines lazily so they reach the file in the same order, and
        # with the same partial output on error, as individual writes would.
        for job in job_data:
            yield f"Title: {job['title']}\n"
            yield f"Company: {job['company']}\n"
            yield f"Description: {job['description']}\n"
            yield f"Location: {job['location']}\n\n"

    try:
        with open(file_name, 'w', encoding='utf-8') as out:
            out.writelines(record_lines())
        print(f"Data saved to {file_name}")
    except Exception as e:
        print("Error occurred while saving to file:", e)

def crawl_jobs():
    """Crawl result pages one after another until a page yields no jobs.

    Starting at page 0, each page's jobs are fetched with extract_job_data and
    written to ``job_data_<page>.txt`` with save_to_file. The loop stops at the
    first page for which extract_job_data returns an empty list (which is also
    what it returns on a fetch error). Unexpected exceptions are printed
    rather than raised.
    """
    page_num = 0
    try:
        job_data = extract_job_data(page_num)
        while job_data:
            save_to_file(job_data, f"job_data_{page_num}.txt")

            # Move on and fetch the next page before re-testing the condition
            page_num += 1
            job_data = extract_job_data(page_num)

        print("No more jobs to crawl.")

    except Exception as e:
        print("An error occurred:", e)

# Start the crawl only when this code runs as the main program, not when it is
# imported from another module.
if __name__ == "__main__":
    crawl_jobs()


Data saved to job_data_0.txt
Data saved to job_data_1.txt
Data saved to job_data_2.txt
Data saved to job_data_3.txt
Data saved to job_data_4.txt
Data saved to job_data_5.txt
Data saved to job_data_6.txt
Data saved to job_data_7.txt
Data saved to job_data_8.txt
Data saved to job_data_9.txt
Data saved to job_data_10.txt
Data saved to job_data_11.txt
Data saved to job_data_12.txt
Data saved to job_data_13.txt
Data saved to job_data_14.txt
Data saved to job_data_15.txt
No more jobs to crawl.


# Save all crawled documents in one specific folder

In [None]:
import glob
documents = []
# Find all files with the extension .txt in the current directory. glob returns
# them in arbitrary order, so sort to keep document indices reproducible.
files = sorted(glob.glob('*.txt'))

# Open each file and read its contents
for file in files:
    # The crawler writes its output as UTF-8; decode it the same way instead
    # of relying on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        links_content = f.read()

    # Append the content of the file to the documents list
    documents.append(links_content)

# Show the loaded corpus once, after everything has been read (printing inside
# the loop would dump the whole growing list on every iteration).
print (documents)


# Preprocessing and transforming the text


In [None]:

import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import numpy as np

# Fetch the NLTK resources used below: the Punkt tokenizer models needed by
# word_tokenize and the stop-word lists.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' resource instead
# of 'punkt'; download both so word_tokenize works across NLTK versions.
nltk.download('punkt_tab')
nltk.download('stopwords')

def preprocess_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    Steps, in order: tokenize with NLTK's word_tokenize, lowercase, drop
    tokens made up only of punctuation, drop English stop words, and stem the
    remainder with the Porter stemmer.

    Args:
        text: The raw text to process.

    Returns:
        The surviving stemmed tokens joined by single spaces (an empty string
        when nothing survives).
    """
    punctuation = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    kept = []
    for token in word_tokenize(text):
        # Normalization: convert to lowercase
        token = token.lower()

        # Remove punctuation. The check is per character: a substring test
        # against string.punctuation would miss multi-character punctuation
        # tokens the tokenizer emits, such as "``", "''", "..." or "--".
        if all(ch in punctuation for ch in token):
            continue

        # Remove stopwords
        if token in stop_words:
            continue

        # Stemming
        kept.append(stemmer.stem(token))

    return ' '.join(kept)






# Creating an inverted index

In [None]:

# Preprocess the documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]
print (documents)

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()


# Fit the vectorizer and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_documents)

# Get the feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Create inverted index: each word maps to the indices of the documents that
# contain it.
inverted_index = defaultdict(list)
for i, doc in enumerate(preprocessed_documents):
    # dict.fromkeys removes repeated words while keeping first-seen order, so
    # a document index is recorded once per word rather than once per
    # occurrence of that word.
    for word in dict.fromkeys(doc.split()):
        inverted_index[word].append(i)

# Saving the transformed text

In [None]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Paths of the crawled pages to rank: job_data_0.txt through job_data_9.txt
# under /content.
file_paths = [f"/content/job_data_{page}.txt" for page in range(10)]

In [None]:
# Load the raw text of every listed file, keeping the same order as file_paths
# so that positions in `documents` and `file_paths` correspond.
documents = []
for file_path in file_paths:
    with open(file_path, mode='r', encoding='utf-8') as file:
        raw_text = file.read()
        documents.append(raw_text)

In [None]:
# Search term whose similarity to each loaded document is measured below
query = "software"

# **Check the similarity**

In [None]:
# A TF-IDF vectorizer with default settings, separate from `tfidf_vectorizer`
vectorizer = TfidfVectorizer()

# Learn the vocabulary from `documents` and turn each one into a TF-IDF row
vectors = vectorizer.fit_transform(documents)

# Project the query into the same vocabulary; it is passed as a one-element
# list, so the result holds a single row
query_vector = vectorizer.transform([query])

In [None]:
# Cosine similarity of the query row against every document row; the result is
# two-dimensional, with row 0 holding the scores for the single query
cosine_similarities = cosine_similarity(query_vector, vectors)

In [None]:
# Scores of all documents for the single query (row 0). argsort orders
# ascending, so reverse it to get the most similar document first.
query_scores = cosine_similarities[0]
sorted_indices = np.argsort(query_scores)[::-1]

# Pair each file path with its score, in ranking order
ranked_documents = []
for idx in sorted_indices:
    ranked_documents.append((file_paths[idx], query_scores[idx]))

# **Rank the documents according to their similarity**

In [None]:
# Print every (file path, score) pair of ranked_documents in order, numbering
# the ranks from 1
for idx, (file_path, score) in enumerate(ranked_documents, start=1):
    print(f"Rank {idx}: Similarity Score: {score}")
    print(f"Document: {file_path}\n")

In [None]:
# Pair each document's text with its similarity score, then order the pairs
# from the highest score to the lowest
similarity_scores = cosine_similarities.flatten()
document_scores = list(zip(documents, similarity_scores))
sorted_documents = sorted(document_scores, key=lambda pair: pair[1], reverse=True)

# Show the ranking, numbering the entries from 1
print("Ranked documents by decreasing similarity score:")
for rank, (doc, score) in enumerate(sorted_documents, start=1):
    print(rank, ":", doc, "(Similarity Score:", score, ")")

In [None]:

# Number of best-matching documents to show
X = 5

# argsort orders ascending, so the last X positions of the query's row are the
# X highest scores; reverse them to put the best match first
ascending_order = cosine_similarities.argsort()[0]
top_indices = ascending_order[-X:][::-1]

# Collect and print the text of those documents, numbered from 1
top_documents = [documents[position] for position in top_indices]
print("Top", X, "documents:")
for rank, doc in enumerate(top_documents, start=1):
    print(rank, ":", doc)

# **Test the model**

In [None]:
# Example query: read the search text interactively from the user
query = input ("please enter your query ")

# Preprocess the query with the same function that was applied to the
# documents before `tfidf_vectorizer` was fitted
preprocessed_query = preprocess_text(query)

# Represent the query as a vector in the vocabulary of `tfidf_vectorizer`
query_vector = tfidf_vectorizer.transform([preprocessed_query])

# Show the query vector as a dense array
print("Query Vector:")
print(query_vector.toarray())

# Dump the complete inverted index (every word, not only the query's words)
print("\nInverted Index:")
for word, doc_ids in inverted_index.items():
    print(word, ":", doc_ids)