In [1]:
import pandas as pd
from itertools import chain, combinations
from collections import defaultdict
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk


In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\26939\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\26939\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [38]:
dataset_path = 'Dataset/job_descriptions.csv'
pd.read_csv(dataset_path).head(100).to_csv('sample_dataset.csv')

#### DATA PROCESSING

In [4]:
# Initialize the Porter Stemmer
stemmer = PorterStemmer()

# Get the English stopwords
stop_words = set(stopwords.words('english'))

# Define a tokenizer function that also removes stopwords
def tokenize(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and non-alphabetic tokens
    # tokens = [stemmer.stem(word) for word in tokens if word.isalpha() and word.lower() not in stop_words]
    tokens = [word for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return tokens

# Initialize the CountVectorizer with the custom tokenizer
vectorizer = CountVectorizer(tokenizer=tokenize, binary=True, lowercase=True)

# Set the chunk size
chunk_size = 50000  # Adjust this value based on your memory capacity

dataset_path = 'Dataset/job_descriptions.csv'
fit_sample = pd.read_csv(dataset_path, nrows=1000)  # Adjust nrows as needed

# tokenized_skills = fit_sample['skills'].apply(lambda x: ' '.join(tokenize(x)))

all_text_fit =  fit_sample['Job Title']
vectorizer.fit(all_text_fit)
chunks = pd.read_csv(dataset_path, chunksize=chunk_size, usecols=['Job Title'])

# Process each chunk
for i, chunk in enumerate(chunks):
    print(f'Processing chunk {i}')
    # Concatenate all text columns into one to vectorize
    # Tokenize 'Job Title' and 'Role' columns
    tokenized_job_titles = chunk['Job Title'].apply(lambda x: ' '.join(tokenize(x)))
    # tokenized_roles = chunk['Role'].apply(lambda x: ' '.join(tokenize(x)))

    # Concatenate tokenized 'Job Title' and 'Role'
    all_text = tokenized_job_titles

    # all_text = chunk['Job Title'] + " " + chunk['Role']
    
    # Fit the vectorizer and transform the text into a binary matrix
    X_chunk = vectorizer.transform(all_text)
    
    # Convert to dataframe
    binary_df_chunk = pd.DataFrame(X_chunk.toarray(), columns=vectorizer.get_feature_names_out())
    
    # Save the binary dataframe chunk to a CSV file
    binary_df_chunk.to_csv(f'Dataset\job_posting_chunks\jpc{i}.csv', index=False)

    print(f'Chunk {i} processed and saved.')


  binary_df_chunk.to_csv(f'Dataset\job_posting_chunks\jpc{i}.csv', index=False)


Processing chunk 0
Chunk 0 processed and saved.
Processing chunk 1
Chunk 1 processed and saved.
Processing chunk 2
Chunk 2 processed and saved.
Processing chunk 3
Chunk 3 processed and saved.
Processing chunk 4
Chunk 4 processed and saved.
Processing chunk 5
Chunk 5 processed and saved.
Processing chunk 6
Chunk 6 processed and saved.
Processing chunk 7
Chunk 7 processed and saved.
Processing chunk 8
Chunk 8 processed and saved.
Processing chunk 9
Chunk 9 processed and saved.
Processing chunk 10
Chunk 10 processed and saved.
Processing chunk 11
Chunk 11 processed and saved.
Processing chunk 12
Chunk 12 processed and saved.
Processing chunk 13
Chunk 13 processed and saved.
Processing chunk 14
Chunk 14 processed and saved.
Processing chunk 15
Chunk 15 processed and saved.
Processing chunk 16
Chunk 16 processed and saved.
Processing chunk 17
Chunk 17 processed and saved.
Processing chunk 18
Chunk 18 processed and saved.
Processing chunk 19
Chunk 19 processed and saved.
Processing chunk 20


In [32]:
all_text_fit

0      Digital Marketing Specialist Social Media Manager
1                   Web Developer Frontend Web Developer
2             Operations Manager Quality Control Manager
3             Network Engineer Wireless Network Engineer
4                       Event Manager Conference Manager
                             ...                        
995                  Urban Planner Environmental Planner
996       Mechanical Engineer Mechanical Design Engineer
997                               IT Manager IT Director
998                     UI Developer Front-End Developer
999    Landscape Architect Residential Landscape Desi...
Length: 1000, dtype: object

In [5]:
pd.read_csv('Dataset/job_posting_chunks/jpc0.csv').head(100).to_csv('sample.csv')

#### APRIORI

In [7]:
def preprocess_chunk(chunk):
    transactions = []
    # Iterate over each row in the chunk
    for index, row in chunk.iterrows():
        # Create a set for the current transaction
        transaction = frozenset(row.index[row == 1].tolist())
        transactions.append(transaction)
    return transactions

# sample_data = pd.read_csv(sample_file_path)
# test_transactions = preprocess_chunk(sample_data.drop(columns=sample_data.columns[0]))
# test_transactions[:5]  

In [6]:
# Count support for each item in a chunk
def count_support(transactions):
    item_count = defaultdict(int)
    for transaction in transactions:
        for item in transaction:
            item_count[item] += 1
    # print(f"Count Support: {item_count}")
    return item_count

# Update the process_chunks function to handle the binary matrix format
def process_chunks(chunk_paths):
    global_item_counts = defaultdict(int)
    for path in chunk_paths:
        chunk = pd.read_csv(path, index_col=0)  # Read the chunk and set the first column as index
        transactions = preprocess_chunk(chunk)
        local_item_counts = count_support(transactions)
        # Aggregate the counts from this chunk to the global counts
        for item, count in local_item_counts.items():
            global_item_counts[item] += count
    return global_item_counts

# Since we only have one sample file for now, we'll create a list containing just that path
# chunk_paths = [sample_file_path]
# Call the process_chunks function with the single file path
# global_item_counts = process_chunks(chunk_paths)

# Show the global item counts to verify if the function works correctly
# global_item_counts

In [8]:
# Get total transactions from chunks
def get_total_transactions(chunk_paths):
    total_transactions = 0
    for path in chunk_paths:
        with open(path, 'r') as file:
            for line in file:
                total_transactions += 1
    print(f"Total Transactions: {total_transactions-1}")
    return total_transactions - 1

# Identify frequent itemsets from global counts
def get_frequent_itemsets(global_counts, min_support, total_transactions):
    frequent_itemsets = set()
    for candidate, count in global_counts.items():
        if count / total_transactions >= min_support:
           frequent_itemsets.add(frozenset[candidate])
    return frequent_itemsets

# Step 2: Generate candidate k-itemsets
def generate_candidates(frequent_itemsets, k):
    candidates = set()
    frequent_itemsets_list = list(frequent_itemsets)  # Convert to list to allow indexing.

    for i in range(len(frequent_itemsets_list)):
        for j in range(i+1, len(frequent_itemsets_list)):
            itemset1 = frequent_itemsets_list[i]
            itemset2 = frequent_itemsets_list[j]
            # Ensure that both itemset1 and itemset2 are frozenset instances
            if isinstance(itemset1, frozenset) and isinstance(itemset2, frozenset):
                union_itemset = itemset1.union(itemset2)
                intersection_itemset = itemset1.intersection(itemset2)
                
                if len(union_itemset) == k and len(intersection_itemset) == k-2:
                    candidates.add(union_itemset)
    return candidates


# Step 3: Prune candidate k-itemsets that dont have all k-1 frequent subsets
def generate_pruned_candidates(candidates,frequent_itemsets, k):
    pruned_candidates = set()
    for candidate in candidates:
        is_frequent = True
        for subset in combinations(candidate, k - 1):
            subset_frozen = frozenset(subset)

            # as long as one is not frequent, break and continue
            if subset_frozen not in frequent_itemsets:
                is_frequent = False
                break

        # if all subsets are frequent, add to pruned
        if is_frequent:
            pruned_candidates.add(candidate)
    return pruned_candidates

def get_candidate_counts(chunks_paths,candidates):
    candidate_counts = defaultdict(int)
    for path in chunks_paths:
        chunk = pd.read_csv(path, index_col=0)
        transactions = preprocess_chunk(chunk)

        for transaction in transactions:
            for candidate in candidates:
                if candidate.issubset(transaction):
                    candidate_counts[candidate] += 1
    return candidate_counts

def get_frequent_itemsets_from_candidates(candidate_counts, min_support, total_transactions):
    frequent_itemsets = set()
    for candidate, count in candidate_counts.items():
        if (count / total_transactions) >= min_support:
            frequent_itemsets.add(candidate)
    return frequent_itemsets

In [9]:
def apriori_algorithm(files, min_support):
    # Step 1: Process all chunks and get global counts and total transactions
    global_item_counts = defaultdict(int)
    total_transactions = 0
    for file_path in files:
        chunk = pd.read_csv(file_path, index_col=0)
        transactions = preprocess_chunk(chunk)
        local_item_counts = count_support(transactions)
        for item, count in local_item_counts.items():
            global_item_counts[item] += count
        total_transactions += len(chunk)

    # Step 2: Find frequent 1-itemsets
    frequent_itemsets = get_frequent_itemsets(global_item_counts, min_support, total_transactions)
    all_frequent_itemsets = {1: frequent_itemsets}

    # Step 3: Iteratively find all frequent itemsets
    k = 2
    while True:
        # Generate candidate k-itemsets
        candidates = generate_candidates(all_frequent_itemsets[k-1], k)
        # Prune candidate k-itemsets
        pruned_candidates = generate_pruned_candidates(candidates, all_frequent_itemsets[k-1], k)
        # Count the support of each pruned candidate itemset across all files
        candidate_counts = defaultdict(int)
        for file_path in files:
            chunk = pd.read_csv(file_path, index_col=0)
            transactions = preprocess_chunk(chunk)
            for candidate in pruned_candidates:
                if candidate.issubset(transactions):
                    candidate_counts[candidate] += 1
        # Identify the frequent k-itemsets
        frequent_k_itemsets = get_frequent_itemsets_from_candidates(candidate_counts, min_support, total_transactions)
        if frequent_k_itemsets:
            all_frequent_itemsets[k] = frequent_k_itemsets
        else:
            break
        k += 1

    return all_frequent_itemsets

# List of file paths
files = [f"Dataset/job_posting_chunks/jpc{i}.csv" for i in range(0, 33)]  # Adjust the pattern as needed

# Run the Apriori algorithm
min_support = 0.00001  # Adjust the minimum support threshold as needed
frequent_itemsets = apriori_algorithm(files, min_support)

# Display the frequent itemsets
print(frequent_itemsets)

{1: {frozenset['aerospace'], frozenset['accountant'], frozenset['interior'], frozenset['process'], frozenset['administrative'], frozenset['architectural'], frozenset['agent'], frozenset['consultant'], frozenset['family'], frozenset['director'], frozenset['associate'], frozenset['chemical'], frozenset['business'], frozenset['assistant'], frozenset['mechanical'], frozenset['pediatrician'], frozenset['qa'], frozenset['hr'], frozenset['speech'], frozenset['assurance'], frozenset['chain'], frozenset['researcher'], frozenset['registered'], frozenset['manager'], frozenset['structural'], frozenset['advisor'], frozenset['counselor'], frozenset['technician'], frozenset['customer'], frozenset['finance'], frozenset['media'], frozenset['inventory'], frozenset['pharmaceutical'], frozenset['civil'], frozenset['ux'], frozenset['coordinator'], frozenset['generalist'], frozenset['representative'], frozenset['product'], frozenset['counsel'], frozenset['relations'], frozenset['social'], frozenset['designe

In [10]:
def print_frequent_itemsets(frequent_itemsets):
    count = 0
    for k, itemsets in frequent_itemsets.items():
        print(f"== Frequent {k}-itemsets ==")
        for itemset in itemsets:
            # Ensure all elements are strings, convert frozenset to a sorted list and then to a string for printing
            itemset_list = [str(item) for item in sorted(list(itemset))]
            itemset_str = ", ".join(itemset_list)
            print(f"{itemset_str}")
            count+=1
        print("")  # Print a newline for better readability between different levels
    return count
        
# Example usage:
# Assuming 'frequent_itemsets' is your dictionary of frequent itemsets
count = print_frequent_itemsets(frequent_itemsets)
print(count)


== Frequent 1-itemsets ==
*frozenset['aerospace']
*frozenset['accountant']
*frozenset['interior']
*frozenset['process']
*frozenset['administrative']
*frozenset['architectural']
*frozenset['agent']
*frozenset['consultant']
*frozenset['family']
*frozenset['director']
*frozenset['associate']
*frozenset['chemical']
*frozenset['business']
*frozenset['assistant']
*frozenset['mechanical']
*frozenset['pediatrician']
*frozenset['qa']
*frozenset['hr']
*frozenset['speech']
*frozenset['assurance']
*frozenset['chain']
*frozenset['researcher']
*frozenset['registered']
*frozenset['manager']
*frozenset['structural']
*frozenset['advisor']
*frozenset['counselor']
*frozenset['technician']
*frozenset['customer']
*frozenset['finance']
*frozenset['media']
*frozenset['inventory']
*frozenset['pharmaceutical']
*frozenset['civil']
*frozenset['ux']
*frozenset['coordinator']
*frozenset['generalist']
*frozenset['representative']
*frozenset['product']
*frozenset['counsel']
*frozenset['relations']
*frozenset['social