# 1. Extract user IDs and occupations

In [118]:
import pandas as pd

users_path = "../data/raw/jobs-users"

# Space-separated file without a header row; the two columns are named here.
users_df = pd.read_csv(
    users_path,
    sep=' ',
    header=None,
    names=['user_id', 'occupation_code'],
)

# Keep only the leading digit of each occupation code (the major category).
leading_digit = users_df['occupation_code'].astype(str).str[0]
users_df['occupation_code'] = leading_digit.astype(int)

In [119]:
categories_path = "../data/raw/major_categories"

# Colon-separated lookup file without a header row: occupation code -> category name.
major_categories_df = pd.read_csv(categories_path, sep=':', header=None, names=['occupation_code', 'category'])

# Attach the category name to every user.
# - A 'category' column left over from a previous run of this cell is dropped
#   first; otherwise re-running the cell would produce 'category_x' and
#   'category_y' instead of a single 'category' column.
# - validate='many_to_one' makes pandas raise if an occupation code appears
#   more than once in the lookup table, which would silently duplicate users.
users_df = pd.merge(
    users_df.drop(columns=['category'], errors='ignore'),
    major_categories_df,
    on='occupation_code',
    how='left',
    validate='many_to_one',
)

In [120]:
# Report the row count, then show the first rows as the cell's rich output.
print(f"Successfully loaded and processed {users_df.shape[0]} users.")
users_df.head()

Successfully loaded and processed 5191 users.


Unnamed: 0,user_id,occupation_code,category
0,206749819,3,Associate Professional and Technical Occupations
1,706405455,8,"Process, Plant and Machine Operatives"
2,185699053,2,Professional Occupations
3,1134832688,2,Professional Occupations
4,36040783,2,Professional Occupations


In [121]:
# Number of users per major occupation group, most frequent first.
major_group_counts = users_df['occupation_code'].value_counts()
print("\nValue counts for each major group:")
print(major_group_counts)


Value counts for each major group:
occupation_code
2    1794
3    1073
5     871
1     504
6     318
8     218
4     198
9     147
7      68
Name: count, dtype: int64


# 2. Load user unigrams

In [122]:
def load_dictionary(filepath):
    """Read a word-id -> word mapping from a whitespace-separated text file.

    Each line is split on whitespace; only lines that yield exactly two
    fields are used, the first as the key and the second as the value. All
    other lines (blank, one field, three or more fields) are skipped. When a
    key occurs more than once, the last occurrence wins.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        dict mapping the first field (str) to the second field (str).
    """
    print(f"Loading dictionary from {filepath}...")
    with open(filepath, 'r', encoding='utf-8') as handle:
        rows = (raw.strip().split() for raw in handle)
        word_dict = {fields[0]: fields[1] for fields in rows if len(fields) == 2}
    print(f"Dictionary loaded with {len(word_dict)} words.")
    return word_dict

def load_unigrams(filepath):
    """Read per-user unigram features from a whitespace-separated text file.

    The first field of each line is used as the user id; all remaining
    fields are re-joined with single spaces and stored as one string
    (presumably ``word_id:frequency`` tokens -- confirm against the data
    file). Lines with fewer than two fields are skipped. When a user id
    occurs more than once, the last occurrence wins.

    Args:
        filepath: Path of the UTF-8 text file to read.

    Returns:
        dict mapping user id (str) to its space-joined feature string.
    """
    print(f"Loading user unigrams from {filepath}...")
    unigram_dict = {}
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw in handle:
            # str.split() with no argument already ignores leading/trailing whitespace.
            fields = raw.split()
            if len(fields) < 2:
                continue
            unigram_dict[fields[0]] = ' '.join(fields[1:])
    print(f"Unigrams loaded for {len(unigram_dict)} users.")
    return unigram_dict

In [123]:
# Read both lookup files into in-memory dicts a single time; later cells reuse them.
dictionary_path = '../data/raw/dictionary'
unigrams_path = '../data/raw/jobs-unigrams'

word_dictionary = load_dictionary(dictionary_path)
user_unigrams = load_unigrams(unigrams_path)

Loading dictionary from ../data/raw/dictionary...
Dictionary loaded with 71555 words.
Loading user unigrams from ../data/raw/jobs-unigrams...
Unigrams loaded for 5189 users.


In [124]:
from nltk.corpus import stopwords
import nltk

def get_user_word_array(user_id, unigrams, word_dict, stopwords=None, min_word_length=3):
    """Expand a user's ``word_id:frequency`` tokens into a flat list of words.

    Every token of the user's feature string is parsed as ``word_id:frequency``.
    The word id is looked up in ``word_dict`` and, if the word passes the
    filters below, it is appended ``frequency`` times to the result.

    A word is kept only if it is purely alphabetic, has at least
    ``min_word_length`` characters and is not in ``stopwords``.

    Args:
        user_id: User identifier; converted with ``str()`` before the lookup
            because the keys of ``unigrams`` are strings.
        unigrams: dict mapping user id (str) to a space-separated string of
            ``word_id:frequency`` tokens.
        word_dict: dict mapping word id (str) to word (str).
        stopwords: Optional container of words to drop. When ``None`` (the
            default) the module-level ``STOPWORDS`` set is used, so existing
            three-argument calls behave as before.
        min_word_length: Minimum number of characters a word must have to be
            kept. The default of 3 matches the previous ``len(word) > 2`` rule.

    Returns:
        list of str with each kept word repeated by its frequency, in token
        order. Empty if the user has no entry in ``unigrams``.
    """
    features_str = unigrams.get(str(user_id))
    if features_str is None:
        return []  # no unigram data for this user

    if stopwords is None:
        # Resolved at call time, so STOPWORDS may be defined after this function.
        stopwords = STOPWORDS

    aggregated_words = []
    for token in features_str.split():
        # Only the parsing can raise ValueError: a token without exactly one
        # ':' fails the unpacking, a non-integer frequency fails int().
        try:
            word_id, frequency_str = token.split(':')
            frequency = int(frequency_str)
        except ValueError:
            continue  # skip malformed tokens such as 'id:freq:extra'

        word = word_dict.get(word_id)
        if word is None:
            continue  # word id missing from the dictionary

        if word.isalpha() and len(word) >= min_word_length and word not in stopwords:
            aggregated_words.extend([word] * frequency)

    return aggregated_words

# Make sure the NLTK stopword corpus is available locally, then keep the
# English list as a set; get_user_word_array tests membership against it.
nltk.download('stopwords')
english_stopwords = stopwords.words('english')
STOPWORDS = set(english_stopwords)
print(f"Loaded {len(STOPWORDS)} stopwords from NLTK.")

Loaded 198 stopwords from NLTK.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [125]:
from tqdm.auto import tqdm

print("\nProcessing users and creating the aggregated word column...")

# Registers DataFrame/Series.progress_apply with a labelled progress bar.
tqdm.pandas(desc="Processing Users")


def _words_for_user(uid):
    """Return the aggregated word list for one user id using the loaded lookups."""
    return get_user_word_array(uid, user_unigrams, word_dictionary)


# One word list per user; users with no entry in user_unigrams get an empty list.
users_df['aggregated_words'] = users_df['user_id'].progress_apply(_words_for_user)

print("Processing complete.")


Processing users and creating the aggregated word column...


Processing Users:   0%|          | 0/5191 [00:00<?, ?it/s]

Processing complete.


In [126]:
print("\nVerifying the updated DataFrame:")
# Select the example user's word list with a boolean mask.
is_example_user = users_df['user_id'] == 206749819
print(users_df.loc[is_example_user, 'aggregated_words'].values)


Verifying the updated DataFrame:
[list(['able', 'absolute', 'absolutely', 'access', 'access', 'access', 'access', 'access', 'accessed', 'acclaimed', 'account', 'act', 'action', 'addition', 'adjust', 'advantage', 'advertisement', 'advertising', 'advertising', 'advice', 'affiliate', 'affiliate', 'ahead', 'aimlessly', 'allows', 'allows', 'allows', 'allows', 'allows', 'allows', 'already', 'already', 'already', 'also', 'alternative', 'always', 'amazing', 'amazing', 'amazing', 'amazing', 'amazing', 'ammo', 'anded', 'announce', 'another', 'anybody', 'anyone', 'anyone', 'anytime', 'anytime', 'anywhere', 'anywhere', 'anywhere', 'anywhere', 'appointment', 'appropriate', 'area', 'area', 'arrival', 'ask', 'assistance', 'assured', 'available', 'available', 'available', 'available', 'available', 'available', 'available', 'available', 'available', 'available', 'available', 'awaits', 'awaits', 'away', 'awesome', 'back', 'back', 'back', 'backed', 'backed', 'backed', 'backed', 'backing', 'backlit', 'ba

In [127]:
# Let's inspect a specific user's result
print("\n--- for user 206749819 ---")
example_words = users_df[users_df['user_id'] == 206749819]['aggregated_words'].values[0]
print(f"Total words: {len(example_words)}") 
print(f"Unique words: {len(set(example_words))}")


--- for user 206749819 ---
Total words: 2361
Unique words: 865


# 3. Split the dataset at user level


In [128]:
from sklearn.model_selection import train_test_split

X = users_df[['user_id']]
y = users_df['occupation_code']

# Single user-level split: 80% train / 20% test (test_size=0.2), stratified on
# the major occupation group so both sets keep the same class proportions.
# The full users_df is split (not X), so train_df/test_df carry every column.
train_df, test_df, y_train, y_test = train_test_split(
    users_df, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity checks: every user lands in exactly one of the two sets.
assert len(train_df) + len(test_df) == len(users_df)
assert set(train_df.index).isdisjoint(test_df.index)

print("\nTraining set class distribution:")
print(train_df['occupation_code'].value_counts())
print("\nTest set class distribution:")
print(test_df['occupation_code'].value_counts())


Training set class distribution:
occupation_code
2    1435
3     858
5     697
1     403
6     254
8     174
4     158
9     118
7      55
Name: count, dtype: int64

Test set class distribution:
occupation_code
2    359
3    215
5    174
1    101
6     64
8     44
4     40
9     29
7     13
Name: count, dtype: int64


In [129]:
# (rows, columns) of each split.
train_shape = train_df.shape
test_shape = test_df.shape
print(f"\nFinal training set size: {train_shape}")
print(f"Final test set size: {test_shape}")


Final training set size: (4152, 4)
Final test set size: (1039, 4)


# 4. Output preprocessed data to files

In [130]:
from pathlib import Path

print("\nSaving data as pickle files...")

# Output locations for the two splits (pickle keeps the list-valued column intact).
train_path = '../data/processed/train.pkl'
test_path = '../data/processed/test.pkl'

# to_pickle does not create missing parent directories, so make sure the
# output folder exists; on a fresh checkout it may not.
for out_path in (train_path, test_path):
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

# Save the training set using .to_pickle()
train_df.to_pickle(train_path)
print(f"Training set with {len(train_df)} rows saved to {train_path}")

# Save the test set
test_df.to_pickle(test_path)
print(f"Test set with {len(test_df)} rows saved to {test_path}")

print("\nAll files saved successfully.")


Saving data as pickle files...
Training set with 4152 rows saved to ../data/processed/train.pkl
Test set with 1039 rows saved to ../data/processed/test.pkl

All files saved successfully.
