### 1. Load the GloVe pre-trained embeddings

In [32]:
import os
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd

def load_glove_model(glove_file):
    """
    Loads the GloVe word vectors from a specified file into a dictionary.
    """
    print(f"Loading GloVe model from {glove_file}...")
    model = {}
    
    file_size = os.path.getsize(glove_file)
    
    with open(glove_file, 'r', encoding="utf-8") as f:
        with tqdm(total=file_size, unit='B', unit_scale=True, desc=f"Loading {os.path.basename(glove_file)}") as pbar:
            for line in f:
                split_line = line.split()
                word = split_line[0]
                # The rest of the line is the vector
                embedding = np.array([float(val) for val in split_line[1:]])
                model[word] = embedding
                
                pbar.update(len(line.encode('utf-8')))
                
    print(f"Loading complete. Total of {len(model)} words loaded!")
    return model

In [33]:
glove_path = "../data/GloVe/glove.twitter.27B.50d.txt"

# Load the 50-dimensional Twitter GloVe model
glove_twitter_model = load_glove_model(glove_path)
embedding_dim = 50

Loading GloVe model from ../data/GloVe/glove.twitter.27B.50d.txt...


Loading glove.twitter.27B.50d.txt:   0%|          | 0.00/511M [00:00<?, ?B/s]

Loading complete. Total of 1193514 words loaded!


### 2. Load the preprocessed data

In [34]:
print("Loading data from CSV files...")

train_path = '../data/processed/train.pkl'
test_path = '../data/processed/test.pkl'

# Load the training set
train_df = pd.read_pickle(train_path)
print(f"Loaded 'train.csv' with {len(train_df)} rows.")

# Load the test set
test_df = pd.read_pickle(test_path)
print(f"Loaded 'test.csv' with {len(test_df)} rows.")

# --- Verify one of the loaded DataFrames ---
print("\nVerifying the first 5 rows of the loaded training data:")
train_df.head()

Loading data from CSV files...
Loaded 'train.csv' with 4152 rows.
Loaded 'test.csv' with 1039 rows.

Verifying the first 5 rows of the loaded training data:


Unnamed: 0,user_id,occupation_code,category,aggregated_words
5127,265383481,8,"Process, Plant and Machine Operatives","[abandoned, abilities, able, able, able, able,..."
3607,22364420,5,Skilled Trades Occupations,"[abandoned, abiding, ability, ability, able, a..."
1689,16797684,2,Professional Occupations,"[ability, ability, able, able, absolutely, acc..."
4942,14871013,3,Associate Professional and Technical Occupations,"[abandon, abilities, ability, ability, able, a..."
4317,75687820,6,"Caring, Leisure and Other Service Occupations","[ability, ability, ability, absolutely, abuse,..."


### 3. Extract features using GloVe embeddings and save to output

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf(df):
    print("--- Starting TF-IDF Weighted Feature Extraction ---")
    
    df_copy = df.copy()
        
    # Step 1: Convert word lists to single strings for the vectorizer
    # The TfidfVectorizer expects documents as strings, not lists of words.
    df_copy['aggregated_text'] = df_copy['aggregated_words'].str.join(' ')
    
    # Step 2: Fit TfidfVectorizer on your text documents
    # IMPORTANT: In your real project, you MUST fit this on your TRAINING data only.
    print("Fitting TfidfVectorizer...")
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_vectorizer.fit(df_copy['aggregated_text'])
    
    # Create a dictionary mapping words to their IDF scores
    word_idf_weights = dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))
    print("TfidfVectorizer fit complete.")
    
    return word_idf_weights

In [36]:
def create_tfidf_weighted_vector(word_list, model, idf_weights, embed_dim=50):
        """Calculates the TF-IDF weighted mean vector for a list of words."""
        vectors = []
        weights = []

        for word in word_list:
            if word in model and word in idf_weights:
                vectors.append(model[word])
                # The weight is the word's IDF score
                weights.append(idf_weights[word])

        if not vectors:
            return np.zeros(embed_dim)

        # Calculate the weighted average of the vectors
        weighted_mean_vector = np.average(vectors, axis=0, weights=weights)
        
        return weighted_mean_vector

In [37]:
train_path_pkl = '../data/extracted/train.pkl'
test_path_pkl = '../data/extracted/test.pkl'

print("\nCreating TF-IDF weighted feature vectors...")

for df, save_path in zip([train_df, test_df], [train_path_pkl, test_path_pkl]):
    print(save_path)
    tqdm.pandas(desc="Creating TF-IDF Vectors")
    
    word_idf_weights = tf_idf(df)

    df['fv'] = df['aggregated_words'].progress_apply(
        lambda words: create_tfidf_weighted_vector(words, glove_twitter_model, word_idf_weights, embedding_dim))
        
    print("\nSaving data using Pickle...")
    df.to_pickle(save_path)
    print(f"Dataset with {len(df)} rows saved to {save_path}")
    
print("Processing complete.")


Creating TF-IDF weighted feature vectors...
../data/extracted/train.pkl
--- Starting TF-IDF Weighted Feature Extraction ---
Fitting TfidfVectorizer...
TfidfVectorizer fit complete.


Creating TF-IDF Vectors:   0%|          | 0/4152 [00:00<?, ?it/s]


Saving data using Pickle...
Dataset with 4152 rows saved to ../data/extracted/train.pkl
../data/extracted/test.pkl
--- Starting TF-IDF Weighted Feature Extraction ---
Fitting TfidfVectorizer...
TfidfVectorizer fit complete.


Creating TF-IDF Vectors:   0%|          | 0/1039 [00:00<?, ?it/s]


Saving data using Pickle...
Dataset with 1039 rows saved to ../data/extracted/test.pkl
Processing complete.
