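# 03_tfidf

Builds TF-IDF features from the concatenated user and review text, reduces each to 100 components with FastICA, and saves the results to `preprocessed/user_tfidf_ica.csv` and `preprocessed/review_tfidf_ica.csv`.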

In [1]:
import random
import os
import torch
import numpy as np
import polars as pl
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import FastICA
from scipy.sparse import vstack
from tqdm.notebook import tqdm
import joblib
from joblib import Parallel, delayed
import contextlib

In [2]:
# Global random seed; passed to seed_everything() and used as FastICA's random_state.
seed = 71

In [3]:
# Apply one seed to every random number generator used in this notebook
def seed_everything(seed: int) -> None:
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN behaviour.

    Args:
        seed: Integer seed applied to every generator.

    Returns:
        None. The function only mutates global RNG state, the
        ``PYTHONHASHSEED`` environment variable and the cuDNN backend flags.
    """
    random.seed(seed)
    # Setting PYTHONHASHSEED at runtime does not change hashing in the running
    # interpreter; it is only picked up by interpreters started afterwards.
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible CUDA device, not only the current one
    # (the call is a no-op when CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run
    # to run, which defeats the deterministic flag above, so it must be off.
    torch.backends.cudnn.benchmark = False

# Seed all generators once, using the notebook-wide seed.
seed_everything(seed)

## Prepare data

In [4]:
# Load the dataset: user tables are read from rectour24/, review tables from preprocessed/.
data_dir = "../../accommodation-reviews/"

# user tables, one per split
train_users = pl.read_csv(data_dir + "rectour24/train_users.csv")
valid_users = pl.read_csv(data_dir + "rectour24/val_users.csv")
test_users = pl.read_csv(data_dir + "rectour24/test_users.csv")

# review tables, one per split
train_reviews = pl.read_csv(data_dir + "preprocessed/train_reviews.csv")
valid_reviews = pl.read_csv(data_dir + "preprocessed/val_reviews.csv")
test_reviews = pl.read_csv(data_dir + "preprocessed/test_reviews.csv")

In [5]:
def make_reviews_df(reviews):
    """Build one tagged text document per review.

    Each document has the form
    ``<review_title>: ...\\n<review_positive>: ...\\n<review_negative>: ...\\n<review_score>: ...``.

    Args:
        reviews: polars DataFrame containing ``review_id``, ``accommodation_id``,
            ``review_title``, ``review_positive``, ``review_negative`` and
            ``review_score``.

    Returns:
        polars DataFrame with the columns ``review_id``, ``accommodation_id``
        and ``concat_review``.
    """
    labelled_columns = [
        ('<review_title>: ', 'review_title'),
        ('\n<review_positive>: ', 'review_positive'),
        ('\n<review_negative>: ', 'review_negative'),
        ('\n<review_score>: ', 'review_score'),
    ]

    parts = []
    for label, column in labelled_columns:
        parts.append(pl.lit(label))
        # DataFrame.fill_null('<UNK>') only fills string-typed columns, and
        # concat_str propagates nulls by default. Without the explicit
        # cast + fill, one missing numeric value (e.g. review_score) would
        # turn the whole document into null.
        parts.append(pl.col(column).cast(pl.String).fill_null('<UNK>'))

    return (
        reviews
        .fill_null('<UNK>')
        .with_columns(concat_review=pl.concat_str(parts))
        .select('review_id', 'accommodation_id', 'concat_review')
    )

def make_users_df(users):
    """Build one tagged text document per user/accommodation row.

    Every feature is rendered as ``<tag>: value`` and the pieces are joined
    with newlines, in the order listed in ``labelled_columns`` below.

    Args:
        users: polars DataFrame containing ``user_id``, ``accommodation_id``
            and every feature column named in ``labelled_columns``.

    Returns:
        polars DataFrame with the columns ``user_id``, ``accommodation_id``
        and ``concat_features``.
    """
    labelled_columns = [
        ('<guest_type>: ', 'guest_type'),
        ('\n<guest_country>: ', 'guest_country'),
        ('\n<number_of_nights>: ', 'room_nights'),
        ('\n<check-in_months>: ', 'month'),
        ('\n<accommodation_type>: ', 'accommodation_type'),
        ('\n<accommodation_country>: ', 'accommodation_country'),
        ('\n<accommodation_star_rating>: ', 'accommodation_star_rating'),
        ('\n<accommodation_score>: ', 'accommodation_score'),
        ('\n<location_is_ski>: ', 'location_is_ski'),
        ('\n<location_is_beach>: ', 'location_is_beach'),
        ('\n<location_is_city_center>: ', 'location_is_city_center'),
    ]

    parts = []
    for label, column in labelled_columns:
        parts.append(pl.lit(label))
        # DataFrame.fill_null('<UNK>') only fills string-typed columns, and
        # concat_str propagates nulls by default. Without the explicit
        # cast + fill, one missing numeric/boolean value would turn the whole
        # document into null.
        parts.append(pl.col(column).cast(pl.String).fill_null('<UNK>'))

    return (
        users
        .fill_null('<UNK>')
        .with_columns(concat_features=pl.concat_str(parts))
        .select('user_id', 'accommodation_id', 'concat_features')
    )

In [6]:
# Stack train / valid / test (in that order) into one frame per entity and add
# an `index` row counter, which is used later to recover the individual splits.

# user
user_text = pl.concat(
    [make_users_df(split) for split in (train_users, valid_users, test_users)]
).with_row_index()

# review
review_text = pl.concat(
    [make_reviews_df(split) for split in (train_reviews, valid_reviews, test_reviews)]
).with_row_index()

In [7]:
# Row counts per split; together with the `index` column they define the
# boundaries of train / valid / test inside the stacked frames.
len_train_users, len_valid_users, len_test_users = (
    len(split) for split in (train_users, valid_users, test_users)
)

len_train_reviews, len_valid_reviews, len_test_reviews = (
    len(split) for split in (train_reviews, valid_reviews, test_reviews)
)

In [10]:
# Sanity check: each stacked frame must have exactly as many rows as its three splits combined.
len(user_text) == len_train_users + len_valid_users + len_test_users, len(review_text) == len_train_reviews + len_valid_reviews + len_test_reviews

(True, True)

## TF-IDF

### user

In [11]:
# First pass over all splits: used only to collect the full vocabulary.
vectorizer = TfidfVectorizer()
vectorizer.fit(user_text.get_column('concat_features'))

In [12]:
# Keep the vocabulary learned from all splits and report its size
all_voc = vectorizer.vocabulary_
vocabulary_size = len(all_voc)
print(vocabulary_size)

576


In [15]:
# Second pass: the vocabulary is fixed to the one collected from all splits,
# but the IDF weights are fitted on the training rows only.
vectorizer = TfidfVectorizer(vocabulary=all_voc)

row_id = pl.col('index')
user_valid_end = len_train_users + len_valid_users

train_user_docs = user_text.filter(row_id < len_train_users).get_column('concat_features')
valid_user_docs = user_text.filter((row_id >= len_train_users) & (row_id < user_valid_end)).get_column('concat_features')
test_user_docs = user_text.filter(row_id >= user_valid_end).get_column('concat_features')

vectorizer.fit(train_user_docs)

train_user_features = vectorizer.transform(train_user_docs)
valid_user_features = vectorizer.transform(valid_user_docs)
test_user_features = vectorizer.transform(test_user_docs)

In [16]:
# Stack the sparse splits first and densify once. Densifying each split and
# then concatenating keeps all three dense copies alive next to the final
# array, roughly doubling peak memory for the same result.
user_dense_matrix = vstack([train_user_features, valid_user_features, test_user_features]).toarray()

In [17]:
# (rows across all splits, vocabulary size)
user_dense_matrix.shape

(2031914, 576)

In [27]:
# ICA: compress the dense user TF-IDF matrix to a fixed number of components
n_user_components = 100
ica = FastICA(n_components=n_user_components, random_state=seed)
user_text_features = ica.fit_transform(user_dense_matrix)

In [28]:
# (rows across all splits, number of ICA components)
user_text_features.shape

(2031914, 100)

In [30]:
# Attach the ICA components as columns user_text_tfidf_0 .. user_text_tfidf_{k-1}.
# The column count is taken from the array itself instead of a hard-coded 100,
# so it cannot drift out of sync with the n_components used for FastICA.
user_tfidf_columns = pl.DataFrame(
    user_text_features,
    schema={f'user_text_tfidf_{i}': pl.Float64 for i in range(user_text_features.shape[1])},
)
user_text = pl.concat([user_text, user_tfidf_columns], how='horizontal')

In [32]:
# The raw text and the row counter are no longer needed once the features are attached.
user_text = user_text.drop(['concat_features', 'index'])

In [39]:
# Total number of nulls over all columns (per-column null counts summed horizontally).
user_text.null_count().sum_horizontal()

sum
u32
0


In [45]:
# save the ids together with the ICA-compressed TF-IDF user features
user_output_path = data_dir + "preprocessed/user_tfidf_ica.csv"
user_text.write_csv(user_output_path)

### reviews

In [8]:
# First pass over all splits: used only to collect the full vocabulary.
vectorizer = TfidfVectorizer()
vectorizer.fit(review_text.get_column('concat_review'))

In [9]:
# Keep the vocabulary learned from all splits and report its size
all_voc = vectorizer.vocabulary_
vocabulary_size = len(all_voc)
print(vocabulary_size)

238778


In [10]:
# Second pass: the vocabulary is fixed to the one collected from all splits,
# but the IDF weights are fitted on the training rows only.
vectorizer = TfidfVectorizer(vocabulary=all_voc)

row_id = pl.col('index')
review_valid_end = len_train_reviews + len_valid_reviews

train_review_docs = review_text.filter(row_id < len_train_reviews).get_column('concat_review')
valid_review_docs = review_text.filter((row_id >= len_train_reviews) & (row_id < review_valid_end)).get_column('concat_review')
test_review_docs = review_text.filter(row_id >= review_valid_end).get_column('concat_review')

vectorizer.fit(train_review_docs)

train_review_features = vectorizer.transform(train_review_docs)
valid_review_features = vectorizer.transform(valid_review_docs)
test_review_features = vectorizer.transform(test_review_docs)



In [11]:
# Per-split shapes: (rows in the split, vocabulary size)
train_review_features.shape, valid_review_features.shape, test_review_features.shape

((1628989, 238778), (203787, 238778), (199138, 238778))

In [12]:
# Stack the splits back together in train / valid / test order, keeping the matrix sparse.
review_feature_blocks = [train_review_features, valid_review_features, test_review_features]
review_sparse_matrix = vstack(review_feature_blocks)

In [13]:
# Helper that connects joblib's batch-completion callback to a tqdm progress bar
@contextlib.contextmanager
def tqdm_joblib(tqdm_object):
    """Temporarily patch joblib so finished batches advance ``tqdm_object``.

    While the context is active, ``joblib.parallel.BatchCompletionCallBack`` is
    replaced by a subclass that calls ``tqdm_object.update`` with the size of
    each completed batch before delegating to the original callback. On exit
    the original callback class is restored and the progress bar is closed.

    Yields:
        The same ``tqdm_object`` that was passed in.
    """
    original_callback = joblib.parallel.BatchCompletionCallBack

    class _ProgressReportingCallback(original_callback):
        def __call__(self, *args, **kwargs):
            # Advance the bar first, then let joblib do its normal bookkeeping.
            tqdm_object.update(n=self.batch_size)
            return super().__call__(*args, **kwargs)

    joblib.parallel.BatchCompletionCallBack = _ProgressReportingCallback
    try:
        yield tqdm_object
    finally:
        # Always undo the patch, even if the parallel run raised.
        joblib.parallel.BatchCompletionCallBack = original_callback
        tqdm_object.close()

In [14]:
# Fit ICA on a smaller subset of data first. The subset is the first
# `subset_size` rows of the stacked matrix and is densified with .toarray(),
# so adjust the size depending on available memory.
subset_size = 5000
subset = review_sparse_matrix[:subset_size].toarray()

# The estimator is built once (the original cell constructed an identical
# FastICA twice and discarded the first instance).
ica = FastICA(n_components=100, random_state=seed)
ica.fit(subset)

print("ICA fitted")


def transform_batch(start, end):
    """Densify rows ``start:end`` of the review TF-IDF matrix and project them with the fitted ICA."""
    return ica.transform(review_sparse_matrix[start:end].toarray())


# Process batches in parallel; only `batch_size` rows are densified per task.
batch_size = 1000
n_rows = len(review_text)
num_batches = -(-n_rows // batch_size)  # ceiling division

with tqdm_joblib(tqdm(desc="My calculation", total=num_batches)) as progress_bar:
    review_text_features = np.concatenate(
        Parallel(n_jobs=-1)(
            delayed(transform_batch)(start, min(start + batch_size, n_rows))
            for start in range(0, n_rows, batch_size)
        ),
        axis=0
    )

ICA fitted


My calculation:   0%|          | 0/2032 [00:00<?, ?it/s]

In [15]:
# (rows across all splits, number of ICA components)
review_text_features.shape

(2031914, 100)

In [16]:
# Attach the ICA components as columns review_text_tfidf_0 .. review_text_tfidf_{k-1},
# then drop the raw text and the row counter. The column count is taken from
# the array itself instead of a hard-coded 100, so it cannot drift out of sync
# with the n_components used for FastICA.
review_tfidf_columns = pl.DataFrame(
    review_text_features,
    schema={f'review_text_tfidf_{i}': pl.Float64 for i in range(review_text_features.shape[1])},
)
review_text = (
    pl.concat([review_text, review_tfidf_columns], how='horizontal')
    .drop('concat_review', 'index')
)

In [17]:
# Total number of nulls over all columns (per-column null counts summed horizontally).
review_text.null_count().sum_horizontal()

sum
u32
0


In [19]:
# save the ids together with the ICA-compressed TF-IDF review features
review_output_path = data_dir + "preprocessed/review_tfidf_ica.csv"
review_text.write_csv(review_output_path)