In [1]:
%%writefile cluster.py
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
#from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
#from sklearn.neighbors import NearestNeighbors
#%env TOKENIZERS_PARALLELISM=false
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from pathlib import Path
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Static settings for the retrieval (nearest-neighbour) stage.

    The class body runs at import time, so both tokenizers are loaded from
    disk as soon as this class is defined.
    """
    # Not referenced by any function in this script.
    print_freq = 500
    # Worker processes used by each DataLoader built in get_neighbors().
    num_workers = 4
    # Checkpoint directory handed to AutoConfig / AutoModel / AutoTokenizer.from_pretrained.
    uns_model = "/kaggle/input/lecr-trained-models5/LECR_P12/xlm-roberta-large-exp_fold2_epochs20-20230314T012337Z-002/xlm-roberta-large-exp_fold2_epochs20/"
    # The second-stage backbone is the same checkpoint directory.
    sup_model = uns_model
    # Not referenced in this script; presumably the fine-tuned classifier weights for the next stage.
    sup_model_tuned = "/kaggle/input/lecr-p12-cls-model/xlm_large_fold2_42_content_top15.pth"
    # Tokenizers are instantiated once here and shared through the class.
    uns_tokenizer = AutoTokenizer.from_pretrained(uns_model)
    sup_tokenizer = AutoTokenizer.from_pretrained(sup_model)
    # Not referenced in this script.
    gradient_checkpointing = False
    # Batch size of the embedding DataLoaders.
    batch_size = 8
    # Not referenced in this script.
    n_folds = 5
    # Number of nearest content items retrieved per topic.
    top_n = 15
    # Not referenced in this script.
    seed = 42
    # Not referenced in this script.
    threshold = 0.11
    # Presumably the token budgets for the two tokenizers - TODO confirm against the prepare_* helpers.
    uns_max_len = 128
    sup_max_len = 256
    
# =========================================================================================
# Data Loading
# =========================================================================================
def read_data():
    """Load the competition CSVs and build one text string per topic and per content item.

    Reads ``topics.csv``, ``content.csv`` and ``sample_submission.csv`` from the
    Kaggle input directory, keeps only the topics listed in the sample
    submission, and concatenates several fields into a single text
    representation for every remaining topic and for every content item.

    Returns:
        A ``(topics, content)`` tuple of DataFrames, each with the columns
        ``id``, ``title`` and ``language``. Note that ``title`` holds the full
        concatenated text representation, not just the original title.
    """
    class Topic:
        """Thin view over one row of ``topics_df`` (closed over from read_data)."""

        def __init__(self, topic_id):
            self.id = topic_id

        @property
        def parent(self):
            """Parent ``Topic``, or ``None`` when the row's ``parent`` cell is NaN."""
            parent_id = topics_df.loc[self.id].parent
            if pd.isna(parent_id):
                return None
            else:
                return Topic(parent_id)

        @property
        def ancestors(self):
            """List of ancestors ordered from the direct parent up to the root."""
            ancestors = []
            parent = self.parent
            while parent is not None:
                ancestors.append(parent)
                parent = parent.parent
            return ancestors

        @property
        def siblings(self):
            """Other children of this topic's parent; empty list for a root topic."""
            if not self.parent:
                return []
            else:
                return [topic for topic in self.parent.children if topic != self]

        @property
        def content(self):
            """Content items correlated with this topic.

            NOTE(review): ``correlations_df`` is never assigned inside read_data,
            so this property presumably relies on a global defined elsewhere;
            it is not exercised by the loops below - TODO confirm.
            """
            if self.id in correlations_df.index:
                return [ContentItem(content_id) for content_id in correlations_df.loc[self.id].content_ids.split()]
            else:
                # ``has_content`` is resolved through __getattr__, i.e. read from the topics_df row.
                return tuple([]) if self.has_content else []

        def get_breadcrumbs(self, separator=" >> ", include_self=True, include_root=True):
            """Join the titles on the path to this topic, root first, with ``separator``."""
            ancestors = self.ancestors
            if include_self:
                ancestors = [self] + ancestors
            if not include_root:
                # The list is ordered self -> root, so the last element is the root.
                ancestors = ancestors[:-1]
            return separator.join(reversed([a.title for a in ancestors]))

        @property
        def children(self):
            """Topics whose ``parent`` column equals this topic's id (full scan of topics_df)."""
            return [Topic(child_id) for child_id in topics_df[topics_df.parent == self.id].index]

        def subtree_markdown(self, depth=0):
            """Render this topic, its descendants and their content as a nested markdown list."""
            markdown = "  " * depth + "- " + self.title + "\n"
            for child in self.children:
                markdown += child.subtree_markdown(depth=depth + 1)
            for content in self.content:
                markdown += ("  " * (depth + 1) + "- " + "[" + content.kind.title() + "] " + content.title) + "\n"
            return markdown

        # Defining __eq__ without __hash__ makes instances unhashable (Python sets __hash__ to None).
        def __eq__(self, other):
            if not isinstance(other, Topic):
                return False
            return self.id == other.id

        def __getattr__(self, name):
            # Only called when normal lookup fails: falls back to the column
            # ``name`` of this topic's row in topics_df.
            return topics_df.loc[self.id][name]

        def __str__(self):
            return self.title

        def __repr__(self):
            return f"<Topic(id={self.id}, title=\"{self.title}\")>"


    class ContentItem:
        """Thin view over one row of ``content_df`` (closed over from read_data)."""

        def __init__(self, content_id):
            self.id = content_id

        @property
        def topics(self):
            """Topics whose correlation string contains this content id (substring match).

            NOTE(review): depends on ``correlations_df``, which is not assigned
            inside read_data - TODO confirm where it is expected to come from.
            """
            return [Topic(topic_id) for topic_id in topics_df.loc[correlations_df[correlations_df.content_ids.str.contains(self.id)].index].index]

        def __getattr__(self, name):
            # Fallback attribute lookup: column ``name`` of this item's row in content_df.
            return content_df.loc[self.id][name]

        def __str__(self):
            return self.title

        def __repr__(self):
            return f"<ContentItem(id={self.id}, title=\"{self.title}\")>"

        def __eq__(self, other):
            if not isinstance(other, ContentItem):
                return False
            return self.id == other.id

        def get_all_breadcrumbs(self, separator=" >> ", include_root=True):
            """Return one breadcrumb string per topic this item belongs to, ending in the item title."""
            breadcrumbs = []
            for topic in self.topics:
                new_breadcrumb = topic.get_breadcrumbs(separator=separator, include_root=include_root)
                if new_breadcrumb:
                    new_breadcrumb = new_breadcrumb + separator + self.title
                else:
                    new_breadcrumb = self.title
                breadcrumbs.append(new_breadcrumb)
            return breadcrumbs
        
    data_dir = Path('/kaggle/input/learning-equality-curriculum-recommendations')
    # Only title/description are blanked for topics; other NaNs (e.g. parent) must survive for Topic.parent.
    topics_df = pd.read_csv(data_dir / "topics.csv").fillna({"title": "", "description": ""})
    # First CSV column becomes the index used by ContentItem lookups; every NaN becomes "".
    content_df = pd.read_csv(data_dir / "content.csv", index_col=0).fillna("")
    sample_submission = pd.read_csv(data_dir / 'sample_submission.csv')
    # Merge topics with sample submission to only infer test topics
    topics = topics_df.merge(sample_submission, how = 'inner', left_on = 'id', right_on = 'topic_id').set_index('id')
    # Topic looks rows up with .loc[topic_id], so the full table must be indexed by id as well.
    topics_df = topics_df.set_index('id')

    topic_id_texts = []
    content_id_texts = []
    for topic_idx in tqdm(topics.index):
        tmp_topic = Topic(topic_idx)
        children = tmp_topic.children
        # Only the first child's description is used as extra context.
        child = "" if len(children)==0 else children[0].description

        parent = tmp_topic.parent
        par = "" if parent is None else parent.description
        # Layout: "[language, level] title description breadcrumbs first-child-description parent-description"
        topic_repre = f"[{tmp_topic.language}, {tmp_topic.level}] {tmp_topic.title} {tmp_topic.description} {tmp_topic.get_breadcrumbs()} {child} {par}"
        topic_language = f"{tmp_topic.language}"
        topic_id_texts.append((topic_idx, topic_repre, topic_language))

    for content_idx in tqdm(content_df.index):
        ct = ContentItem(content_idx)
        # Layout: "title description text"
        content_repre = f"{ct.title} {ct.description} {ct.text}"
        content_language = f"{ct.language}"
        content_id_texts.append((content_idx, content_repre, content_language))

    # The 'title' column carries the concatenated representation built above.
    topics = pd.DataFrame(data={'id':[item[0] for item in topic_id_texts], 
                             'title':[item[1] for item in topic_id_texts],
                               'language':[item[2] for item in topic_id_texts],})
    
    content = pd.DataFrame(data={'id':[item[0] for item in content_id_texts], 
                                 'title':[item[1] for item in content_id_texts],
                                "language":[item[2] for item in content_id_texts],})

    # Drop the raw tables before returning to lower peak memory.
    del topics_df, content_df, sample_submission, topic_id_texts, content_id_texts
    gc.collect()

    print(' ')
    print('-' * 50)
    print(f"topics.shape: {topics.shape}")
    print(f"content.shape: {content.shape}")

    
    return topics, content

# =========================================================================================
# Prepare input, tokenize
# =========================================================================================

def prepare_uns_input(text, cfg):
    """Tokenize ``text`` for the retrieval encoder.

    Args:
        text: The string to encode.
        cfg: Settings object exposing ``uns_tokenizer`` and, optionally,
            ``uns_max_len`` (the truncation length; 128 when absent).

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor. No padding is applied here.
    """
    # Honour the configured length instead of a literal; the fallback keeps
    # configs that predate ``uns_max_len`` working with the old value.
    max_len = getattr(cfg, 'uns_max_len', 128)
    encoded = cfg.uns_tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
    )
    for key, value in encoded.items():
        encoded[key] = torch.tensor(value, dtype=torch.long)
    return encoded
# =========================================================================================
# Unsupervised dataset
# =========================================================================================
class uns_dataset(Dataset):
    """Map-style dataset that tokenizes the ``title`` column of a DataFrame on demand."""

    def __init__(self, df, cfg):
        # Keep only the raw text values; tokenization happens per item.
        self.texts = df['title'].values
        self.cfg = cfg

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized form of the text at position ``item``."""
        return prepare_uns_input(self.texts[item], self.cfg)
    
# =========================================================================================
# Prepare input, tokenize
# ========================================================================================
# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average token vectors over the sequence axis, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dimension 1."""
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
        weighted_total = (last_hidden_state * mask).sum(dim=1)
        # Clamp the divisor so a fully masked row yields zeros instead of NaN.
        divisor = mask.sum(dim=1).clamp(min=1e-9)
        return weighted_total / divisor

# =========================================================================================
# Unsupervised model
# =========================================================================================
class uns_model(nn.Module):
    """Transformer encoder followed by masked mean pooling; produces one vector per input."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        checkpoint_dir = cfg.uns_model
        self.config = AutoConfig.from_pretrained(checkpoint_dir)
        self.model = AutoModel.from_pretrained(checkpoint_dir, config = self.config)
        self.pool = MeanPooling()

    def feature(self, inputs):
        """Encode ``inputs`` and pool the last hidden state using its attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the pooled embedding; identical to :meth:`feature`."""
        return self.feature(inputs)
    
# =========================================================================================
# Get embeddings
# =========================================================================================
def get_embeddings(loader, model, device):
    """Run ``model`` over every batch of ``loader`` and return the stacked outputs.

    Args:
        loader: Iterable of dict-like batches whose values are tensors.
        model: Callable module that accepts one such batch.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        A NumPy array made by concatenating the per-batch outputs along axis 0.
    """
    model.eval()
    chunks = []
    for batch in tqdm(loader):
        # Move the batch in place so the same mapping object reaches the model.
        for name, tensor in batch.items():
            batch[name] = tensor.to(device)
        with torch.no_grad():
            embedded = model(batch)
        chunks.append(embedded.to('cpu').numpy())
    return np.concatenate(chunks)


# =========================================================================================
# Build our inference set
# =========================================================================================
def build_inference_set(topics, content, cfg):
    """Expand each topic's space-separated predictions into one row per (topic, content) pair.

    Args:
        topics: DataFrame with ``id``, ``language``, ``title`` and ``predictions`` columns.
        content: DataFrame indexed by content id with ``title`` and ``language`` columns.
        cfg: Unused; kept so the call signature stays the same.

    Returns:
        DataFrame with the columns ``topics_ids``, ``content_ids``, ``title1``,
        ``title2``, ``topic_language`` and ``content_language``.
    """
    # Column-wise accumulators, keyed by the output column name.
    pairs = {
        'topics_ids': [],
        'content_ids': [],
        'title1': [],
        'title2': [],
        'topic_language': [],
        'content_language': [],
    }
    topic_fields = zip(topics['id'], topics['language'], topics['title'], topics['predictions'])
    for topic_id, topic_lang, topic_title, joined_preds in tqdm(topic_fields, total = len(topics)):
        for candidate in joined_preds.split(' '):
            # Label-based lookups: ``content`` must be indexed by content id.
            candidate_title = content.loc[candidate, 'title']
            candidate_lang = content.loc[candidate, 'language']
            pairs['topics_ids'].append(topic_id)
            pairs['content_ids'].append(candidate)
            pairs['title1'].append(topic_title)
            pairs['title2'].append(candidate_title)
            pairs['topic_language'].append(topic_lang)
            pairs['content_language'].append(candidate_lang)
    test = pd.DataFrame(pairs)
    # Release the intermediate lists before returning.
    del pairs
    gc.collect()

    return test
    
# =========================================================================================
# Get neighbors
# =========================================================================================
def _make_uns_loader(df, cfg):
    """Build an unshuffled, dynamically padded DataLoader over ``df['title']``."""
    return DataLoader(
        uns_dataset(df, cfg),
        batch_size = cfg.batch_size,
        shuffle = False,
        collate_fn = DataCollatorWithPadding(tokenizer = cfg.uns_tokenizer, padding = 'longest'),
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False
    )


def get_neighbors(topics, content, cfg):
    """Embed topics and content, then attach each topic's nearest content ids.

    Both frames are encoded with the retrieval model, a cosine
    nearest-neighbour index is fitted on the content embeddings, and the ids
    of the closest content rows are stored space-separated in a new
    ``predictions`` column of ``topics`` (the frame is modified in place).

    Args:
        topics: DataFrame with a ``title`` column holding the topic text.
        content: DataFrame with ``id`` and ``title`` columns.
        cfg: Settings exposing ``batch_size``, ``num_workers``,
            ``uns_tokenizer``, ``uns_model`` and ``top_n``.

    Returns:
        The ``(topics, content)`` pair, with ``topics`` now carrying ``predictions``.
    """
    # Loaders are not shuffled, so embedding row i matches DataFrame row i.
    topics_loader = _make_uns_loader(topics, cfg)
    content_loader = _make_uns_loader(content, cfg)
    # Create unsupervised model to extract embeddings
    model = uns_model(cfg)
    model.to(device)
    topics_preds = get_embeddings(topics_loader, model, device)
    content_preds = get_embeddings(content_loader, model, device)
    # Transfer predictions to gpu
    topics_preds_gpu = cp.array(topics_preds)
    content_preds_gpu = cp.array(content_preds)
    # Release memory
    del topics_loader, content_loader, topics_preds, content_preds, model
    gc.collect()
    torch.cuda.empty_cache()
    # KNN model
    print(' ')
    print('Training KNN model...')
    # Never request more neighbours than there are content rows to choose from.
    n_neighbors = min(cfg.top_n, len(content))
    neighbors_model = NearestNeighbors(n_neighbors = n_neighbors, metric = 'cosine')
    neighbors_model.fit(content_preds_gpu)
    # One device-to-host copy for the whole index matrix instead of one per topic.
    indices = cp.asnumpy(neighbors_model.kneighbors(topics_preds_gpu, return_distance = False))
    # The KNN indices are row positions in the fitted matrix, so resolve them
    # positionally; a label-based .loc lookup is only right for a default RangeIndex.
    content_ids = content['id'].to_numpy()
    topics['predictions'] = [' '.join(content_ids[row]) for row in indices]
    # Release memory
    del topics_preds_gpu, content_preds_gpu, neighbors_model, indices, content_ids
    gc.collect()
    return topics, content 

# =========================================================================================
# Process test
# =========================================================================================
def preprocess_test(test):
    """Turn the title pair of each row into one ``text`` field and sort rows by text length.

    The frame is modified in place and also returned.

    Args:
        test: DataFrame with ``title1`` and ``title2`` columns (other columns are kept).

    Returns:
        The same DataFrame without ``title1``/``title2``, with a new ``text``
        column of the form ``title1[SEP]title2``, ordered by ascending text
        length and re-indexed from 0.
    """
    placeholder = "Title does not exist"
    # Assign the filled column back: ``df[col].fillna(..., inplace=True)`` acts on
    # a temporary Series and leaves the frame untouched under pandas Copy-on-Write.
    test['title1'] = test['title1'].fillna(placeholder)
    test['title2'] = test['title2'].fillna(placeholder)
    # Create feature column
    test['text'] = test['title1'] + '[SEP]' + test['title2']
    # Drop titles
    test.drop(['title1', 'title2'], axis = 1, inplace = True)
    # Sort so similarly sized texts share a batch and need less padding
    test['length'] = test['text'].str.len()
    test.sort_values('length', inplace = True)
    test.drop(['length'], axis = 1, inplace = True)
    test.reset_index(drop = True, inplace = True)
    gc.collect()
    return test
# =========================================================================================   
# Read data
topics, content = read_data()
# A run with exactly 5 topics keeps only the first 1000 content rows
# (presumably the quick public sample run - TODO confirm).
if topics.shape[0] == 5:
    content = content.head(1000)
# Run nearest neighbors: adds the 'predictions' column to topics
topics, content = get_neighbors(topics, content, CFG)
gc.collect()
# Index content by id so candidate ids can be looked up by label
content = content.set_index('id')
# Build the (topic, content) inference pairs and turn them into model text
test = preprocess_test(build_inference_set(topics, content, CFG))
# Persist the pairs for the classification script
test.to_pickle('test.pkl')

Writing cluster.py


In [2]:
!python cluster.py

100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 109.32it/s]
100%|█████████████████████████████████| 154047/154047 [00:43<00:00, 3549.02it/s]
 
--------------------------------------------------
topics.shape: (5, 3)
content.shape: (154047, 3)
100%|█████████████████████████████████████████████| 1/1 [00:02<00:00,  2.16s/it]
100%|█████████████████████████████████████████| 125/125 [00:15<00:00,  8.32it/s]
 
Training KNN model...
100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 1808.67it/s]


In [3]:
%%writefile classify1.py
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
import cupy as cp
#from cuml.metrics import pairwise_distances
from cuml.neighbors import NearestNeighbors
#from sklearn.neighbors import NearestNeighbors
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test = pd.read_pickle('test.pkl')
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    """Static settings for the classification (re-ranking) stage.

    The class body runs at import time, so both tokenizers are loaded from
    disk as soon as this class is defined.
    """
    # Not referenced by any function in this script.
    print_freq = 500
    # Worker processes used by the DataLoader built in inference().
    num_workers = 4
    # Checkpoint directory of the retrieval encoder; reused below as the classifier backbone.
    uns_model = "/kaggle/input/lecr-trained-models5/LECR_P12/xlm-roberta-large-exp_fold2_epochs20-20230314T012337Z-002/xlm-roberta-large-exp_fold2_epochs20/"
    # Backbone directory handed to AutoConfig / AutoModel.from_pretrained in custom_model.
    sup_model = uns_model
    # Checkpoint file read with torch.load in inference(); its 'model' entry is the state dict.
    sup_model_tuned = "/kaggle/input/lecr-p12-cls-model/xlm_large_fold2_42_content_top15.pth"
    # Tokenizers are instantiated once here and shared through the class.
    uns_tokenizer = AutoTokenizer.from_pretrained(uns_model)
    sup_tokenizer = AutoTokenizer.from_pretrained(sup_model)
    # When True, custom_model enables gradient checkpointing on the backbone.
    gradient_checkpointing = False
    # Batch size of the inference DataLoader.
    batch_size = 8
    # Not referenced in this script.
    n_folds = 5
    # Not referenced in this script.
    top_n = 15
    # Not referenced in this script.
    seed = 42
    # Cut-off compared against the sigmoid scores in inference().
    threshold = 0.11
    # Presumably the token budgets for the two tokenizers - TODO confirm against the prepare_* helpers.
    uns_max_len = 128
    sup_max_len = 256
    
def prepare_sup_input(text, cfg):
    """Tokenize ``text`` for the pair classifier.

    Args:
        text: The string to encode.
        cfg: Settings object exposing ``sup_tokenizer`` and, optionally,
            ``sup_max_len`` (the truncation length; 256 when absent).

    Returns:
        The tokenizer output with every value converted to a ``torch.long``
        tensor. No padding is applied here.
    """
    # Honour the configured length instead of a literal; the fallback keeps
    # configs that predate ``sup_max_len`` working with the old value.
    max_len = getattr(cfg, 'sup_max_len', 256)
    encoded = cfg.sup_tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
    )
    for key, value in encoded.items():
        encoded[key] = torch.tensor(value, dtype=torch.long)
    return encoded
# =========================================================================================
# Supervised dataset
# =========================================================================================
class sup_dataset(Dataset):
    """Map-style dataset that tokenizes the ``text`` column of a DataFrame on demand."""

    def __init__(self, df, cfg):
        # Keep only the raw text values; tokenization happens per item.
        self.texts = df['text'].values
        self.cfg = cfg

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the tokenized form of the text at position ``item``."""
        return prepare_sup_input(self.texts[item], self.cfg)
    
    
class MeanPooling(nn.Module):
    """Average token vectors over the sequence axis, counting only unmasked positions."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dimension 1."""
        # Broadcast the mask over the hidden dimension so it can weight every feature.
        mask = attention_mask.unsqueeze(-1).expand_as(last_hidden_state).float()
        weighted_total = (last_hidden_state * mask).sum(dim=1)
        # Clamp the divisor so a fully masked row yields zeros instead of NaN.
        divisor = mask.sum(dim=1).clamp(min=1e-9)
        return weighted_total / divisor
class custom_model(nn.Module):
    """Transformer backbone with masked mean pooling and a single-logit linear head."""

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.sup_model, output_hidden_states = True)
        # Switch off every dropout setting on the config before the backbone is built.
        for dropout_field in (
            'hidden_dropout',
            'hidden_dropout_prob',
            'attention_dropout',
            'attention_probs_dropout_prob',
        ):
            setattr(self.config, dropout_field, 0.0)
        self.model = AutoModel.from_pretrained(cfg.sup_model, config = self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise a Linear, Embedding or LayerNorm module; other types are left alone."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                # Keep the padding row at exactly zero.
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()

    def feature(self, inputs):
        """Encode ``inputs`` and pool the last hidden state using its attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the raw (pre-sigmoid) output of the linear head."""
        return self.fc(self.feature(inputs))
    
def inference_fn(test_loader, model, device):
    """Score every batch of ``test_loader`` and return the sigmoid outputs as one flat array.

    Args:
        test_loader: Sized iterable of dict-like batches whose values are tensors.
        model: Module called with one batch; it is put in eval mode and moved to ``device``.
        device: Device used for the forward passes.

    Returns:
        1-D NumPy array holding the concatenated per-batch scores.
    """
    model.eval()
    model.to(device)
    batch_scores = []
    progress = tqdm(test_loader, total = len(test_loader))
    for batch in progress:
        # Move the batch in place so the same mapping object reaches the model.
        for name, tensor in batch.items():
            batch[name] = tensor.to(device)
        with torch.no_grad():
            logits = model(batch)
        scores = logits.sigmoid().squeeze().to('cpu').numpy()
        # reshape(-1) keeps a one-row batch 1-D after squeeze() collapsed it to a scalar.
        batch_scores.append(scores.reshape(-1))
    return np.concatenate(batch_scores)
# Inference
# =========================================================================================
def inference(test, cfg):
    """Score every (topic, content) pair, filter them and write ``submission.csv``.

    A pair is kept when its sigmoid score exceeds ``cfg.threshold`` and the
    topic and content languages match. Topics left without any pair still get
    a row, with an empty ``content_ids`` string.

    Args:
        test: DataFrame with ``text``, ``topics_ids``, ``content_ids``,
            ``topic_language`` and ``content_language`` columns. It is
            modified in place: score columns are added and the two language
            columns are removed.
        cfg: Settings exposing ``batch_size``, ``num_workers``,
            ``sup_tokenizer``, ``sup_model_tuned`` and ``threshold``.

    Returns:
        DataFrame with the columns ``topic_id`` and ``content_ids``
        (space-separated), which is also written to ``submission.csv``.
    """
    # Create dataset and loader
    test_dataset = sup_dataset(test, cfg)
    test_loader = DataLoader(
        test_dataset,
        batch_size = cfg.batch_size,
        shuffle = False,
        collate_fn = DataCollatorWithPadding(tokenizer = cfg.sup_tokenizer, padding = 'longest'),
        num_workers = cfg.num_workers,
        pin_memory = True,
        drop_last = False
    )
    # Get model
    model = custom_model(cfg)
    # Load weights
    # NOTE(review): torch.load unpickles the file, so the checkpoint must come from a trusted source.
    state = torch.load(cfg.sup_model_tuned, map_location = torch.device('cpu'))
    model.load_state_dict(state['model'])
    prediction = inference_fn(test_loader, model, device)
    # Release memory: drop the references first, because empty_cache() can only
    # hand back blocks that no live tensor is still using.
    del test_dataset, test_loader, model, state
    gc.collect()
    torch.cuda.empty_cache()
    # Use the threshold of the cfg that was passed in, not the module-level CFG.
    test['prediction'] = prediction
    test['predictions_binary'] = (prediction > cfg.threshold).astype(int)
    test['lang'] = (test['topic_language'] == test['content_language']).astype('int8')
    del test['topic_language'], test['content_language']
    gc.collect()
    # Highest-scoring, same-language pairs first, so each topic's ids come out best-first.
    test = test.sort_values(['lang', 'prediction'], ascending = False)

    accepted = (test['predictions_binary'] == 1) & (test['lang'] == 1)
    predicted = test[accepted].groupby(['topics_ids'])['content_ids'].agg(list).reset_index()
    predicted['content_ids'] = predicted['content_ids'].apply(' '.join)
    predicted.columns = ['topic_id', 'content_ids']
    # Topics with no accepted pair get an empty prediction (no top-1 fallback).
    all_topics = pd.Series(test['topics_ids'].unique())
    unmatched = all_topics[~all_topics.isin(predicted['topic_id'])]
    empty_rows = pd.DataFrame({'topic_id': unmatched.values, 'content_ids': ""})
    predicted = pd.concat([predicted, empty_rows], axis = 0, ignore_index = True)
    predicted.to_csv('submission.csv', index = False)
    return predicted

# Run the classification stage; inference() also writes submission.csv.
test_r = inference(test, CFG)
# NOTE(review): run as a script, this bare expression discards its result;
# it only shows a preview when executed as the last line of a notebook cell.
test_r.head()

Writing classify1.py


In [4]:
!python classify1.py

100%|███████████████████████████████████████████| 10/10 [00:04<00:00,  2.45it/s]


In [5]:
import pandas as pd
pd.read_csv('submission.csv')

Unnamed: 0,topic_id,content_ids
0,t_4054df11a74e,
1,t_0006d41a73a8,
2,t_00069b63a70a,
3,t_00004da3a1b2,
4,t_00068291e9a4,
