In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import random
import yaml
import time
import datetime
import os
import sys
import warnings
from tqdm.notebook import trange, tqdm
from types import SimpleNamespace
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import optim
from torch.utils.data import Dataset, DataLoader, SequentialSampler, RandomSampler
from torch.cuda.amp import autocast, GradScaler

import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorWithPadding

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import cohen_kappa_score, confusion_matrix, ConfusionMatrixDisplay, f1_score

# from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import polars as pl
import matplotlib.pyplot as plt
# from spellchecker import SpellChecker
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import lightgbm as lgb

# Log the interpreter and library versions so the recorded output documents the run environment.
# NOTE(review): the third message spells "transformers" as "transfromers"; left untouched
# because the string is part of the recorded output.
print(f'python version: {sys.version}') 
print(f'torch version: {torch.__version__}')
print(f'transfromers version: {transformers.__version__}')

# Sets TOKENIZERS_PARALLELISM to "false" for this process
# (presumably to avoid the tokenizers fork warning when DataLoader workers are used -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Silences every Python warning for the remainder of the notebook.
warnings.simplefilter('ignore')

2024-06-15 09:39:50.027917: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-15 09:39:50.028022: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-15 09:39:50.165104: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


python version: 3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]
torch version: 2.1.2
transfromers version: 4.39.3


In [2]:
#https://gist.github.com/ihoromi4/b681a9088f348942b01711f251e5f964
def seed_everything(seed: int):
    """Seed all random number generators used by this notebook.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and current CUDA device),
    exports ``PYTHONHASHSEED``, and switches cuDNN to deterministic mode with
    benchmarking disabled.

    Args:
        seed: integer seed applied to every generator.
    """
    # cuDNN flags: deterministic kernels, no auto-tuner.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    os.environ['PYTHONHASHSEED'] = str(seed)

    # Each of these accepts the seed as its single positional argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

In [3]:
# Root directory of the competition data (Kaggle input mount).
data_path = '/kaggle/input/learning-agency-lab-automated-essay-scoring-2/'
# train_df = pd.read_csv(data_path + "train.csv")
# Competition test set; NOTE(review): no later cell visible here reads `test_df` -- confirm it is still needed.
test_df = pd.read_csv(data_path + "test.csv")
# samp_df = pd.read_csv(data_path + "sample_submission.csv")

# Dataset

In [4]:
class AE2Dataset(Dataset):
    """Essay dataset that tokenizes one text per item on demand.

    Args:
        conf: config namespace. Reads ``train_col`` and ``max_len``; when labels
            are requested it also reads ``target_col``, ``num_labels`` and, for
            single-output heads, ``criterion``.
        df: pandas DataFrame with the text column (plus ``essay_id`` and the
            target column when labels are requested).
        tokenizer: callable invoked with the text and Hugging Face style
            keyword arguments; must return a mapping of lists of ints.
        output_tokens_only: when True, each item is only the token tensors.
    """

    def __init__(self, conf, df, tokenizer, output_tokens_only=False):
        self.conf = conf
        self.tokenizer = tokenizer
        self.output_tokens_only = output_tokens_only
        self.full_texts = self._column(df, self.conf.train_col)

        # Inference-only datasets carry neither ids nor labels.
        if self.output_tokens_only:
            return

        self.essay_ids = self._column(df, 'essay_id')
        self.labels = self._column(df, self.conf.target_col)

        if self.conf.num_labels != 1:
            # classification: integer class indices
            self.label_dtype = torch.long
            return

        # regression: float targets, rescaled by 1/5 when trained with BCE
        self.label_dtype = torch.float
        if self.conf.criterion == 'bce':
            self.labels = self.labels / 5.0

    @staticmethod
    def _column(df, name):
        """Return column ``name`` of ``df`` as an array with a fresh positional index."""
        return df[name].reset_index(drop=True).values

    def __len__(self):
        return len(self.full_texts)

    def __getitem__(self, idx):
        tokens = self._get_token(idx)
        if self.output_tokens_only:
            return tokens
        return {
            'tokens': tokens,
            'labels': self._get_label(idx),
            'ids': self.essay_ids[idx],
        }

    def _get_token(self, idx):
        """Tokenize text ``idx`` to ``conf.max_len`` and return long tensors per field."""
        encoding = self.tokenizer(
            self.full_texts[idx],
            add_special_tokens=True,
            max_length=self.conf.max_len,
            padding="max_length",
            truncation=True,
            return_tensors=None,
        )

        tensors = {}
        for field, values in encoding.items():
            tensors[field] = torch.tensor(values, dtype=torch.long)
        return tensors

    def _get_label(self, idx):
        """Return label ``idx`` as a tensor of the dtype chosen in ``__init__``."""
        return torch.tensor(self.labels[idx], dtype=self.label_dtype)

# Model

In [5]:
class MeanPooling(nn.Module):
    """Attention-mask-aware mean over the token axis of the last hidden state."""

    def __init__(self):
        super().__init__()

    def forward(self, backbone_outputs, inputs):
        """Average ``last_hidden_state`` over dim 1, counting only unmasked tokens.

        Args:
            backbone_outputs: mapping with key ``'last_hidden_state'``.
            inputs: mapping with key ``'attention_mask'``.

        Returns:
            Tensor with the token dimension (dim 1) reduced away.
        """
        hidden = backbone_outputs['last_hidden_state']
        # Broadcast the mask over the feature dimension so padded tokens contribute zero.
        mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden.size()).float()
        token_sum = (hidden * mask).sum(dim=1)
        # Clamp so a fully masked row divides by 1e-9 instead of zero.
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return token_sum / token_count
    
class ConcatPooling(nn.Module):
    """Concatenate the first-token vectors of the last ``pooling_last`` hidden layers."""

    def __init__(self, pooling_last=4):
        super().__init__()
        self.pooling_last = pooling_last

    def forward(self, backbone_outputs, inputs):
        """Return the position-0 features of the last layers, joined on the feature axis.

        Args:
            backbone_outputs: mapping with key ``'hidden_states'`` (sequence of
                equally shaped tensors, one per layer).
            inputs: unused; kept so all pooling layers share one call signature.
        """
        stacked = torch.stack(backbone_outputs['hidden_states'])
        # Layers are taken from the end: -1, -2, ..., -pooling_last.
        last_layers = [stacked[-offset] for offset in range(1, self.pooling_last + 1)]
        joined = torch.cat(last_layers, dim=-1)
        # Keep only the first token of every sequence.
        return joined[:, 0]
    
# https://www.kaggle.com/competitions/google-quest-challenge/discussion/129840    
class WeightedLayerPooling(nn.Module):
    """Weighted average of hidden layers from ``layer_start`` onward, read at token 0.

    Args:
        num_hidden_layers: number of transformer layers of the backbone.
        layer_start: index of the first entry of ``hidden_states`` to include.
        layer_weights: optional weights, one per included layer; when omitted a
            learnable parameter of ones is created.
    """

    def __init__(self, num_hidden_layers, layer_start: int = 4, layer_weights = None):
        super(WeightedLayerPooling, self).__init__()
        self.layer_start = layer_start
        self.num_hidden_layers = num_hidden_layers

        if layer_weights is None:
            # hidden_states has num_hidden_layers + 1 entries; the first layer_start are skipped
            n_weights = num_hidden_layers + 1 - layer_start
            layer_weights = nn.Parameter(torch.tensor([1] * n_weights, dtype=torch.float))
        self.layer_weights = layer_weights

    def forward(self, backbone_outputs, inputs):
        """Blend the selected layers with ``layer_weights`` and return the first-token vector.

        Args:
            backbone_outputs: mapping with key ``'hidden_states'``.
            inputs: unused; kept so all pooling layers share one call signature.
        """
        stacked = torch.stack(backbone_outputs['hidden_states'])
        selected = stacked[self.layer_start:, :, :, :]
        # Append three trailing axes, then expand to the shape of the selected layers.
        weights = self.layer_weights[..., None, None, None].expand(selected.size())
        blended = (weights * selected).sum(dim=0) / self.layer_weights.sum()
        return blended[:, 0]
    
# https://www.kaggle.com/code/rhtsingh/utilizing-transformer-representations-efficiently
class LSTMPooling(nn.Module):
    """Run an LSTM over the per-layer first-token vectors and keep the final step.

    Args:
        num_layers: number of backbone layers fed to the LSTM (hidden_states[1:num_layers+1]).
        hidden_size: feature size of each backbone hidden state.
        hiddendim_lstm: hidden size of the LSTM, i.e. the size of the pooled output.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_lstm):
        super().__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_lstm = hiddendim_lstm
        self.lstm = nn.LSTM(self.hidden_size, self.hiddendim_lstm, batch_first=True)
        self.dropout = nn.Dropout(0.1)

    def forward(self, backbone_outputs, inputs):
        """Return the dropout-regularised last LSTM output.

        Args:
            backbone_outputs: mapping with key ``'hidden_states'``.
            inputs: unused; kept so all pooling layers share one call signature.
        """
        stacked = torch.stack(backbone_outputs['hidden_states'])

        # First-token vector of every layer except the embedding output (index 0).
        first_tokens = []
        for layer_idx in range(1, self.num_hidden_layers + 1):
            first_tokens.append(stacked[layer_idx][:, 0].squeeze())

        # NOTE(review): layers are stacked on the LAST axis and then reinterpreted with
        # view() as (batch, layers, hidden); this is not a transpose. Kept exactly as in
        # the original because trained checkpoints depend on this layout -- confirm intent.
        sequence = torch.stack(first_tokens, dim=-1)
        sequence = sequence.view(-1, self.num_hidden_layers, self.hidden_size)

        lstm_out, _ = self.lstm(sequence, None)
        return self.dropout(lstm_out[:, -1, :])
    
class GeMPooling(nn.Module):
    """Generalized-mean pooling of the last hidden state with a learnable exponent.

    Args:
        dim: axis that is reduced (1 = token axis by default).
        cfg: accepted but not used by this class.
        p: initial value of the learnable exponent.
        eps: lower clamp applied to activations and to the token count.
    """

    def __init__(self, dim=1, cfg=None, p=3, eps=1e-6):
        super().__init__()
        self.dim = dim
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps
        self.feat_mult = 1

    def forward(self, backbone_outputs, inputs):
        """Return ``(sum(mask * clamp(x)) ** p / count) ** (1 / p)`` along ``self.dim``.

        Args:
            backbone_outputs: mapping with key ``'last_hidden_state'``.
            inputs: mapping with key ``'attention_mask'``.
        """
        hidden = backbone_outputs['last_hidden_state']
        mask = inputs['attention_mask'].unsqueeze(-1).expand(hidden.shape)

        # Clamp first, mask second: masked positions become exactly zero before the power.
        powered = (hidden.clamp(min=self.eps) * mask).pow(self.p)
        pooled = powered.sum(self.dim) / mask.sum(self.dim).clip(min=self.eps)
        return pooled.pow(1 / self.p)

In [6]:
class CustomModel(nn.Module):
    """Transformer backbone followed by a pooling layer and a linear head.

    Args:
        conf: config namespace. Reads ``multi_dropout``, ``model_name``,
            ``gradient_checkpointing``, ``freeze_embeddings``, ``freeze_n_layers``,
            ``pooling_layer``, ``num_labels`` and the pooling-specific options
            (``ccp_pooling_last``, ``wlp_layer_start``, ``lstm_hidden``).
        conf_path: optional path to a saved backbone config. When given, the
            backbone is built from that config without pretrained weights
            (the caller is expected to load a state dict afterwards);
            otherwise the pretrained ``conf.model_name`` is downloaded/loaded.
    """

    def __init__(self, conf, conf_path=None):
        super().__init__()
        self.conf_path = conf_path
        self.multi_dropout = conf.multi_dropout
        if not self.conf_path:
            self.model_conf = AutoConfig.from_pretrained(conf.model_name, output_hidden_states=True)
            self.model_conf = self._set_dropout(self.model_conf)
            self.backbone = AutoModel.from_pretrained(conf.model_name, config=self.model_conf)
        else:
            # NOTE: torch.load unpickles arbitrary objects -- only point this at trusted files.
            self.model_conf = torch.load(self.conf_path)
            self.backbone = AutoModel.from_config(self.model_conf)

        if conf.gradient_checkpointing:
            self.backbone.gradient_checkpointing_enable()

        if conf.freeze_embeddings:
            self._freeze(self.backbone.embeddings)

        if conf.freeze_n_layers > 0:
            self._freeze(self.backbone.encoder.layer[: conf.freeze_n_layers])

        self.pooler, hidden_size = self.get_pooling_layer(conf)
        self.fc = nn.Linear(hidden_size, conf.num_labels)
        self._init_weights(self.fc)

        # Multi-sample dropout is only built for classification heads. `forward` keys off
        # this attribute (not `multi_dropout`) so a regression head with
        # `multi_dropout=True` no longer fails on a missing attribute.
        if self.multi_dropout and conf.num_labels > 1:
            self.dropouts = nn.ModuleList([nn.Dropout(0.5) for _ in range(5)])
        else:
            self.dropouts = None

    def _set_dropout(self, model_conf, ratio=0.):
        """Set every dropout probability field on ``model_conf`` to ``ratio`` and return it."""
        model_conf.attention_dropout = ratio
        model_conf.attention_probs_dropout_prob = ratio
        model_conf.hidden_dropout = ratio
        model_conf.hidden_dropout_prob = ratio

        return model_conf

    def _freeze(self, module):
        """Exclude all parameters of ``module`` from gradient computation."""
        for parameter in module.parameters():
            # The attribute is `requires_grad`; the former `require_grad` spelling only
            # created an unrelated attribute and left the parameters trainable.
            parameter.requires_grad = False

    def get_pooling_layer(self, conf):
        """Build the pooling module named by ``conf.pooling_layer``.

        Returns:
            Tuple ``(pooling_module, feature_size)`` where ``feature_size`` is the
            width of the pooled vector fed to the linear head.

        Raises:
            ValueError: if ``conf.pooling_layer`` is not a known name.
        """
        backbone_width = self.model_conf.hidden_size

        if conf.pooling_layer == 'mean_pooling':
            return MeanPooling(), backbone_width
        elif conf.pooling_layer == 'concat_pooling':
            return ConcatPooling(conf.ccp_pooling_last), backbone_width * conf.ccp_pooling_last
        elif conf.pooling_layer == 'weighted_layer_pooling':
            pooler = WeightedLayerPooling(self.model_conf.num_hidden_layers, conf.wlp_layer_start)
            return pooler, backbone_width
        elif conf.pooling_layer == 'lstm_pooling':
            pooler = LSTMPooling(self.model_conf.num_hidden_layers, backbone_width, conf.lstm_hidden)
            # The LSTM emits `lstm_hidden` features, so the head must be sized to that,
            # not to the backbone width (identical when the two happen to be equal).
            return pooler, conf.lstm_hidden
        elif conf.pooling_layer == 'gem_pooling':
            return GeMPooling(), backbone_width
        else:
            raise ValueError('Invalid pooling layer name')

    def _init_weights(self, module):
        """Initialise ``module`` following the backbone's ``initializer_range`` convention."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.model_conf.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.model_conf.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """Run backbone, pooling and head.

        Args:
            inputs: mapping of keyword arguments for the backbone; it is also
                handed to the pooling layer (which may read ``attention_mask``).

        Returns:
            Head output for the batch; with multi-sample dropout this is the
            average of the head applied to each dropout sample.
        """
        backbone_outputs = self.backbone(**inputs)
        pooler_outputs = self.pooler(backbone_outputs, inputs)

        if self.dropouts is None:
            return self.fc(pooler_outputs)

        # Multi-sample dropout: average the head over several dropout masks.
        logits = self.fc(self.dropouts[0](pooler_outputs))
        for dropout in self.dropouts[1:]:
            logits = logits + self.fc(dropout(pooler_outputs))
        return logits / len(self.dropouts)

# Utils

In [7]:
def preprocess_data(df):
    """Normalise the ``full_text`` column of ``df`` in place and return ``df``.

    Non-breaking spaces (``\\xa0``) become regular spaces and leading/trailing
    whitespace is stripped. Label creation (``score - 1``) and paragraph-token
    replacement are intentionally not performed here.
    """
    df['full_text'] = df['full_text'].str.replace('\xa0', ' ').str.strip()
    return df

def collator(inputs):
    """Trim every tensor in ``inputs`` to the longest unpadded sequence of the batch.

    The longest length is the largest per-row sum of ``attention_mask``. The
    mapping is updated in place and returned.
    """
    longest = int(inputs['attention_mask'].sum(axis=1).max())
    # Only existing keys are reassigned, so iterating the dict while writing is safe.
    for key in inputs:
        inputs[key] = inputs[key][:, :longest]
    return inputs

# Infer Model

In [8]:
class InferModels:
    """Registry and runner for fold-wise inference with fine-tuned checkpoints.

    Workflow: ``register`` one or more experiments, call ``predict_cv`` to store
    fold-averaged raw outputs in ``models_dict[name]['cv_df']``, then optionally
    call ``ensemble`` to blend regression experiments into integer scores.
    """

    def __init__(self):
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.models_dict = OrderedDict()
        self.lgbm_models_dict = OrderedDict()

    def register(self, model_name, path, fold_num_list):
        """Record the artifact paths of experiment ``model_name`` stored under ``path``.

        Args:
            model_name: key under which the experiment is stored.
            path: directory containing the experiment artifacts.
            fold_num_list: fold numbers whose checkpoints should be used.
        """
        model_config_dict = self._load_config_path(path, fold_num_list)
        self.models_dict[model_name] = model_config_dict
        print(f'REGISTERED: {model_name}')

    def predict_cv(self, test_df):
        """Run every registered fold over ``test_df`` and store fold-averaged outputs.

        For ``criterion == 'mse'`` the stored frame has columns ``essay_id`` and
        ``raw_preds`` (fold mean + 1); for ``'ce'`` it has ``essay_id`` plus one
        ``pred_class_<c>`` column per class holding the fold-mean raw outputs.

        Args:
            test_df: pandas DataFrame with ``essay_id``, ``full_text`` and the
                column named by each experiment's ``train_col``. Note that
                ``preprocess_data`` cleans ``full_text`` in place.

        Raises:
            ValueError: if an experiment's criterion is neither 'mse' nor 'ce'.
        """
        test_df = preprocess_data(test_df)
        # Use a positional index so the ids align row-by-row with the freshly built
        # prediction frames even when `test_df` carries a non-default index.
        hold_df = test_df['essay_id'].reset_index(drop=True)

        for model_name, model_config_dict in self.models_dict.items():

            with open(model_config_dict['yaml']) as file:
                conf = SimpleNamespace(**yaml.safe_load(file))

            # Fail before the expensive inference; previously an unsupported criterion
            # ended in a NameError or silently reused the previous experiment's frame.
            if conf.criterion not in ('mse', 'ce'):
                raise ValueError(
                    f"unsupported criterion '{conf.criterion}' for model '{model_name}'; "
                    "expected 'mse' or 'ce'"
                )

            seed_everything(conf.seed)

            model_config_path = model_config_dict['model_config']
            tokenizer = self._get_tokenizer(model_config_dict['tokenizer'])
            # The dataloader does not depend on the fold, so build it once per experiment.
            test_dataloader = self._get_test_dataloader(conf, test_df, tokenizer)

            fold_raw_predictions_list = []

            for fold_num, model_path in model_config_dict['models'].items():
                print(f'INFERENCING: {model_name}, CONFIG_EXP: {conf.exp}, FOLD: {fold_num}')

                model = self._load_model(conf, model_config_path, model_path, tokenizer)
                fold_raw_predictions_list.append(self._predict(conf, model, test_dataloader))

                # Release the fold's weights before the next checkpoint is loaded.
                del model
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            if conf.criterion == 'mse':
                fold_mean_raw = np.mean(fold_raw_predictions_list, axis=0) + 1
                cv_df = pd.concat([hold_df, pd.DataFrame(fold_mean_raw, columns=['raw_preds'])], axis=1)
            else:  # 'ce'
                fold_mean_raw = np.mean(fold_raw_predictions_list, axis=0)
                fold_mean_raw_df = pd.DataFrame(
                    fold_mean_raw, columns=[f'pred_class_{c}' for c in range(conf.num_labels)]
                )
                cv_df = pd.concat([hold_df, fold_mean_raw_df], axis=1)

            self.models_dict[model_name]['cv_df'] = cv_df

    def ensemble(self, method='rint'):
        """Average the regression predictions of all experiments into integer scores.

        Per experiment the score columns are the per-fold ``raw_score_f<i>``
        columns when present, otherwise the ``raw_preds`` column written by
        ``predict_cv``.

        Args:
            method: ``'rint'`` to round the blended mean, or a list of five
                ascending thresholds used to bin the mean into scores 1..6.

        Returns:
            DataFrame with columns ``essay_id`` and integer ``score``.

        Raises:
            ValueError: if an experiment has no regression score columns.
            Exception: if ``method`` is not one of the supported forms.
        """
        model_pred_df_list = []
        for model_name, entry in self.models_dict.items():
            cv_df = entry['cv_df']
            fold_cols = [f'raw_score_f{i}' for i in range(len(entry['models']))]
            if fold_cols and all(col in cv_df.columns for col in fold_cols):
                score_cols = fold_cols
            elif 'raw_preds' in cv_df.columns:
                # Column produced by `predict_cv` for regression ('mse') experiments.
                score_cols = ['raw_preds']
            else:
                raise ValueError(
                    f"cv_df of model '{model_name}' has no regression score columns "
                    "('raw_score_f*' or 'raw_preds'); classification outputs cannot be ensembled here"
                )
            model_pred_df_list.append(cv_df[['essay_id'] + score_cols])

        essay_id_df = model_pred_df_list[0]['essay_id']
        pred_values = [np.mean(df.drop('essay_id', axis=1).values, axis=1) for df in model_pred_df_list]

        if method == 'rint':
            pred_mean = np.rint(np.mean(pred_values, axis=0)).clip(1, 6) # take mean across all models and clip to [1, 6]
            pred_df = pd.DataFrame(pred_mean, columns=['score'])

        elif isinstance(method, list) and len(method) == 5: # threshold
            pred = pd.cut(np.mean(pred_values, axis=0), [-np.inf] + method + [np.inf], labels=[1, 2, 3, 4, 5, 6])
            pred = pred.to_numpy().clip(1, 6)
            pred_df = pd.DataFrame(pred, columns=['score'])

        else:
            raise Exception('method is invalid')

        result = pd.concat([essay_id_df, pred_df], axis=1)
        result['score'] = result['score'].astype(int)

        return result

    def _predict(self, conf, model, dataloader):
        """Return the model's raw outputs for every batch, concatenated as a NumPy array.

        The outputs are returned undecoded; use ``_process_outputs`` to turn
        them into scores/classes. ``conf`` is accepted for interface
        compatibility and is not read here.
        """
        model.eval()
        model.to(self.device)

        raw_predictions = []
        with torch.no_grad():
            for inputs in dataloader:
                inputs = collator(inputs)
                inputs = {k: v.to(self.device) for k, v in inputs.items()}
                raw_outputs = model(inputs)

                # A 0-dim output cannot be concatenated; promote it to shape (1,).
                if raw_outputs.dim() == 0:
                    raw_outputs = raw_outputs.view(-1)
                # Move each batch to the CPU right away so results do not pile up on the GPU.
                raw_predictions.append(raw_outputs.detach().cpu())

        return torch.cat(raw_predictions).numpy()

    def _process_outputs(self, conf, outputs):
        """Decode raw head outputs.

        Single-output heads are flattened to one value per row (and mapped through
        ``sigmoid * 5`` for 'bce'); multi-class heads are reduced to the argmax class.
        """
        if conf.num_labels == 1:
            # Drop the trailing singleton feature axis but keep the batch axis.
            if outputs.dim() > 1:
                outputs = outputs.squeeze(-1)
        else:
            outputs = outputs.softmax(1).argmax(-1)

        if conf.criterion == 'bce':
            outputs = outputs.sigmoid() * 5.0

        return outputs

    def _get_tokenizer(self, tokenizer_path):
        """Load the tokenizer saved at ``tokenizer_path``."""
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        return tokenizer

    def _load_config_path(self, path, fold_num_list):
        """Collect the artifact paths of an experiment directory.

        Raises:
            IndexError: if no ``*_config.pt`` file exists under ``path``.
        """
        # /kaggle/input/fb-debertav3-roberta-large-seed0/exp54s0/best-epoch-fold3.pt
        # Sorted so the choice is deterministic if several config files are present.
        config_files = sorted(Path(path).glob('*_config.pt'))
        if not config_files:
            raise IndexError(f"no '*_config.pt' file found in {path}")

        config_dict = {
            'yaml': f'{path}/config.yaml',
            'model_config': config_files[0].as_posix(),
            'tokenizer': f'{path}/tokenizers/',
            'models': {f: f'{path}/best_score_fold{f}.pt' for f in fold_num_list},
            'oof_df': f'{path}/oof_df.csv',
        }
        return config_dict

    def _load_lgbm_config_path(self, path):
        """Collect the artifact paths of a LightGBM experiment directory."""
        config_dict = {
            'models': f'{path}/models.bin',
            'selected_feats': f'{path}/selected_feats.yaml',
            'vectors': {
                'tf-idf': f'{path}/tfidf_vec.bin',
                'count': f'{path}/count_vec.bin'
            }
        }
        return config_dict

    def _load_model(self, conf, model_config_path, pretrained_model_path, tokenizer):
        """Build the model from the saved config and load one fold's weights into it."""
        model = CustomModel(conf, conf_path=model_config_path)
        # Match the embedding matrix to the (possibly extended) tokenizer vocabulary.
        model.backbone.resize_token_embeddings(len(tokenizer))
        # NOTE: torch.load unpickles arbitrary objects -- only load trusted checkpoints.
        state_dict = torch.load(pretrained_model_path, map_location=self.device)['model_state_dict']
        model.load_state_dict(state_dict)
        return model

    def _get_test_dataloader(self, conf, df, tokenizer, batch_num=1):
        """Build an ordered, token-only dataloader over ``df``.

        ``batch_num`` is accepted for interface compatibility and is not used;
        the batch size is fixed at 32.
        """
        test_dataset = AE2Dataset(conf, df, tokenizer, output_tokens_only=True)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=32,
            num_workers=4,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )
        return test_dataloader

    def print_registered_models(self):
        """Print the names of all registered transformer and LightGBM experiments."""
        for model_name in self.models_dict.keys():
            print(model_name)
        for model_name in self.lgbm_models_dict.keys():
            print(model_name)

# Infer

In [9]:
# Training essays with their prompt names.
tp_df = pl.read_csv('/kaggle/input/lal-aes2-create-prompt-data/train_df_with_prompt.csv')
# ko = tp_df.filter(pl.col('kaggle_only') == True)

# Split every essay on '.' (keeping the period with the preceding chunk) and emit one row per chunk.
t = tp_df.with_columns(pl.col('full_text').str.split('.', inclusive=True).alias('discourse_text')).explode('discourse_text')
# Drop trailing whitespace, then discard chunks that are empty or a lone period.
t = t.with_columns(pl.col('discourse_text').str.strip_chars_end()).filter(pl.col('discourse_text') != '').filter(pl.col('discourse_text') != '.')

# Give every chunk the id "<essay_id>_<k>", where k counts the chunks of that essay in row order.
# The ids are built in a single pass over the rows themselves, so they stay aligned with `t`
# regardless of how rows of one essay are grouped, and no per-element Series indexing is needed.
chunk_counts = {}
new = []
for essay_id in t['essay_id'].to_list():
    chunk_idx = chunk_counts.get(essay_id, 0)
    new.append(f'{essay_id}_{chunk_idx}')
    chunk_counts[essay_id] = chunk_idx + 1

t = t.with_columns(essay_id=pl.Series(new))

In [10]:
# Show the chunk-level frame built in the previous cell (one row per text chunk).
t

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text
str,str,str,i64,bool,str
"""000d118_0""","""Many people ha…","""Car-free citie…",3,false,"""Many people ha…"
"""000d118_1""","""Many people ha…","""Car-free citie…",3,false,""" The thing the…"
"""000d118_2""","""Many people ha…","""Car-free citie…",3,false,""" Street parkig…"
"""000d118_3""","""Many people ha…","""Car-free citie…",3,false,""" You probaly w…"
"""000d118_4""","""Many people ha…","""Car-free citie…",3,false,""" The vauban pe…"
…,…,…,…,…,…
"""fffed3e_7""","""Venus is worth…","""Exploring Venu…",2,true,""" if a human is…"
"""fffed3e_8""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…"
"""fffed3e_9""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…"
"""fffed3e_10""","""Venus is worth…","""Exploring Venu…",2,true,""" Now they are …"


In [11]:
# Create the inference registry and register the experiment to run (folds 0-3 of expD012).
infer_models = InferModels()
# infer_models.register('exp080', '/kaggle/input/lal-aes2-exps/exp080', [0, 1, 2, 3])
infer_models.register('expD012', '/kaggle/input/lal-aes2-discourse/expD012', [0, 1, 2, 3])
# infer_models.register('exp098', '/kaggle/input/lal-aes2-train', [0])


# infermodels.register_lgbm('feats+exp015+exp022', '/kaggle/input/lal-ae2-lgbm-expnb')

# infermodels.print_registered_models()

REGISTERED: expD012


In [12]:
# Run all registered folds over the chunk-level frame; the fold-averaged outputs are stored
# in infer_models.models_dict[<name>]['cv_df'].
infer_models.predict_cv(t.to_pandas())

# pred_df = infer_models.ensemble()

# if submit:
#     display(pred_df)
#     pred_df.to_csv('submission.csv', index=False)

# lgbm_pred_df = infermodels.predict_cv_ensemble_lgbm(test_df)
# if submit:
#     display(lgbm_pred_df)
#     lgbm_pred_df.to_csv('submission.csv', index=False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFERENCING: expD012, CONFIG_EXP: D012, FOLD: 0
INFERENCING: expD012, CONFIG_EXP: D012, FOLD: 1
INFERENCING: expD012, CONFIG_EXP: D012, FOLD: 2
INFERENCING: expD012, CONFIG_EXP: D012, FOLD: 3


In [13]:
# Inspect the fold-averaged raw class outputs stored for expD012.
display(infer_models.models_dict['expD012']['cv_df'])

Unnamed: 0,essay_id,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6
0,000d118_0,1.348422,-2.039210,2.770183,2.257207,-2.593693,-1.103083,-2.095595
1,000d118_1,-2.072755,-2.387809,0.129768,3.764541,-1.821408,-0.223503,0.909058
2,000d118_2,-1.693010,-1.473693,1.431878,3.769820,-1.916697,-2.047137,-1.346176
3,000d118_3,-1.512254,-3.646935,-0.515993,6.056010,-1.528462,-1.853261,-0.455074
4,000d118_4,-2.831292,-1.682352,0.927727,3.703798,-1.544700,-1.427290,0.498887
...,...,...,...,...,...,...,...,...
326048,fffed3e_7,-3.155074,-2.279390,3.667197,3.379212,-1.769132,-2.329039,-0.179665
326049,fffed3e_8,-3.028397,-1.810841,3.698041,3.415874,-1.974055,-1.822579,-0.801626
326050,fffed3e_9,-3.198970,-1.617192,1.122726,1.032075,-1.675161,0.400315,3.448996
326051,fffed3e_10,-3.581976,-0.506038,2.807937,1.159544,-0.863772,-0.936937,1.046222


In [14]:
# Attach the per-chunk predictions to the chunk-level frame, matching rows on the chunk id.
result = t.join(pl.from_pandas(infer_models.models_dict['expD012']['cv_df']), on='essay_id')
result

essay_id,full_text,prompt_name,score,kaggle_only,discourse_text,pred_class_0,pred_class_1,pred_class_2,pred_class_3,pred_class_4,pred_class_5,pred_class_6
str,str,str,i64,bool,str,f32,f32,f32,f32,f32,f32,f32
"""000d118_0""","""Many people ha…","""Car-free citie…",3,false,"""Many people ha…",1.348422,-2.03921,2.770183,2.257207,-2.593693,-1.103083,-2.095595
"""000d118_1""","""Many people ha…","""Car-free citie…",3,false,""" The thing the…",-2.072755,-2.387809,0.129768,3.764541,-1.821408,-0.223503,0.909058
"""000d118_2""","""Many people ha…","""Car-free citie…",3,false,""" Street parkig…",-1.69301,-1.473693,1.431878,3.76982,-1.916697,-2.047137,-1.346176
"""000d118_3""","""Many people ha…","""Car-free citie…",3,false,""" You probaly w…",-1.512254,-3.646935,-0.515993,6.05601,-1.528462,-1.853261,-0.455074
"""000d118_4""","""Many people ha…","""Car-free citie…",3,false,""" The vauban pe…",-2.831292,-1.682352,0.927727,3.703798,-1.5447,-1.42729,0.498887
…,…,…,…,…,…,…,…,…,…,…,…,…
"""fffed3e_7""","""Venus is worth…","""Exploring Venu…",2,true,""" if a human is…",-3.155074,-2.27939,3.667197,3.379212,-1.769132,-2.329039,-0.179665
"""fffed3e_8""","""Venus is worth…","""Exploring Venu…",2,true,""" Just like wh…",-3.028397,-1.810841,3.698041,3.415874,-1.974055,-1.822579,-0.801626
"""fffed3e_9""","""Venus is worth…","""Exploring Venu…",2,true,""", But that onl…",-3.19897,-1.617192,1.122726,1.032075,-1.675161,0.400315,3.448996
"""fffed3e_10""","""Venus is worth…","""Exploring Venu…",2,true,""" Now they are …",-3.581976,-0.506038,2.807937,1.159544,-0.863772,-0.936937,1.046222


In [15]:
# Persist the joined chunk-level predictions (parquet is used instead of the CSV export below).
# result.write_csv('all_dt_pred.csv')
result.write_parquet('all_dt_pred.parquet')

In [16]:
# infer_models.models_dict['expD002']['cv_df'].to_csv('dt_pred.csv', index=False)
# t.write_csv('ko_sen.csv')