In [31]:
# Uncomment below section and run in case of re-connecting Colab

# !pip install git+https://github.com/haven-jeon/PyKoSpacing.git
# !pip install transformers
# !pip install git+https://github.com/ssut/py-hanspell.git
# !pip install py-hanspell

# from google.colab import drive
# drive.mount('/content/drive')

# %cd drive/MyDrive/MBTI
# !pwd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import os
import re

import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from pykospacing import Spacing
from hanspell import spell_checker
from transformers import AutoTokenizer

pd.set_option('display.width', 180)

#TODO:
# * train / text 다른 로직이 필요 (데이터 형식이 조금 다름)
# * 다른 Text preprocess 방식 도입 검토

class ddDataset(Dataset):
    def __init__(
        self,
        data_path: str,
        question_path: str,
        pretrained_url: str = "klue/bert-base",
        target_mbti: str = None,
        txt_preprocess: bool = True,
        normalize: bool = True,
        is_binary_classification: bool = True,
        is_bert: bool = True,
        is_train: bool = True
        ):
        """DataLoader for MBTI dataset

        Args:
            data_path (str): Data file path. Both csv and parguet files are allowed.
            question_path (str): Question file path. Both csv and parguet files are allowed.
            target_mbti (str): Target mbti for binary classification.
            txt_preprocess (bool, optional): Text preprocessing pipeline. (e.g. fixing grammar, removing punctuations). Defaults to True.
            normalize (bool, optional): Normalize numeric attribute. Defaults to True.
            is_binary_classification (bool, optional): Target of task. You can choose btw Multi-class classificaiton
                and 4 binary classification problem. Defaults to True.
            is_bert (bool, optional): Using BERT for language model or not. Defaults to True.
            is_train (bool, optional): Whether given data is for training or not. Defaults to True.
        """

        data = pd.read_csv(data_path) if data_path.endswith('.csv') else pd.read_parquet(data_path)
        self.question_data = pd.read_csv(question_path)

        print("data :", data.head())
        print("--------origin---------")
        data_origin = data.copy()

        # preprocess data
        if txt_preprocess:
            # data['Answer'] = data['Answer'].apply(self.fix_grammar)
            self.preprocess_text(data)
        if normalize:
            data['Age'] = (data['Age'] - data['Age'].mean()) / data['Age'].std()

        print("--------preprocess---------")

        # align dataset with binary classification (only for training data - test data doesn't contain 'MBTI' field)
        label_col = None
        if is_train and is_binary_classification:
            label_col = self.prepare_binary_classification(data, target_mbti)
            l_list = label_col.split('/')
            assert data[label_col].value_counts()[0] == \
                  data[label_col].value_counts()[1]

        print("data :", data.head())
        print("--------prepare_binary_classification---------")

        # prepare for language model
        #TODO: tokenizer class 인자로 넣어야 함 & tokenizer 만으로 [CLS], [SEP] 잘 붙는지 확인 필요
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_url)
        data = self.tokenize(data)


        selected_question = self.question_data.iloc[data_origin['Q_number'][0] - 1].Question
        selected_answer = data_origin['Answer'][0]
        print(selected_question)
        print(selected_answer)
        encoding = self.tokenizer(selected_question, selected_answer)
        print('input_ids :', encoding['input_ids'])
        print('token_type_ids :', encoding['token_type_ids'])
        print('attention_mask :', encoding['attention_mask'])
        decoding = self.tokenizer.convert_ids_to_tokens(encoding['input_ids'])
        print(decoding)

        print("data :", data.head())
        print("--------tokenize---------")

        # select columns for both training and inference
        #TODO: 테스트 데이터를 고려해서 유저 정보를 학습에 활용하지 않는 상황. 필요시 고쳐야 함.
        self.cat_col    = ['Gender']
        self.num_col    = ['Age']
        self.label_col  = label_col
        self.data       = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        #TODO:
        # * getitem --> BERT input dimension 확인 후 변환 & 다른 features 들과 붙여서 input instance 생성

        selected_data = self.data.iloc[idx]

        input_ids      = torch.tensor(selected_data['QandA']['input_ids'])
        token_type_ids = torch.tensor(selected_data['QandA']['token_type_ids'])
        attention_mask = torch.tensor(selected_data['QandA']['attention_mask'])
        label          = torch.tensor(selected_data[self.label_col])                        # [batch size   x   1]

        # txt_input = (input_ids, token_type_ids, attention_mask)                             # [batch size   x   sequence length]  <-- input_ids (actual input)
        cat_input = torch.tensor(selected_data[self.cat_col])                               # [batch size   x   # categorical features]
        num_input = torch.tensor(selected_data[self.num_col])                               # [batch size   x   # numerical features]

        # sample = {"txt_input": txt_input, "cat_input" : cat_input, "num_input" :num_input, "label": label}

        sample = selected_data['QandA']
        sample['cat_input'] = cat_input
        sample['num_input'] = num_input
        sample['label'] = label

        print('original len :', len(sample['input_ids']))

        return sample

    # ======================
    #    Helper Functions
    # ======================

    def fix_grammar(self, answer: str) -> str:
        answer = spell_checker.check(answer)
        return answer.checked

    def fix_spacing(self, answer: str) -> str:
        answer = answer.replace(" ", '')
        spacing = Spacing()
        return spacing(answer)

    def remove_punctuation(self, answer: str) -> str:
        #FIXME: remove punctuation 검증 필요
        answer = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]', '', answer)
        answer = re.sub(r"^\s+", '', answer)                    # remove space from start
        answer = re.sub(r'\s+$', '', answer)                    # remove space from the end
        return answer

    def preprocess_text(self, data: pd.DataFrame):
        # data['Answer'] = data['Answer'].apply(self.fix_grammar)
        data['Answer'] = data['Answer'].apply(self.fix_spacing)
        data['Answer'] = data['Answer'].apply(self.remove_punctuation)

    def prepare_binary_classification(self, data: pd.DataFrame, target_mbti: str) -> str:
        t_value, f_value = 1, 0
        target_mbti = target_mbti.upper()
        if target_mbti not in ['E', 'N', 'F', 'P']:
            if target_mbti in ['I', 'S', 'T', 'J']:
                t_value, f_value = 0, 1
            else:
                raise ValueError ("Wrong mbti type. Try different type instead.")

        data['MBTI'] = data['MBTI'].str     \
            .contains(target_mbti)          \
                .replace({True: t_value, False: f_value})

        col_name = None
        if target_mbti in ('E', 'I'):
            col_name = 'I/E'
        elif target_mbti in ('N', 'S'):
            col_name = 'S/N'
        elif target_mbti in ('F', 'T'):
            col_name = 'T/F'
        else:
            col_name = 'J/P'
        data.rename(columns = {'MBTI':col_name}, inplace=True)
        
        return col_name

    def tokenize(self, data: pd.DataFrame):

        def tokenize_per_sentence(series: pd.Series) -> str:
            selected_question = self.question_data.iloc[series['Q_number'] - 1].Question
            selected_answer = series['Answer']
            
            return self.tokenizer(selected_question, selected_answer)
        
        data['QandA'] =  data.apply(tokenize_per_sentence, axis=1)
        data = data.drop(labels='Answer', axis=1)
        return data

In [None]:
data_path = "data/example_train.csv"
question_path = "data/question_filtered.csv"

data = ddDataset(
   data_path,
   question_path,
   target_mbti = "E"
)

data :    Data_ID  User_ID  Gender  Age  MBTI  Q_number                                             Answer
0        1        1       1   30  INFP         1  <아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...
1        2        1       1   30  INFP         2  <중립>  다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하...
2        3        1       1   30  INFP         3  <그렇다> 감정 이입이 잘되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...
3        4        1       1   30  INFP         4  <중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다. 일의 변수가 생길 수 있고...
4        5        1       1   30  INFP         5  <아니다> 평정심을 유지 못 하는 편입니다. 머릿속은 백지화가 된 상태로 말도 제대...
--------origin---------
--------preprocess---------
data :    Data_ID  User_ID  Gender       Age  I/E  Q_number                                             Answer
0        1        1       1 -0.994778    0         1  <아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...
1        2        1       1 -0.994778    0         2  <중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하며...
2        3        1   

In [None]:
qanda = data.data.iloc[0]
print(qanda['QandA'])
print(torch.tensor(qanda['QandA']['input_ids']))
print(qanda[['Gender']])

{'input_ids': [2, 7267, 11187, 3824, 3949, 2138, 4577, 2075, 2182, 35, 4043, 2069, 9695, 2051, 3072, 2069, 904, 19093, 2522, 3626, 2470, 3782, 2116, 5333, 2097, 2182, 18, 3, 32, 3614, 2062, 34, 8115, 904, 18149, 6391, 4043, 2052, 1513, 2088, 4993, 3746, 2069, 6992, 2118, 1380, 2015, 3624, 2170, 6700, 2079, 3949, 2522, 10873, 26775, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
tensor([    2,  7267, 11187,  3824,  3949,  2138,  4577,  2075,  2182,    35,
         4043,  2069,  9695,  2051,  3072,  2069,   904, 19093,  2522,  3626,
         2470,  3782,  2116,  5333,  2097,  2182,    18,     3,    32,  3614,
         2062,    34,  8115,   904, 18149,  6391,  4

In [None]:
print(len(data))
print(data.data.head())

96
   Data_ID  User_ID  Gender       Age  I/E  Q_number                                        QandA
0        1        1       1 -0.994778    0         1  [input_ids, token_type_ids, attention_mask]
1        2        1       1 -0.994778    0         2  [input_ids, token_type_ids, attention_mask]
2        3        1       1 -0.994778    0         3  [input_ids, token_type_ids, attention_mask]
3        4        1       1 -0.994778    0         4  [input_ids, token_type_ids, attention_mask]
4        5        1       1 -0.994778    0         5  [input_ids, token_type_ids, attention_mask]


In [None]:
# 불러와지는지 확인
from dataloader import MBTIDataset

env_dict = {
    # ==== Arguments of dataset =====
    'train_path'        : './data/example_train.csv',
    'question_path'     : './data/question_filtered.csv',
    'target'            : 'E',
    'pretrained_url'    : "klue/bert-base",
    'padding_per_batch' : True,
    # ==== Arguments of dataloader =====
    'batch_size'        : 64,
    'shuffle'           : True
}

# Dataset
train_dataset = MBTIDataset(
    data_path           = env_dict['train_path'],
    question_path       = env_dict['question_path'],
    target_mbti         = env_dict['target'],
    pretrained_url      = env_dict['pretrained_url'],
    padding_per_batch   = env_dict['padding_per_batch'],
    is_train            = True
)

print(len(train_dataset))
print(train_dataset.data.head())

In [None]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# define collator function when padding per batch is needed
#TODO: data_collator 잘 작동하는지 확인 필요
data_collator = DataCollatorWithPadding(tokenizer=data.tokenizer)

# Dataloader
train_dataloder = DataLoader(
    data,
    batch_size = 16,
    shuffle = False,
    collate_fn = data_collator
)

In [None]:
for step, batch in enumerate(train_dataloder):
  # input_ids, token_type_ids, attention_mask = batch
  # print(batch["token_type_ids"])
  # print(torch.count_nonzero(batch["token_type_ids"][0]))
  print(batch)
  # print('revised len :', len(batch['input_ids']))
  # print(batch["input_ids"].shape)
  break

original len : 56
original len : 83
original len : 92
original len : 92
original len : 79
original len : 87
original len : 100
original len : 48
original len : 104
original len : 77
original len : 70
original len : 76
original len : 63
original len : 84
original len : 77
original len : 70
{'input_ids': tensor([[    2,  7267, 11187,  ...,     0,     0,     0],
        [    2,  3936,  3641,  ...,     0,     0,     0],
        [    2,  3656,  3611,  ...,     0,     0,     0],
        ...,
        [    2,  4051,  4362,  ...,     0,     0,     0],
        [    2,  3656,  3611,  ...,     0,     0,     0],
        [    2,  3971,  3746,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1

In [34]:
import os
import re
from typing import Union

import torch
import pandas as pd
from torch.utils.data import Dataset
from pykospacing import Spacing
from hanspell import spell_checker
from transformers import AutoTokenizer

#TODO:
# 1. train / text 다른 로직이 필요 (데이터 형식이 조금 다름)
# 2. 다른 Text preprocess 방식 도입 검토
#       ref : https://ebbnflow.tistory.com/246

class IIIDataset(Dataset):
    def __init__(
        self,
        data_path     : Union[str, pd.DataFrame],
        question_path : Union[str, pd.DataFrame],
        txt_preprocess: bool            = True,
        normalize     : bool            = True,
        pretrained_url: str             = "klue/bert-base",
        padding_per_batch               = True,
        is_binary_classification: bool  = True,
        is_bert       : bool            = True,
        is_train      : bool            = True
        ):
        """DataLoader for MBTI dataset

        Args:
            data_path (str): Data file path. Both csv and parguet files are allowed.
            question_path (str): Question file path. Both csv and parguet files are allowed.
            txt_preprocess (bool, optional): Text preprocessing pipeline. (e.g. fixing grammar, removing punctuations). Defaults to True.
            normalize (bool, optional): Normalize numeric attribute. Defaults to True.
            is_binary_classification (bool, optional): Target of task. You can choose btw Multi-class classificaiton
                and 4 binary classification problem. Defaults to True.
            is_bert (bool, optional): Using BERT for language model or not. Defaults to True.
            is_train (bool, optional): Whether given data is for training or not. Defaults to True.
        """

        def resolve_path(path:str)->pd.DataFrame:
            if path.endswith('.csv'):
                try:
                    df = pd.read_csv(path)
                except:
                    df = pd.read_csv(path, encoding='cp949')
            else:
                df = pd.read_parquet(path)
            return df

        data = None
        question_data = None
        # if given data_path is pd.Dataframe, we assume preprocessing is already applied to given Dataframe
        # so that it can skip all the processes below
        if not isinstance(data_path, pd.DataFrame):
            data = resolve_path(data_path)
            question_data = resolve_path(question_path)

            self.question_data = question_data

            # preprocess data
            if txt_preprocess:
                self.preprocess_txt(data)
            if normalize:
                data['Age'] = (data['Age'] - data['Age'].mean()) / data['Age'].std()

            #FIXME: 4개의 Dataset 생성은 번거로움. 4개의 Column 을 생성하도록 바꾸기
            # make dataset suitable for binary classification (only for training data - test data doesn't contain 'MBTI' field)
            label_cols = None
            if is_train and is_binary_classification:
                self.prepare_binary_classification(data)
                # if method right above works successfully, then data should contain same # 0 and 1.
                label_cols = ['I/E', 'S/N', 'T/F', 'J/P']
                for col in label_cols:
                    value_counted = data[col].value_counts()
                    assert value_counted[0] == value_counted[1]

            # prepare for language model
            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_url)
            self.padding_per_batch = padding_per_batch
            self.tokenize(data)

        else:
            data = data_path

        # set columns for both training and inference
        #TODO: 테스트 데이터를 고려해서 유저 정보를 학습에 활용하지 않는 상황. 필요시 고쳐야 함
        self.cat_col    = ['Gender']
        self.num_col    = ['Age']
        self.label_cols = ['I/E', 'S/N', 'T/F', 'J/P']
        self.is_train   = is_train
        self.data       = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        selected_data = self.data.iloc[idx]

        cat_input = torch.tensor(selected_data[self.cat_col])                               # [batch size   x   # categorical features]
        num_input = torch.tensor(selected_data[self.num_col])                               # [batch size   x   # numerical features]

        #TODO: Tokenize 의 결과가 자동으로 torch.Tensor 형태로 뽑힘. 이유는...모름
        sample              = selected_data['QandA']                                        # [batch size   x   sequence length]
        sample['cat_input'] = cat_input
        sample['num_input'] = num_input

        # Include label only for training cases
        if self.is_train:
            label     = torch.tensor(selected_data[self.label_cols])                         # [batch size   x   1]
            sample['label'] = label

        return sample

    # ======================
    #    Helper Functions
    # ======================

    def fix_grammar(self, answer: str) -> str:
        print(answer)
        answer = spell_checker.check(answer)
        return answer.checked

    def fix_spacing(self, answer: str) -> str:
        answer  = answer.replace(" ", '')
        spacing = Spacing()
        return spacing(answer)

    def remove_punctuation(self, answer: str) -> str:
        answer = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"]', '', answer)
        answer = re.sub(r'\s+', ' ', answer)        # remove extra space
        answer = re.sub(r"^\s+", '', answer)        # remove space from start
        answer = re.sub(r'\s+$', '', answer)        # remove space from the end
        return answer

    def preprocess_txt(self, data: pd.DataFrame):
        data['Answer'] = data['Answer'].apply(self.fix_grammar)         #FIXME: 해당 패키지의 서버가 가끔 응답 오류가 남. 그럴 땐 주석 처리 필요. 데이터 저장해둘걸!
        data['Answer'] = data['Answer'].apply(self.fix_spacing)
        data['Answer'] = data['Answer'].apply(self.remove_punctuation)

    def prepare_binary_classification(self, data: pd.DataFrame):
        one_list = ['E', 'N', 'F', 'P']
        zero_list = ['I', 'S', 'T', 'J']

        for idx, mbti in enumerate(one_list):
            data[mbti] = data['MBTI'].str       \
                .contains(mbti)                 \
                .replace({True: 1, False: 0})

            new_name = zero_list[idx] + '/' + mbti
            data.rename(columns = {mbti:new_name}, inplace=True)

    def tokenize(self, data: pd.DataFrame):

        def tokenize_per_sentence(series: pd.Series) -> str:
            selected_question = self.question_data.iloc[series['Q_number'] - 1].Question
            selected_answer = series['Answer']

            # print('Q : ', selected_question)
            # print('A : ', selected_answer)

            padding = False if self.padding_per_batch else 'longest'
            #TODO: 필요시 max_length 조절 필요
            return self.tokenizer(selected_question,
                                  selected_answer,
                                  padding=padding,
                                  return_tensors='pt')

        data['QandA'] =  data.apply(tokenize_per_sentence, axis=1)


In [36]:
result = spell_checker.check(u'안녕 하세요. 저는 한국인 입니다. 이문장은 한글로 작성됬습니다.')
result

JSONDecodeError: ignored

In [35]:
data_path = "data/example_train.csv"
question_path = "data/question_filtered.csv"

data_III = IIIDataset(
   data_path,
   question_path
)

<아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의 친구와만 지냅니다.


JSONDecodeError: ignored

In [17]:
print(data_III.data.head())
print(len(data_III))

   Data_ID  User_ID  Gender       Age  MBTI  Q_number                                             Answer  I/E  S/N  T/F  J/P                                        QandA
0        1        1       1 -0.994778  INFP         1  <아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
1        2        1       1 -0.994778  INFP         2  <중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하며...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
2        3        1       1 -0.994778  INFP         3  <그렇다> 감정이입이 잘 되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
3        4        1       1 -0.994778  INFP         4  <중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다일의 변수가 생길 수 있고 변...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
4        5        1       1 -0.994778  INFP         5  <아니다> 평정심을 유지 못하는 편입니다 머릿속은 백지화가 된 상태로 말도 제대로 ...    0    1    1    1  [input_ids, token_type_i

In [21]:
# Tokenize 의 결과가 tf 로 나오는지 확인
sample = data_III.data.iloc[0]['QandA']
print(type(sample['input_ids']))
print(type(sample['token_type_ids']))
print(type(sample['attention_mask']))

<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>


In [25]:
# pd.DataFrame 도 받아들이는지 확인 -> 잘 나온다
df = data_III.data

data_IIII = IIIDataset(
   df,
   question_path
)

In [26]:
print(data_IIII.data.head())
print(len(data_IIII))

# 이런 식으로 4개의 MBTI column 을 뽑은 후에 4개의 DataLoader 을 만들면 될 듯

   Data_ID  User_ID  Gender       Age  MBTI  Q_number                                             Answer  I/E  S/N  T/F  J/P                                        QandA
0        1        1       1 -0.994778  INFP         1  <아니다> 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
1        2        1       1 -0.994778  INFP         2  <중립> 다양한 관심사를 탐구하진 않지만 대체로 자연과 역사에 관련된 것을 좋아하며...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
2        3        1       1 -0.994778  INFP         3  <그렇다> 감정이입이 잘 되어 코미디 영화에서 사람이 울고 있을 때도 울기 때문에 ...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
3        4        1       1 -0.994778  INFP         4  <중립> 대비책을 세우긴 하는데 세우다가 마는 편입니다일의 변수가 생길 수 있고 변...    0    1    1    1  [input_ids, token_type_ids, attention_mask]
4        5        1       1 -0.994778  INFP         5  <아니다> 평정심을 유지 못하는 편입니다 머릿속은 백지화가 된 상태로 말도 제대로 ...    0    1    1    1  [input_ids, token_type_i

In [None]:
# test_data 에도 확인 -> 되는 듯!
test_path = "data/hackathon_test_for_user.csv"

data_IIII = IIIDataset(
   test_path,
   question_path
)

In [None]:
print(data_IIII.data.head())
print(len(data_IIII))