<a href="https://colab.research.google.com/github/zhangguanheng66/text/blob/arrow_dataset/examples/arraw_dataset/AI_hackathon_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
# Environment setup cell; the %%capture cell magic above captures this cell's output.

# Remove the torch* packages found under the Python 3.6 dist-packages directory.
!rm -r /usr/local/lib/python3.6/dist-packages/torch*;
# Install pre-release torch and torchtext from the nightly cu101 wheel index.
!pip install --pre torch torchtext -f https://download.pytorch.org/whl/nightly/cu101/torch_nightly.html;
# Force-reinstall the latest pyarrow and datasets releases (no versions are pinned).
!pip install --upgrade --force-reinstall pyarrow;
!pip install --upgrade --force-reinstall datasets;
# Install pytorch-lightning, which provides LightningModule / LightningDataModule / Trainer.
!pip install pytorch-lightning

## Prepare PyTorch-Lightning Module

In [2]:
import torch
from torch.nn import functional as F
from torch import nn
from pytorch_lightning.core.lightning import LightningModule

class TextClassificationModel(LightningModule):
  """Text classifier made of an ``nn.EmbeddingBag`` followed by one linear layer.

  Args:
    vocab_size: number of rows in the embedding table.
    embed_dim: width of each embedding vector (and input width of the linear layer).
    num_class: number of output scores produced per sample.
    learning_rate: learning rate handed to the SGD optimizer.
  """

  def __init__(self, vocab_size, embed_dim, num_class, learning_rate):
    super().__init__()
    self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
    self.fc = nn.Linear(embed_dim, num_class)
    self.lr = learning_rate
    self.init_weights()

  def init_weights(self):
    """Fill both weight matrices uniformly in [-0.5, 0.5] and zero the linear bias."""
    bound = 0.5
    for weight in (self.embedding.weight, self.fc.weight):
      weight.data.uniform_(-bound, bound)
    self.fc.bias.data.zero_()

  def forward(self, text, offsets):
    """Embed the flat token tensor ``text`` (split by ``offsets``) and score each bag."""
    return self.fc(self.embedding(text, offsets))

  def configure_optimizers(self):
    """Return a plain SGD optimizer over all parameters using the stored learning rate."""
    return torch.optim.SGD(self.parameters(), lr=self.lr)

  def training_step(self, batch, batch_idx):
    """Compute the cross-entropy loss for one ``(labels, texts, offsets)`` batch."""
    targets, tokens, starts = batch
    logits = self(tokens, starts)
    return F.cross_entropy(logits, targets)

## Prepare data

In [3]:
# import datasets as ds
# from pathlib import Path

# import pyarrow as pa
# import pandas as pd
# from torchtext.utils import download_from_url, unicode_csv_reader
# import io

# base_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/'
# train_filepath = download_from_url(base_url + 'train.csv')
# test_filepath = download_from_url(base_url + 'test.csv')

# def create_data_from_csv(data_path):
#     with io.open(data_path, encoding="utf8") as f:
#         reader = unicode_csv_reader(f)
#         for row in reader:
#             yield (int(row[0]), ' '.join(row[1:]))

# def convert_to_arrow(file_path, raw_data):
#     """ Write labels and texts into HF dataset"""
#     labels, texts = zip(*raw_data)    
#     return ds.Dataset.from_dict(
#         {
#             "labels": labels,
#             "texts": texts
#         }).save_to_disk(file_path)
    
# raw_train_data = list(create_data_from_csv(train_filepath))
# raw_test_data = list(create_data_from_csv(test_filepath))
# train_ds = convert_to_arrow('train_arrow', raw_train_data)
# test_ds = convert_to_arrow('test_arrow', raw_test_data)
# train_ds = ds.Dataset.load_from_disk('train_arrow') # raw dataset
# test_ds = ds.Dataset.load_from_disk('test_arrow')  # raw dataset

In [4]:
# # Build vocabulary
# from torchtext.experimental.vocab import build_vocab_from_iterator
# from torchtext.experimental.transforms import basic_english_normalize
# tokenizer = basic_english_normalize()
# vocab = build_vocab_from_iterator(iter(tokenizer(line)
#                                        for line in train_ds['texts']))
# tokenizer = tokenizer.to_ivalue() # no need to call to_ivalue soon
# vocab = vocab.to_ivalue() # no need to call to_ivalue soon

In [5]:
# # Process raw text data
# # Call built-in map func to process the raw dataset
# # the callable func must be picklable
# def process_raw_data(arrow_ds):
#   processed_arrow_ds = arrow_ds.map(function=lambda x: {'labels': int(x) - 1}, input_columns='labels')
#   processed_arrow_ds = processed_arrow_ds.map(function=lambda x: {'texts': vocab(tokenizer(x))}, input_columns='texts')
#   return processed_arrow_ds

# processed_train_ds = process_raw_data(train_ds)
# processed_test_ds = process_raw_data(test_ds)

## Prepare DataLoader

In [6]:
# import torch
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# def generate_batch(batch):
#     label_list, text_list, offsets = [], [], [0]
#     for item in batch:
#         #  print(item)
#          label_list.append(item['labels'])
#          processed_text = torch.tensor(item['texts'], dtype=torch.int64)
#          text_list.append(processed_text)
#          offsets.append(processed_text.size(0))
#     label_list = torch.tensor(label_list, dtype=torch.int64)
#     offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
#     text_list = torch.cat(text_list)
#     return label_list, text_list, offsets    

# train_dataloader = torch.utils.data.DataLoader(processed_train_ds, shuffle=True,
#                                                batch_size=16, num_workers=2,
#                                                collate_fn=generate_batch)
# test_dataloader = torch.utils.data.DataLoader(processed_test_ds, shuffle=True,
#                                               batch_size=16, num_workers=2,
#                                               collate_fn=generate_batch)

## [WIP] Prepare PL DataModule

In [25]:
import io
import torch
import datasets as ds
from torchtext.experimental.vocab import build_vocab_from_iterator
from torchtext.experimental.transforms import basic_english_normalize
from torchtext.utils import download_from_url, unicode_csv_reader
from pytorch_lightning import LightningDataModule

def create_data_from_csv(data_path):
    """Lazily yield ``(label, text)`` pairs from the CSV file at ``data_path``.

    The first column of every row is converted with ``int`` and used as the label;
    all remaining columns are joined with single spaces to form the text.
    The file is read as UTF-8 and stays open only while the generator is consumed.
    """
    with io.open(data_path, encoding="utf8") as csv_file:
        for record in unicode_csv_reader(csv_file):
            label = int(record[0])
            text = ' '.join(record[1:])
            yield label, text

def convert_to_arrow(file_path, raw_data):
    """Write labels and texts into a HF dataset saved at ``file_path`` and return it.

    Args:
        file_path: directory path handed to ``Dataset.save_to_disk``.
        raw_data: iterable of ``(label, text)`` pairs; it is consumed once.

    Returns:
        The in-memory ``ds.Dataset`` with the columns ``"labels"`` and ``"texts"``.
        The function previously returned the result of ``save_to_disk``, which left
        callers holding ``None`` instead of a usable dataset.
    """
    rows = list(raw_data)
    if rows:
        labels, texts = zip(*rows)
    else:
        # zip(*[]) yields nothing to unpack; fall back to two empty columns.
        labels, texts = (), ()
    dataset = ds.Dataset.from_dict(
        {
            "labels": list(labels),
            "texts": list(texts)
        })
    dataset.save_to_disk(file_path)
    return dataset

def process_raw_data(arrow_ds, tokenizer, vocab):
    """Turn a raw dataset into model-ready labels and token ids.

    Two ``map`` passes are chained, each given a single input column:
    the ``'labels'`` value is converted with ``int`` and reduced by one, and the
    ``'texts'`` value is replaced by ``vocab(tokenizer(text))``.

    Args:
        arrow_ds: object exposing ``map(function=..., input_columns=...)``.
        tokenizer: callable applied to each text value.
        vocab: callable applied to the tokenizer output.

    Returns:
        Whatever the second ``map`` call returns.
    """
    def shift_label(label):
        return {'labels': int(label) - 1}

    def numericalize(text):
        return {'texts': vocab(tokenizer(text))}

    relabeled = arrow_ds.map(function=shift_label, input_columns='labels')
    return relabeled.map(function=numericalize, input_columns='texts')

def generate_batch(batch):
    """Collate a list of samples into ``(labels, texts, offsets)`` tensors.

    Each sample is a mapping with a ``'labels'`` entry and a ``'texts'`` entry
    (a sequence of token ids).

    Returns:
        labels: int64 tensor with one label per sample.
        texts: all token ids concatenated into a single 1-D int64 tensor.
        offsets: start position of every sample inside ``texts``.
    """
    token_tensors = [torch.tensor(sample['texts'], dtype=torch.int64)
                     for sample in batch]
    targets = torch.tensor([sample['labels'] for sample in batch],
                           dtype=torch.int64)
    # Start offsets are the running total of the preceding sample lengths,
    # so the list begins with 0 and the last length is dropped.
    lengths = [0] + [tokens.size(0) for tokens in token_tensors]
    starts = torch.tensor(lengths[:-1]).cumsum(dim=0)
    return targets, torch.cat(token_tensors), starts

class TextClassificationDataModule(LightningDataModule):
    """LightningDataModule for the ``ag_news_csv`` train/test CSV files.

    Construction downloads both CSV files, writes them to disk as HF datasets
    (``'train_arrow'`` / ``'test_arrow'``) and builds the tokenizer and vocabulary,
    so ``len(data_module.vocab)`` is available before the model is created.

    Args:
        batch_size: number of samples per batch for every loader (default 16).
        num_workers: worker processes used by every loader (default 1).
    """

    TRAIN_ARROW_PATH = 'train_arrow'
    TEST_ARROW_PATH = 'test_arrow'

    def __init__(self, batch_size=16, num_workers=1):
        super().__init__()
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.base_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/'
        self.train_filepath = download_from_url(self.base_url + 'train.csv')
        self.test_filepath = download_from_url(self.base_url + 'test.csv')
        # Only the on-disk copies are used; the datasets are read back with
        # load_from_disk wherever they are needed.
        convert_to_arrow(self.TRAIN_ARROW_PATH,
                         list(create_data_from_csv(self.train_filepath)))
        convert_to_arrow(self.TEST_ARROW_PATH,
                         list(create_data_from_csv(self.test_filepath)))
        # Build the vocabulary from the training texts only.
        self.tokenizer = basic_english_normalize().to_ivalue()
        train_ds = ds.Dataset.load_from_disk(self.TRAIN_ARROW_PATH)
        self.vocab = build_vocab_from_iterator(
            self.tokenizer(line) for line in train_ds['texts']).to_ivalue()

    def setup(self, stage=None):
        """Load the raw datasets from disk and process them exactly once.

        Processing used to happen inside the dataloader methods, which re-applied
        ``process_raw_data`` to already processed data on every call (labels were
        shifted again and token ids were fed back into the tokenizer). Reloading
        the raw data here keeps repeated ``setup`` calls safe as well.
        """
        raw_train = ds.Dataset.load_from_disk(self.TRAIN_ARROW_PATH)
        raw_valid = ds.Dataset.load_from_disk(self.TEST_ARROW_PATH)
        self.train = process_raw_data(raw_train, self.tokenizer, self.vocab)
        self.valid = process_raw_data(raw_valid, self.tokenizer, self.vocab)

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader over ``dataset`` that collates with ``generate_batch``."""
        return torch.utils.data.DataLoader(dataset, shuffle=shuffle,
                                           batch_size=self.batch_size,
                                           num_workers=self.num_workers,
                                           collate_fn=generate_batch)

    def train_dataloader(self):
        """Return a shuffled loader over the processed training set (needs ``setup``)."""
        return self._make_loader(self.train, shuffle=True)

    def val_dataloader(self):
        """Return the validation loader under the hook name Lightning looks for.

        The order is not shuffled; evaluation does not benefit from shuffling.
        """
        return self._make_loader(self.valid, shuffle=False)

    def valid_dataloader(self):
        """Backward-compatible alias for ``val_dataloader``."""
        return self.val_dataloader()

# base_url = 'https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv/'
# train_filepath = download_from_url(base_url + 'train.csv')
# test_filepath = download_from_url(base_url + 'test.csv')

# def create_data_from_csv(data_path):
#     with io.open(data_path, encoding="utf8") as f:
#         reader = unicode_csv_reader(f)
#         for row in reader:
#             yield (int(row[0]), ' '.join(row[1:]))

# def convert_to_arrow(file_path, raw_data):
#     """ Write labels and texts into HF dataset"""
#     labels, texts = zip(*raw_data)    
#     return ds.Dataset.from_dict(
#         {
#             "labels": labels,
#             "texts": texts
#         }).save_to_disk(file_path)
    
# raw_train_data = list(create_data_from_csv(train_filepath))
# raw_test_data = list(create_data_from_csv(test_filepath))
# train_ds = convert_to_arrow('train_arrow', raw_train_data)
# test_ds = convert_to_arrow('test_arrow', raw_test_data)
# train_ds = ds.Dataset.load_from_disk('train_arrow') # raw dataset
# test_ds = ds.Dataset.load_from_disk('test_arrow')  # raw dataset

# # Build vocabulary
# from torchtext.experimental.vocab import build_vocab_from_iterator
# from torchtext.experimental.transforms import basic_english_normalize
# tokenizer = basic_english_normalize()
# vocab = build_vocab_from_iterator(iter(tokenizer(line)
#                                        for line in train_ds['texts']))
# tokenizer = tokenizer.to_ivalue() # no need to call to_ivalue soon
# vocab = vocab.to_ivalue() # no need to call to_ivalue soon

# def process_raw_data(arrow_ds):
#   processed_arrow_ds = arrow_ds.map(function=lambda x: {'labels': int(x) - 1}, input_columns='labels')
#   processed_arrow_ds = processed_arrow_ds.map(function=lambda x: {'texts': vocab(tokenizer(x))}, input_columns='texts')
#   return processed_arrow_ds

# processed_train_ds = process_raw_data(train_ds)
# processed_test_ds = process_raw_data(test_ds)

## Initiate PyTorch-Lightning Trainer

In [26]:
from pytorch_lightning import Trainer
LR = 5  # learning rate
NUM_CLASS = 4  # passed to the model as num_class
EMBED = 256  # passed to the model as embed_dim

# Constructing the data module builds the vocabulary, whose size fixes the
# number of rows in the model's embedding table.
data_module = TextClassificationDataModule()
model = TextClassificationModel(len(data_module.vocab), EMBED, NUM_CLASS, LR)
# Request one GPU only when CUDA is present; otherwise keep the Trainer default
# so the cell also runs on a CPU-only runtime instead of failing.
trainer = Trainer(gpus=1 if torch.cuda.is_available() else None,
                  max_epochs=3, progress_bar_refresh_rate=40)
trainer.fit(model, data_module)

GPU available: True, used: True
INFO:lightning:GPU available: True, used: True
TPU available: False, using: 0 TPU cores
INFO:lightning:TPU available: False, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type         | Params
-------------------------------------------
0 | embedding | EmbeddingBag | 24.5 M
1 | fc        | Linear       | 1.0 K 
INFO:lightning:
  | Name      | Type         | Params
-------------------------------------------
0 | embedding | EmbeddingBag | 24.5 M
1 | fc        | Linear       | 1.0 K 


HBox(children=(FloatProgress(value=0.0, max=120000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=120000.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…




1