# Polish Bankruptcy Modelling

## Introduction

In [None]:
!pip install -U torchdata imbalanced-learn

In [None]:
!pip install iopath

In [None]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
device = torch.device(device)

cpu


Defining some custom pipeline components. 

- A custom ZipFile extractor so zip files are need not be saved to disk, but can be extracted "mid-stream". 
- An ARFF file format reader 
- Class balancer. 

In [None]:
from typing import Iterable, TextIO, Callable, Tuple, BinaryIO, List, Dict
from zipfile import ZipFile 
import io 

from torchdata.datapipes.iter import IterDataPipe
from torchdata.datapipes import functional_datapipe 

from scipy.io import arff

import numpy as np 
from numpy.typing import ArrayLike


@functional_datapipe('extract_from_zipfile')
class ExtractFromZipFile(IterDataPipe): 
    def __init__(self, dp: Iterable[Tuple[str, BinaryIO]], files:Dict[str, List[str]]): 
        self.dp = dp 
        self.files = files 

    def __iter__(self): 
        for label, bytestring in self.dp:
            for fp in self.files[label]: 
                with ZipFile(io.BytesIO(bytestring)) as z: 
                    data = z.read(fp)
                yield fp, data


@functional_datapipe('read_arff')
class ReadArffDataPipe(IterDataPipe): 
    def __init__(self, dp: Iterable[Tuple[str, TextIO]], featurenames, labelname): 
        self.dp = dp 
        self.featurenames = featurenames 
        self.labelname = labelname

    def __iter__(self): 
        for path, stream in self.dp: 
            data, metadata = arff.loadarff(stream) 
            yield from data


@functional_datapipe('balance') 
class BalancerDataPipe(IterDataPipe): 
    def __init__(self, dp: Iterable[Tuple[ArrayLike, ArrayLike]], balancer_factory, 
                 sampling_strategy:float=1., **balancer_config):
        self.dp = dp 
        self.balancer = balancer_factory(sampling_strategy=sampling_strategy,
                                         **balancer_config) 
        self.buff = []


    def __iter__(self): 
        self.buff = list(self.dp)
        X, y = zip(*self.buff)
        X, y = np.r_['-1', X], np.r_['c', y] 
        X_res, y_res = self.balancer.fit_resample(X, y) 
        yield from zip(X_res, y_res)

    def __len__(self): 
        raise NotImplemented()


In [87]:
import numpy as np 

dataset = np.array([(1,3, 'a'), (2,0, 'c')], dtype=[('1', float), ('2', float), ('class', 'S1')])

In [140]:
features = dataset[['1', '2']]
next(iter(features))

(1., 3.)

In [None]:
from sklearn.preprocessing import StandardScaler 
from sklearn.impute import SimpleImputer 
from imblearn.over_sampling import SVMSMOTE 


svm_smote = SVMSMOTE(sampling_strategy=0.8, random_state=42)
imputer = SimpleImputer() 
scale = StandardScaler()

In [80]:
from torchdata.datapipes.iter import Zipper, IoPathFileOpener, IterableWrapper, HttpReader

URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip'

def reshaper(seq): 
    return np.array(seq).reshape(1,-1)

def feature_label_split(dp, sequence_length):
    *feature_dp, label_dp = dp.unzip(sequence_length=sequence_length)
    return Zipper(*feature_dp).map(reshaper), label_dp

def apply_balancer(feature_dp, label_dp, balancer): 
    return Zipper(feature_dp, label_dp).balance(balancer) 

# datapipe = FileLister(root='data', ).open_files(mode='t').read_arff().shuffle(buffer_size=5000)
source = IterableWrapper([URL])
datapipe = HttpReader(source).read_from_stream()

In [81]:
lbl, bs = next(iter(datapipe))
lbl

'https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip'

In [77]:
from io import BytesIO

In [78]:
with ZipFile(BytesIO(bs)) as z: 
  s1 = z.read('1year.arff')

In [67]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip 


--2022-11-18 11:11:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8834471 (8.4M) [application/x-httpd-php]
Saving to: ‘data.zip.1’


2022-11-18 11:11:55 (13.2 MB/s) - ‘data.zip.1’ saved [8834471/8834471]



In [45]:
!unzip data.zip

Archive:  data.zip
  inflating: 1year.arff              
  inflating: 2year.arff              
  inflating: 3year.arff              
  inflating: 4year.arff              
  inflating: 5year.arff              


In [60]:
from torchdata.datapipes.iter import ZipArchiveLoader

datapipe = IoPathFileOpener(source)

In [73]:
from zipfile import ZipFile 

with ZipFile('./data.zip') as z: 
  s1 = z.read('2year.arff')




In [74]:
s1

b"@relation '1year-weka.filters.unsupervised.instance.SubsetByExpression-Enot ismissing(ATT20)'\n\n@attribute Attr1 numeric\n@attribute Attr2 numeric\n@attribute Attr3 numeric\n@attribute Attr4 numeric\n@attribute Attr5 numeric\n@attribute Attr6 numeric\n@attribute Attr7 numeric\n@attribute Attr8 numeric\n@attribute Attr9 numeric\n@attribute Attr10 numeric\n@attribute Attr11 numeric\n@attribute Attr12 numeric\n@attribute Attr13 numeric\n@attribute Attr14 numeric\n@attribute Attr15 numeric\n@attribute Attr16 numeric\n@attribute Attr17 numeric\n@attribute Attr18 numeric\n@attribute Attr19 numeric\n@attribute Attr20 numeric\n@attribute Attr21 numeric\n@attribute Attr22 numeric\n@attribute Attr23 numeric\n@attribute Attr24 numeric\n@attribute Attr25 numeric\n@attribute Attr26 numeric\n@attribute Attr27 numeric\n@attribute Attr28 numeric\n@attribute Attr29 numeric\n@attribute Attr30 numeric\n@attribute Attr31 numeric\n@attribute Attr32 numeric\n@attribute Attr33 numeric\n@attribute Attr34 n

In [None]:
from torchdata.datapipes.iter import Zipper, HttpReader, IterableWrapper

URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00365/data.zip'

def reshaper(seq): 
    return np.array(seq).reshape(1,-1)

def feature_label_split(dp, sequence_length):
    *feature_dp, label_dp = dp.unzip(sequence_length=sequence_length)
    return Zipper(*feature_dp).map(reshaper), label_dp

def apply_balancer(feature_dp, label_dp, balancer): 
    return Zipper(feature_dp, label_dp).balance(balancer) 

# datapipe = FileLister(root='data', ).open_files(mode='t').read_arff().shuffle(buffer_size=5000)
source = IterableWrapper([URL])
datapipe = HttpReader(source).decompress(file_type='*.zip') 

datapipe = datapipe.map(float, input_col='class')
train, test = datapipe.random_split(weights={'train': 0.8, 'test':0.2}, seed=42, total_length=43405)

train_feature_dp, train_label_dp = feature_label_split(train, sequence_length=65)

X= DataLoader(train_feature_dp, batch_size=5000) 
X= next(iter(X))
X= X.reshape(5000, 64)

imputer.fit(X) 
scale.fit(X) 

train_feature_dp = train_feature_dp.map(imputer.transform).map(scale.transform)
balanced_train = apply_balancer(train_feature_dp, train_label_dp, svm_smote)

train_dataloader = DataLoader(balanced_train, batch_size=32, shuffle=True)

test_feature_dp, test_label_dp = feature_label_split(test, sequence_length=65)
test_dataloader = DataLoader(test, batch_size=32)

In [None]:
import torch 
from torch import nn 
import torch.nn.functional as F


class SimpleNet(nn.Module): 
    def __init__(self, n_features, n_hidden, n_out): 
        super().__init__()
        self.layer1 = nn.Linear(n_features, n_hidden, dtype=torch.float64) 
        self.layer2 = nn.Linear(n_hidden, n_out, dtype=torch.float64) 
        self.relu = nn.ReLU() 

    def forward(self, x): 
        x = self.relu(self.layer1(x)) 
        x = torch.sigmoid(self.layer2(x)) 
        return x 



In [None]:
import torch 
from torch import nn 
import torch.nn.functional as F


class SimpleMultiLayerPerceptron(nn.Module): 
    def __init__(self, *n_hidden_units): 
        super().__init__()
        self.layers = [nn.Linear(in_, out, dtype=torch.float) for in_, out in 
                       zip(n_hidden_units, n_hidden_units[1:])]
        self.relu = nn.ReLU() 

    def forward(self, x): 
        for layer in self.layers:
          x = self.relu(layer(x))
        x = torch.sigmoid(x) 
        return x 



In [None]:
from torch.nn import BCELoss
from torch.optim import Adam

net = SimpleNet(64, 10, 1).to(device)

loss_fn = BCELoss()
optimizer = Adam(net.parameters(), lr=0.05, weight_decay=0.1 ) 

In [None]:
MAX_EPOCHS = 2
for epoch in range(MAX_EPOCHS): 
    for it, (batch, labels) in enumerate(train_dataloader): 
        output = net(batch) 
        loss = loss_fn(output, labels) 

        optimizer.zero_grad() 
        loss.backward()
        optimizer.step()

        if it % 200 == 0:
            loss = loss.item()
            print(f'Iter: {it: >4}  Loss:{loss: >0.12f}')
            
    recall, precis = 0, 0
    with torch.no_grad(): 
        for batch, labels in test_dataloader: 
            pred = net(batch) 
            pred = (pred >= 0.5).type(torch.float64)
            recall += (pred[labels==1] == labels[labels==1]).type(torch.float64).sum().item()
            precis += (pred[pred==1] == labels[pred==1]).type(torch.float64).sum().item()
    
    recall /= len(test_dataloader)
    precis /= len(test_dataloader)
    print(f"Recall: {recall: >0.3f} Precision: {precis: >0.3f}")



  0%|          | 0/2172 [00:00<?, ?it/s]

Iter:    0  Loss:0.846033183566
Iter:  200  Loss:0.132341444050
Iter:  400  Loss:0.232565873371


KeyboardInterrupt: 