In [42]:
import pandas as pd
import numpy as np

# Read only the header row to learn the column names without loading any data.
col_names = pd.read_csv('../data/dataset.csv', nrows=0).columns

# Every column defaults to float64; the two targets and the engineered
# `year` column are then overridden to int16.
dtype_map = dict.fromkeys(col_names, np.float64)
dtype_map.update({'class': np.int16, 'bankruptcy_after_years': np.int16, 'year': np.int16})

df = pd.read_csv('../data/dataset.csv', dtype=dtype_map)
# Discard the three leading columns (selected by position, not by name).
df = df.drop(columns=df.columns[:3])
# keep=False removes *every* row that has an exact duplicate, not just the repeats.
# NOTE(review): confirm this is intended rather than keep='first'.
df = df.drop_duplicates(keep=False)
df.head()


Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64,class,bankruptcy_after_years,year
0,0.20055,0.37951,0.39641,2.0472,32.351,0.38825,0.24976,1.3305,1.1389,0.50494,...,0.87804,0.001924,8.416,5.1372,82.658,4.4158,7.4277,0,0,1
1,0.20912,0.49988,0.47225,1.9447,14.786,0.0,0.25834,0.99601,1.6996,0.49788,...,0.853,0.0,4.1486,3.2732,107.35,3.4,60.987,0,0,1
2,0.24866,0.69592,0.26713,1.5548,-1.1523,0.0,0.30906,0.43695,1.309,0.30408,...,0.76599,0.69484,4.9909,3.951,134.27,2.7185,5.2078,0,0,1
3,0.081483,0.30734,0.45879,2.4928,51.952,0.14988,0.092704,1.8661,1.0571,0.57353,...,0.94598,0.0,4.5746,3.6147,86.435,4.2228,5.5497,0,0,1
4,0.18732,0.61323,0.2296,1.4063,-7.3128,0.18732,0.18732,0.6307,1.1559,0.38677,...,0.86515,0.12444,6.3985,4.3158,127.21,2.8692,7.898,0,0,1


The following variables are mapped to **int16** data type:
a) **class** - binary output variable that is suitable for 2-class classification;
b) **bankruptcy_after_years** - discrete output variable with 6 possible values that is suitable for multiclass classification, range [0, 5], where 0 indicates the firm did not go bankrupt and 1..5 indicates the firm went bankrupt after 1..5 years;
c) **year** - engineered discrete input variable that indicates the year when the observation was made, range [1, 5].

In [39]:
# Number of rows per value of the binary target `class`.
df['class'].value_counts()

0    40534
1     2075
Name: class, dtype: int64

In [40]:
# Number of rows per value of the multiclass target `bankruptcy_after_years`.
df['bankruptcy_after_years'].value_counts()

0    40534
2      511
3      491
1      406
4      396
5      271
Name: bankruptcy_after_years, dtype: int64

In [41]:
# Number of rows per value of the `year` column.
df['year'].value_counts()

3    10329
2     9999
4     9628
1     6863
5     5790
Name: year, dtype: int64

In [33]:
# (number of rows, number of columns) of the cleaned frame.
df.shape

(42609, 67)

In [44]:
# Count of missing values in each column.
df.isna().sum()

Attr1                        8
Attr2                        8
Attr3                        8
Attr4                      132
Attr5                       89
Attr6                        8
Attr7                        8
Attr8                       92
Attr9                        9
Attr10                       8
Attr11                      44
Attr12                     132
Attr13                     123
Attr14                       8
Attr15                      36
Attr16                      93
Attr17                      92
Attr18                       8
Attr19                     124
Attr20                     123
Attr21                    5818
Attr22                       8
Attr23                     123
Attr24                     922
Attr25                       8
Attr26                      93
Attr27                    2736
Attr28                     796
Attr29                       8
Attr30                     123
                          ... 
Attr38                       8
Attr39  

In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Attr1,Attr2,Attr3,Attr4,Attr5,Attr6,Attr7,Attr8,Attr9,Attr10,...,Attr55,Attr56,Attr57,Attr58,Attr59,Attr60,Attr61,Attr62,Attr63,Attr64
count,42601.0,42601.0,42601.0,42477.0,42520.0,42601.0,42601.0,42517.0,42600.0,42601.0,...,42608.0,42486.0,42602.0,42529.0,42602.0,40485.0,42509.0,42486.0,42477.0,41813.0
mean,0.046504,0.593533,0.112906,6.373712,-387.0064,-0.059096,0.102496,11.40672,2.495925,0.582465,...,7533.172,-26.70974,-0.009476,30.5683,1.34262,456.2674,17.19515,1516.469,9.39301,72.504672
std,2.456032,5.896904,5.489866,298.1816,61807.98,7.268185,5.617762,468.427189,57.706526,13.221351,...,70252.29,5377.293,13.796459,5383.895,123.236523,32650.91,558.182979,140551.4,125.325358,2386.499058
min,-463.89,-430.87,-479.96,-0.40311,-11903000.0,-508.41,-517.48,-141.41,-3.496,-479.91,...,-1805200.0,-1108300.0,-1667.3,-198.69,-327.97,-12.44,-12.656,-2336500.0,-1.5432,-10677.0
25%,0.003347,0.27098,0.021584,1.0494,-49.15,0.0,0.005704,0.42694,1.0192,0.29379,...,29.50025,0.00917025,0.014879,0.87593,0.0,5.5503,4.5229,42.01025,3.1016,2.1927
50%,0.04966,0.47305,0.19718,1.5696,-0.80879,0.0,0.05957,1.0642,1.20215,0.50463,...,1071.05,0.0527745,0.12041,0.95119,0.005979,9.808,6.6574,71.1965,5.096,4.3283
75%,0.12987,0.69011,0.40396,2.7836,50.83125,0.086128,0.15127,2.5892,2.07785,0.70724,...,4879.0,0.12847,0.286195,0.99272,0.236988,20.325,10.451,116.96,8.6228,9.8948
max,94.28,480.96,28.336,53433.0,1250100.0,543.25,649.23,53432.0,9742.3,1099.5,...,6123700.0,293.15,552.64,1108300.0,23853.0,4818700.0,108000.0,25016000.0,23454.0,294770.0
