In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [45]:
# Attribute names, in file order, from the data specification provided by the dataset authors
import io

columns = ["seismic", "seismoacoustic", "shift", "genergy", "gpuls", "gdenergy", "gdpuls", "ghazard", "nbumps",
          "nbumps2", "nbumps3", "nbumps4", "nbumps5", "nbumps6", "nbumps7", "nbumps89", "energy", "maxenergy", "class"]

# The file is ARFF, not CSV: besides the comma-separated data rows it may contain '%' comment lines
# and '@relation' / '@attribute' / '@data' declarations. Those lines are filtered out here, so the
# file loads without manual editing; a copy whose header was already removed by hand loads the same way.
# "utf-8-sig" behaves like UTF-8 but also drops a leading byte-order mark if an editor added one.
with open("seismic-bumps.arff", encoding="utf-8-sig") as arff_file:
    csv_rows = [line for line in arff_file if not line.lstrip().startswith(("%", "@"))]

# The file carries no CSV header row, so the column names are supplied explicitly.
data = pd.read_csv(io.StringIO("".join(csv_rows)), names=columns)

In [46]:
# Sanity check of the import: print the loaded table (pandas abbreviates long/wide frames with "...")
print(data)

     seismic seismoacoustic shift  genergy  gpuls  gdenergy  gdpuls ghazard  \
0          a              a     N    15180     48       -72     -72       a   
1          a              a     N    14720     33       -70     -79       a   
2          a              a     N     8050     30       -81     -78       a   
3          a              a     N    28820    171       -23      40       a   
4          a              a     N    12640     57       -63     -52       a   
...      ...            ...   ...      ...    ...       ...     ...     ...   
2579       b              a     W    81410    785       432     151       b   
2580       b              a     W    42110    555       213     118       a   
2581       b              a     W    26960    540       101     112       a   
2582       a              a     W    16130    322         2       2       a   
2583       a              a     W    12750    235       -10     -10       a   

      nbumps  nbumps2  nbumps3  nbumps4  nbumps5  n

In [47]:
# Split the table into features (every column except the last) and the target (the "class" column).
# .copy() makes X an independent DataFrame instead of a slice of `data`, so later column
# assignments on X are unambiguous and do not raise SettingWithCopyWarning.
X = data.iloc[:, 0:-1].copy()
y = data["class"]
print("X: ", X)
print("Y: ", y)

X:       seismic seismoacoustic shift  genergy  gpuls  gdenergy  gdpuls ghazard  \
0          a              a     N    15180     48       -72     -72       a   
1          a              a     N    14720     33       -70     -79       a   
2          a              a     N     8050     30       -81     -78       a   
3          a              a     N    28820    171       -23      40       a   
4          a              a     N    12640     57       -63     -52       a   
...      ...            ...   ...      ...    ...       ...     ...     ...   
2579       b              a     W    81410    785       432     151       b   
2580       b              a     W    42110    555       213     118       a   
2581       b              a     W    26960    540       101     112       a   
2582       a              a     W    16130    322         2       2       a   
2583       a              a     W    12750    235       -10     -10       a   

      nbumps  nbumps2  nbumps3  nbumps4  nbumps

In [48]:
from sklearn.preprocessing import LabelEncoder
# The features seismic, seismoacoustic, shift and ghazard are categorical data consisting of characters.
# They must be converted to numeric values; each column gets its own freshly fitted LabelEncoder.
# assign() returns a new DataFrame (column order is kept) rather than writing into X in place,
# so this works even if X is a slice of `data`, without a SettingWithCopyWarning.
X = X.assign(**{
    name: LabelEncoder().fit_transform(X[name])
    for name in ("seismic", "seismoacoustic", "shift", "ghazard")
})
print(X)

      seismic  seismoacoustic  shift  genergy  gpuls  gdenergy  gdpuls  \
0           0               0      0    15180     48       -72     -72   
1           0               0      0    14720     33       -70     -79   
2           0               0      0     8050     30       -81     -78   
3           0               0      0    28820    171       -23      40   
4           0               0      0    12640     57       -63     -52   
...       ...             ...    ...      ...    ...       ...     ...   
2579        1               0      1    81410    785       432     151   
2580        1               0      1    42110    555       213     118   
2581        1               0      1    26960    540       101     112   
2582        0               0      1    16130    322         2       2   
2583        0               0      1    12750    235       -10     -10   

      ghazard  nbumps  nbumps2  nbumps3  nbumps4  nbumps5  nbumps6  nbumps7  \
0           0       0        0  

In [50]:
# Standardize every feature column with scikit-learn's StandardScaler
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Learn the scaling statistics from X, then apply them to the same data
scaler.fit(X)
X = scaler.transform(X)
# TODO: X is now a plain NumPy array, so the column labels are gone; turn it back into a
# DataFrame, or preserve the existing DataFrame structure some other way
print(X)

[[-0.73230209 -0.77142023 -1.34374329 ...  0.         -0.24332671
  -0.22108685]
 [-0.73230209 -0.77142023 -1.34374329 ...  0.         -0.14551225
  -0.11774749]
 [-0.73230209 -0.77142023 -1.34374329 ...  0.         -0.24332671
  -0.22108685]
 ...
 [ 1.36555667 -0.77142023  0.74418976 ...  0.         -0.24332671
  -0.22108685]
 [-0.73230209 -0.77142023  0.74418976 ...  0.         -0.24332671
  -0.22108685]
 [-0.73230209 -0.77142023  0.74418976 ...  0.         -0.24332671
  -0.22108685]]
