In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

In [2]:
# Random seed passed as random_state to both train_test_split calls below.
SEED = 2024

In [3]:
# Read the raw CSV. header=None: the file is read without a header row,
# so pandas names the columns with integers.
df = pd.read_csv(
    "HTRU_2.csv",
    header=None,
    sep=",",
)

In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,140.5625,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0


In [5]:
# Number of rows in the raw frame.
df.shape[0]

17898

In [6]:
# Class balance of column 8, which is used as the label column below.
df.loc[:, 8].value_counts()

0    16259
1     1639
Name: 8, dtype: int64

In [7]:
# Pull the label column (column 8) out as its own Series.
labels = df.loc[:, 8]

In [8]:
# Display the extracted label Series.
labels

0        0
1        0
2        0
3        0
4        0
        ..
17893    0
17894    0
17895    0
17896    0
17897    0
Name: 8, Length: 17898, dtype: int64

In [9]:
# Rename column 8 to "Labels" in a single step. Dropping column 8 and then
# re-attaching it is not idempotent: a second run of the cell raises KeyError
# because column 8 no longer exists. rename() ignores a missing key, so
# re-running this cell is a harmless no-op. The column stays last, as before.
df = df.rename(columns={8: "Labels"})
df

Unnamed: 0,0,1,2,3,4,5,6,7,Labels
0,140.562500,55.683782,-0.234571,-0.699648,3.199833,19.110426,7.975532,74.242225,0
1,102.507812,58.882430,0.465318,-0.515088,1.677258,14.860146,10.576487,127.393580,0
2,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
3,136.750000,57.178449,-0.068415,-0.636238,3.642977,20.959280,6.896499,53.593661,0
4,88.726562,40.672225,0.600866,1.123492,1.178930,11.468720,14.269573,252.567306,0
...,...,...,...,...,...,...,...,...,...
17893,136.429688,59.847421,-0.187846,-0.738123,1.296823,12.166062,15.450260,285.931022,0
17894,122.554688,49.485605,0.127978,0.323061,16.409699,44.626893,2.945244,8.297092,0
17895,119.335938,59.935939,0.159363,-0.743025,21.430602,58.872000,2.499517,4.595173,0
17896,114.507812,53.902400,0.201161,-0.024789,1.946488,13.381731,10.007967,134.238910,0


In [10]:
# Class balance after the label column was renamed to "Labels".
df.loc[:, "Labels"].value_counts()

0    16259
1     1639
Name: Labels, dtype: int64

In [11]:
# Print column dtypes and non-null counts for the full frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17898 entries, 0 to 17897
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       17898 non-null  float64
 1   1       17898 non-null  float64
 2   2       17898 non-null  float64
 3   3       17898 non-null  float64
 4   4       17898 non-null  float64
 5   5       17898 non-null  float64
 6   6       17898 non-null  float64
 7   7       17898 non-null  float64
 8   Labels  17898 non-null  int64  
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [None]:
# Split the data into learning and test sets

In [12]:
# Hold out 10% of the rows as the test set; the rest is the learning set.
X_train_, X_test = train_test_split(
    df,
    random_state=SEED,
    test_size=0.1,
)

In [13]:
# Display the learning set (still includes the Labels column).
X_train_

Unnamed: 0,0,1,2,3,4,5,6,7,Labels
11583,93.734375,50.264221,0.437706,0.207748,24.388796,56.341590,2.145113,3.177422,0
15516,114.054688,52.423204,0.149742,-0.286647,2.520903,16.242839,9.467739,109.048145,0
3462,87.804688,39.197827,0.721681,2.058747,2.642977,14.066688,8.387087,103.761808,0
16313,142.265625,49.541095,-0.188452,-0.241917,2.411371,14.191170,10.187500,139.921849,0
2483,116.031250,56.691178,0.092955,-0.493398,4.192308,25.302976,6.420555,42.167027,0
...,...,...,...,...,...,...,...,...,...
16567,128.125000,48.357555,-0.005863,-0.100869,2.019231,15.440807,10.652478,137.918490,0
2494,135.554688,41.715706,-0.047587,0.910534,2.617893,17.409786,8.831467,89.780556,0
14875,124.976562,45.683946,-0.209657,0.109942,4.981605,26.593647,5.563959,31.319089,0
2688,125.304688,49.947873,0.053109,-0.068939,1.628763,12.247147,12.262394,195.921439,0


In [14]:
# Carve the validation set out of the learning set. The learning set holds
# 90% of the rows, so taking 0.1/0.9 of it yields about 10% of the full data.
X_train, X_valid = train_test_split(
    X_train_,
    random_state=SEED,
    test_size=0.1 / 0.9,
)

In [15]:
# Column labels of the training frame (features plus the label column).
X_train.columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 'Labels'], dtype='object')

In [16]:
# Feature columns: every column except the label. Selecting by name rather
# than by position ([:-1]) stays correct even if "Labels" is not the last
# column of the frame.
cols = X_train.columns.drop("Labels")
cols

Index([0, 1, 2, 3, 4, 5, 6, 7], dtype='object')

In [17]:
# Row counts of the training, validation and test sets.
tuple(map(len, (X_train, X_valid, X_test)))

(14318, 1790, 1790)

In [18]:
# Class balance in the training set.
X_train.loc[:, "Labels"].value_counts()

0    13014
1     1304
Name: Labels, dtype: int64

In [19]:
# Class balance in the validation set.
X_valid.loc[:, "Labels"].value_counts()

0    1627
1     163
Name: Labels, dtype: int64

In [20]:
# Class balance in the test set.
X_test.loc[:, "Labels"].value_counts()

0    1618
1     172
Name: Labels, dtype: int64

In [17]:
# Keep only the rows labelled 0 in the training set, i.e. drop the pulsars.
X_train = X_train[X_train["Labels"].eq(0)]

In [18]:
# Display the training set after the label-1 rows were removed.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,Labels
10814,134.507812,48.756089,-0.031636,-0.131966,0.764214,11.915867,18.587707,369.896839,0
8686,113.007812,55.334847,0.127174,0.012195,2.903010,16.116651,7.873506,79.343598,0
6211,141.195312,41.906529,0.042273,0.392180,2.548495,16.650554,8.976223,95.091538,0
1162,120.062500,48.030386,-0.028265,0.028709,30.127926,62.095604,1.827059,1.878257,0
2750,125.609375,51.790780,-0.016635,-0.091361,18.428094,52.283802,2.765830,6.372460,0
...,...,...,...,...,...,...,...,...,...
8685,126.125000,42.971284,-0.059258,0.265109,2.623746,19.325817,8.561876,80.169221,0
1354,113.398438,40.149397,0.322763,0.532786,2.985786,16.622323,8.176916,83.887394,0
15039,147.390625,43.347296,-0.142813,0.241722,2.954849,18.619306,8.543222,83.139408,0
16869,121.476562,46.342469,0.148239,0.185344,12.081940,38.737798,3.231075,9.644188,0


In [19]:
# Scaling: create the StandardScaler that the following cells fit on the
# training features and apply to all three sets.
scaler = StandardScaler()

In [20]:
# Fit the scaler on the training features only; at this point the training
# set contains just the label-0 rows.
scaler.fit(X_train.loc[:, cols])

StandardScaler()

In [21]:
# X_train, X_valid and X_test were produced by slicing earlier frames, so
# assigning columns on them directly can trigger pandas'
# SettingWithCopyWarning. Take independent copies before writing.
# NOTE: this cell overwrites the feature columns in place. Re-running it
# without first re-running the split cells scales already-scaled data.
X_train, X_valid, X_test = X_train.copy(), X_valid.copy(), X_test.copy()

# Apply the scaler fitted on the training features to all three sets.
X_train[cols] = scaler.transform(X_train[cols])
X_valid[cols] = scaler.transform(X_valid[cols])
X_test[cols] = scaler.transform(X_test[cols])

In [22]:
# Display the training set after scaling the feature columns.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,Labels
10814,1.026617,0.224251,-0.721116,-0.491003,-0.331863,-0.681371,2.305363,2.426766,0
8686,-0.204894,1.294407,-0.247401,-0.351706,-0.243892,-0.428786,-0.234857,-0.323333,0
6211,1.409674,-0.889956,-0.500653,0.015460,-0.258474,-0.396684,0.026585,-0.174278,0
1162,0.199196,0.106202,-0.711063,-0.335749,0.875893,2.335835,-1.668404,-1.056546,0
2750,0.516919,0.717899,-0.676371,-0.451768,0.394668,1.745871,-1.445831,-1.014008,0
...,...,...,...,...,...,...,...,...,...
8685,0.546453,-0.716754,-0.803512,-0.107324,-0.255379,-0.235826,-0.071652,-0.315519,0
1354,-0.182519,-1.175786,0.336025,0.151322,-0.240488,-0.398381,-0.162922,-0.280326,0
15039,1.764539,-0.655589,-1.052749,-0.129923,-0.241760,-0.278307,-0.076075,-0.287406,0
16869,0.280193,-0.168369,-0.184564,-0.184398,0.133645,0.931377,-1.335527,-0.983041,0


In [23]:
# Per-class row counts for the training, validation and test sets, in that order.
tuple(part["Labels"].value_counts() for part in (X_train, X_valid, X_test))

(0    13014
 Name: Labels, dtype: int64,
 0    1627
 1     163
 Name: Labels, dtype: int64,
 0    1618
 1     172
 Name: Labels, dtype: int64)

In [24]:
# Row counts of the training, validation and test sets after filtering.
tuple(map(len, (X_train, X_valid, X_test)))

(13014, 1790, 1790)

In [26]:
# Drop the label column from the training set, which holds only label-0 rows.
# errors="ignore" makes the cell safe to re-run: a second execution is a
# no-op instead of raising KeyError for the already-removed column.
X_train = X_train.drop(columns="Labels", errors="ignore")

In [27]:
# Write the three sets as space-separated files with no header and no index.
# The training file has features only; the validation and test files keep
# the Labels column as their last field.
X_train.to_csv("htru2_train.csv", sep=" ", index=False, header=None)
X_valid.to_csv("htru2_valid.csv", sep=" ", index=False, header=None)
X_test.to_csv("htru2_test.csv", sep=" ", index=False, header=None)
# Row counts of each set, followed by the number of training columns.
len(X_train), len(X_valid), len(X_test), X_train.shape[1]

(13014, 1790, 1790, 8)

In [28]:
# Column labels of the validation and test frames; only X_train had its
# Labels column dropped above.
X_valid.columns, X_test.columns

(Index([0, 1, 2, 3, 4, 5, 6, 7, 'Labels'], dtype='object'),
 Index([0, 1, 2, 3, 4, 5, 6, 7, 'Labels'], dtype='object'))