In [1]:
import pandas as pd
import matplotlib.pyplot as plt
# Path to the dataset CSV, given relative to the notebook's working directory.
data_file = 'star_classification.csv'
# Load the whole file into a single DataFrame; later cells read from `df`.
df = pd.read_csv(data_file)
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [6]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   obj_ID       100000 non-null  float64
 1   alpha        100000 non-null  float64
 2   delta        100000 non-null  float64
 3   u            100000 non-null  float64
 4   g            100000 non-null  float64
 5   r            100000 non-null  float64
 6   i            100000 non-null  float64
 7   z            100000 non-null  float64
 8   run_ID       100000 non-null  int64  
 9   rerun_ID     100000 non-null  int64  
 10  cam_col      100000 non-null  int64  
 11  field_ID     100000 non-null  int64  
 12  spec_obj_ID  100000 non-null  float64
 13  class        100000 non-null  object 
 14  redshift     100000 non-null  float64
 15  plate        100000 non-null  int64  
 16  MJD          100000 non-null  int64  
 17  fiber_ID     100000 non-null  int64  
dtypes: float64(10), int64(7),

#### Using information gain to select the most important features  

In [8]:
from sklearn.model_selection import train_test_split

# Target is the `class` label; the features are every remaining column except
# `obj_ID`.
Y = df['class']
X = df.drop(columns=['class', 'obj_ID'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

In [9]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each training feature and the class
# label. The estimator is randomised, so the seed is pinned to make the scores
# (and any ranking derived from them) reproducible across runs.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
mutual_info

array([0.03966045, 0.04352154, 0.10021772, 0.12172614, 0.0765968 ,
       0.11111502, 0.14735166, 0.14313406, 0.01179939, 0.0018791 ,
       0.00614583, 0.30172973, 0.80221314, 0.27611228, 0.19396157,
       0.04847214])

In [10]:
# Label each score with its feature name, then list the features from most to
# least informative.
mutual_info = pd.Series(mutual_info, index=X_train.columns)
mutual_info.sort_values(ascending=False)

redshift       0.802213
spec_obj_ID    0.301730
plate          0.276112
MJD            0.193962
z              0.147352
run_ID         0.143134
g              0.121726
i              0.111115
u              0.100218
r              0.076597
fiber_ID       0.048472
delta          0.043522
alpha          0.039660
rerun_ID       0.011799
field_ID       0.006146
cam_col        0.001879
dtype: float64

#### Selecting the 10 most significant features

In [24]:
from functools import partial

from sklearn.feature_selection import SelectKBest

# Keep the 10 features with the highest mutual information with the label.
# The scorer is randomised, so it is seeded here; without a fixed seed the
# selected columns can differ from one run to the next.
# NOTE: despite its name, `sel_five_cols` keeps 10 columns (k=10); the name is
# retained so that any other cell referring to it keeps working.
sel_five_cols = SelectKBest(partial(mutual_info_classif, random_state=42), k=10)
sel_five_cols.fit(X_train, y_train)

# Names of the columns the selector retained.
cols_list = X_train.columns[sel_five_cols.get_support()]

In [26]:
# Restrict both splits to the columns picked by the selector.
X_train, X_test = X_train[cols_list], X_test[cols_list]

In [29]:
# Confirm both splits now carry the same reduced set of columns.
print(f"{X_train.shape} {X_test.shape}")

(80000, 10) (20000, 10)


#### Now training the random forest classifier to see its performance

In [30]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, then rescale both
# splits with those training-derived bounds (training data lands in [-1, 1]).
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(X_train)
X_train, X_test = scaling.transform(X_train), scaling.transform(X_test)

In [34]:
from sklearn.ensemble import RandomForestClassifier

# Random forests are randomised (bootstrap sampling, feature sub-sampling), so
# the seed is fixed to make the reported accuracy reproducible.
model = RandomForestClassifier(n_estimators=40, random_state=42)
model.fit(X_train, y_train)
# Mean accuracy on the held-out split.
model.score(X_test, y_test)

0.9777

In [35]:
# Sweep the forest size from 10 to 90 trees in steps of 10 and report the test
# accuracy for each size. The seed is fixed so that differences between rows
# reflect n_estimators rather than run-to-run randomness.
# NOTE: these scores are measured on X_test, so choosing n_estimators from them
# makes the final test accuracy optimistic; prefer a validation split or
# cross-validation on the training data for tuning.
for n_estimators in range(10, 100, 10):
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=42)
    clf.fit(X_train, y_train)
    print(n_estimators, clf.score(X_test, y_test))

0.9766
0.9771
0.9776
0.9779
0.9779
0.9783
0.97765
0.9779
0.97735


#### Removing outliers 