In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any warnings raised while fitting the models
# below (e.g. solver convergence warnings) — consider narrowing the filter.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the dataset from a CSV file resolved relative to the working directory
df = pd.read_csv('6 class csv.csv')

In [3]:
df.head()

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M
3,2800,0.0002,0.16,16.65,0,Red,M
4,1939,0.000138,0.103,20.06,0,Red,M


In [4]:
df.isnull().sum()

Temperature (K)           0
Luminosity(L/Lo)          0
Radius(R/Ro)              0
Absolute magnitude(Mv)    0
Star type                 0
Star color                0
Spectral Class            0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 240 entries, 0 to 239
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Temperature (K)         240 non-null    int64  
 1   Luminosity(L/Lo)        240 non-null    float64
 2   Radius(R/Ro)            240 non-null    float64
 3   Absolute magnitude(Mv)  240 non-null    float64
 4   Star type               240 non-null    int64  
 5   Star color              240 non-null    object 
 6   Spectral Class          240 non-null    object 
dtypes: float64(3), int64(2), object(2)
memory usage: 13.3+ KB


In [7]:
# Preprocessing: inspect the categorical columns, then encode, split, and scale

In [8]:
df['Star color'].unique()

array(['Red', 'Blue White', 'White', 'Yellowish White', 'Blue white',
       'Pale yellow orange', 'Blue', 'Blue-white', 'Whitish',
       'yellow-white', 'Orange', 'White-Yellow', 'white', 'Blue ',
       'yellowish', 'Yellowish', 'Orange-Red', 'Blue white ',
       'Blue-White'], dtype=object)

In [9]:
df['Spectral Class'].unique()

array(['M', 'B', 'A', 'F', 'O', 'K', 'G'], dtype=object)

In [10]:
def onehot_encode(data, column, prefix):
    """Return a new frame in which `column` is replaced by indicator columns.

    One indicator column is produced per distinct value of `data[column]`,
    named `<prefix>_<value>`, and appended after the existing columns. The
    source column is then removed. The input frame is left untouched.
    """
    indicator_cols = pd.get_dummies(data[column], prefix=prefix)
    # Append the indicators first, then remove the categorical source column
    widened = pd.concat([data, indicator_cols], axis=1)
    return widened.drop(columns=column)

In [11]:
def preprocess_inputs(data, train_size=0.7, random_state=1):
    """Clean, encode, split and standardise the star dataset.

    Steps, in order:
      1. Collapse spelling / case / whitespace variants of ``'Star color'``
         onto a single canonical label.
      2. One-hot encode ``'Star color'`` (prefix ``Color``) and
         ``'Spectral Class'`` (prefix ``Class``).
      3. Separate the ``'Star type'`` target from the feature columns.
      4. Shuffle and split the rows into train and test partitions.
      5. Standardise every feature column with a ``StandardScaler`` fitted on
         the training partition only, then apply it to both partitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``'Star color'``, ``'Spectral Class'`` and
        ``'Star type'``; every remaining column is treated as a feature and
        passed to the scaler. The frame is copied, not modified.
    train_size : float or int, default 0.7
        Forwarded to ``train_test_split`` (fraction or absolute number of
        rows placed in the training partition).
    random_state : int or None, default 1
        Forwarded to ``train_test_split`` to make the shuffle reproducible.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled feature frames that keep the original row index and column
        labels.
    y_train, y_test : pandas.Series
        ``'Star type'`` values aligned with the corresponding feature frame.
    """
    data = data.copy()

    # Map inconsistent spellings of the same colour onto one canonical label
    color_mapping = {
        'white': 'White',
        'Blue ': 'Blue',
        'Blue white': 'Blue White',
        'Blue-white': 'Blue White',
        'Blue white ': 'Blue White',
        'Blue-White': 'Blue White',
        'yellow-white':'Yellowish White',
        'White-Yellow':'Yellowish White',
        'yellowish': 'Yellowish'
    }
    data['Star color'] = data['Star color'].replace(color_mapping)

    # One-hot encode the two categorical columns
    data = onehot_encode(data, column='Star color', prefix="Color")
    data = onehot_encode(data, column='Spectral Class', prefix="Class")

    # Split df into X and y
    y = data['Star type']
    X = data.drop('Star type', axis=1)

    # Train-test split (size and seed are caller-tunable; defaults match the
    # values that used to be hard-coded here)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X: fit on the training rows only so that the test rows do not
    # contribute to the mean / variance estimates
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [12]:
# Build the scaled train/test feature frames and their matching target series
X_train, X_test, y_train, y_test = preprocess_inputs(df)

In [13]:
X_train

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Color_Blue,Color_Blue White,Color_Orange,Color_Orange-Red,Color_Pale yellow orange,Color_Red,...,Color_Whitish,Color_Yellowish,Color_Yellowish White,Class_A,Class_B,Class_F,Class_G,Class_K,Class_M,Class_O
182,-0.768434,-0.560864,-0.404550,1.207543,-0.540655,-0.512989,-0.109764,-0.077382,0.0,1.061406,...,0.0,-0.077382,-0.175142,-0.289605,-0.540655,-0.223607,0.0,-0.13484,1.048809,-0.427900
59,-0.755787,0.504620,2.904125,-1.570978,-0.540655,-0.512989,-0.109764,-0.077382,0.0,1.061406,...,0.0,-0.077382,-0.175142,-0.289605,-0.540655,-0.223607,0.0,-0.13484,1.048809,-0.427900
168,0.754059,1.312749,-0.340610,-1.064657,1.849609,-0.512989,-0.109764,-0.077382,0.0,-0.942147,...,0.0,-0.077382,-0.175142,-0.289605,-0.540655,-0.223607,0.0,-0.13484,-0.953463,2.336993
221,0.248815,1.256032,-0.242155,-1.154008,1.849609,-0.512989,-0.109764,-0.077382,0.0,-0.942147,...,0.0,-0.077382,-0.175142,-0.289605,-0.540655,-0.223607,0.0,-0.13484,-0.953463,2.336993
138,-0.778792,-0.560864,-0.404092,0.695457,-0.540655,-0.512989,-0.109764,-0.077382,0.0,1.061406,...,0.0,-0.077382,-0.175142,-0.289605,-0.540655,-0.223607,0.0,-0.13484,1.048809,-0.427900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
137,-0.748918,-0.560864,-0.403622,0.890492,-0.540655,-0.512989,-0.109764,-0.077382,0.0,1.061406,...,0.0,-0.077382,-0.175142,-0.289605,-0.540655,-0.223607,0.0,-0.13484,1.048809,-0.427900
72,-0.780973,-0.560864,-0.404435,0.788651,-0.540655,-0.512989,-0.109764,-0.077382,0.0,1.061406,...,0.0,-0.077382,-0.175142,-0.289605,-0.540655,-0.223607,0.0,-0.13484,1.048809,-0.427900
140,0.321974,-0.560864,-0.404799,0.833807,-0.540655,1.949359,-0.109764,-0.077382,0.0,-0.942147,...,0.0,-0.077382,-0.175142,-0.289605,1.849609,-0.223607,0.0,-0.13484,-0.953463,-0.427900
235,3.104418,1.487215,2.497463,-1.433589,1.849609,-0.512989,-0.109764,-0.077382,0.0,-0.942147,...,0.0,-0.077382,-0.175142,-0.289605,-0.540655,-0.223607,0.0,-0.13484,-0.953463,2.336993


In [14]:
X_train.shape

(168, 21)

In [15]:
y_train

182    0
59     5
168    4
221    4
138    1
      ..
137    1
72     1
140    2
235    5
37     3
Name: Star type, Length: 168, dtype: int64

In [16]:
# Evaluation on the held-out test set

In [20]:
# Train a default logistic-regression classifier on the training partition,
# then report its accuracy on the held-out test partition
model = LogisticRegression().fit(X_train, y_train)

print(f"Test Set Accuracy: {model.score(X_test, y_test) * 100:.2f}%")

Test Set Accuracy: 94.44%


In [21]:
# K-fold cross-validation on the training set

In [22]:
# Five consecutive folds over the rows of X_train
kf = KFold(n_splits=5)

print("Split Indices")

# Show which positional row indices land in the train / test side of each fold
for split_no, (train_idx, test_idx) in enumerate(kf.split(X_train), start=1):
    print(f"\nSplit {split_no}:\n--------")
    print(f"\nTrain:\n{train_idx}")
    print(f"\nTest:\n{test_idx}\n")

Split Indices

Split 1:
--------

Train:
[ 34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69
  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87
  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105
 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123
 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141
 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
 160 161 162 163 164 165 166 167]

Test:
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33]


Split 2:
--------

Train:
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  68  69
  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87
  88  89  90  91  92  93  94  95  96  97  98  99 100 10

In [23]:
# Accuracy of one freshly trained classifier per fold
results = []

# NOTE(review): X_train was standardised inside preprocess_inputs using
# statistics from the whole training partition, so each validation fold has
# already contributed to the scaling applied here.
for train_idx, test_idx in kf.split(X_train):
    # kf.split yields positional indices, hence .iloc
    train_set = (X_train.iloc[train_idx, :], y_train.iloc[train_idx])
    test_set = (X_train.iloc[test_idx, :], y_train.iloc[test_idx])

    # Use a fold-local name so the global `model` fitted for the test-set
    # evaluation is not replaced by the last fold's classifier
    fold_model = LogisticRegression()
    fold_model.fit(train_set[0], train_set[1])
    results.append(fold_model.score(test_set[0], test_set[1]))

print("K-Fold Accuracies:")
for i, result in enumerate(results):
    print("Model {}: {:.2f}%".format(i + 1, result * 100))

K-Fold Accuracies:
Model 1: 100.00%
Model 2: 97.06%
Model 3: 100.00%
Model 4: 93.94%
Model 5: 96.97%


In [26]:
# Mean accuracy across the folds, expressed as a percentage
print(f"Average K-Fold Accuracy: {np.mean(results) * 100:.2f}%")

Average K-Fold Accuracy: 97.59%
