In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Path to the star dataset CSV inside the Kaggle input directory.
filename = '/kaggle/input/star-dataset/6 class csv.csv'

# Read the CSV into the DataFrame that the rest of the notebook works on.
df = pd.read_csv(filepath_or_buffer=filename)

In [3]:
# Preview the first five rows to check whether any column needs preprocessing.
df.head(n=5)

Unnamed: 0,Temperature (K),Luminosity(L/Lo),Radius(R/Ro),Absolute magnitude(Mv),Star type,Star color,Spectral Class
0,3068,0.0024,0.17,16.12,0,Red,M
1,3042,0.0005,0.1542,16.6,0,Red,M
2,2600,0.0003,0.102,18.7,0,Red,M
3,2800,0.0002,0.16,16.65,0,Red,M
4,1939,0.000138,0.103,20.06,0,Red,M


# Process the data before training the model. The Spectral Class and Star color columns have string data that needs to be converted to integers before we can begin training the model.

In [4]:
# Integer code for each spectral class letter (codes are assigned in the order listed here).
spectral_class_codes = {
    'M': 0,
    'B': 1,
    'O': 2,
    'A': 3,
    'F': 4,
    'K': 5,
    'G': 6,
}

# Swap the class letters in the column for their integer codes.
df['Spectral Class'] = df['Spectral Class'].replace(spectral_class_codes)

In [5]:
# Normalise the colour labels so that values differing only in letter case,
# surrounding whitespace, or space-vs-dash separators end up identical:
# upper-case everything, trim the ends, then turn inner spaces into dashes.
# The vectorised .str accessor does this in one chain and passes missing values
# through as NaN instead of raising, unlike calling str methods via apply().
processed = (
    df['Star color']
    .str.upper()
    .str.strip()
    .str.replace(" ", "-", regex=False)  # literal replacement, not a regex
)

df['Star color'] = processed

In [6]:
# List the distinct colour labels present in the column at this point.
unique_colors = df['Star color'].unique()
print(unique_colors)

['RED' 'BLUE-WHITE' 'WHITE' 'YELLOWISH-WHITE' 'PALE-YELLOW-ORANGE' 'BLUE'
 'WHITISH' 'YELLOW-WHITE' 'ORANGE' 'WHITE-YELLOW' 'YELLOWISH' 'ORANGE-RED']


In [7]:
# Map each distinct colour label to an integer code, numbered from 0
# in the order the labels are returned by unique().
color_numeric = {
    label: code for code, label in enumerate(df['Star color'].unique())
}

# Swap the colour labels in the column for their integer codes.
df['Star color'] = df['Star color'].replace(color_numeric)

In [8]:
from sklearn.model_selection import train_test_split

# The label is the star type; the features are every other column.
y = df['Star type']
x = df.drop(columns='Star type')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=10
)

# Begin training the model. We choose a Random Forest Classifier for its strength in handling classification tasks.

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that a fresh Restart & Run All builds the same trees, and
# therefore reports the same accuracy and predictions, every time. Without a
# seed the forest is re-randomised on each run. The value matches the seed
# already used for the train/test split.
clf = RandomForestClassifier(random_state=10)

In [10]:
# Fit the random forest on the training split.
clf.fit(X=x_train, y=y_train)

RandomForestClassifier()

In [11]:
# Predict the star type of every row in the held-out test features.
y_pred = clf.predict(X=x_test)

In [12]:
from sklearn.metrics import accuracy_score

# Compare the predictions with the true test labels and report the accuracy score.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 1.0


### The model reaches an accuracy of 1.0 on the held-out test set (20% of the data)! It is now trained and ready to make predictions.

# Make a prediction for the Sun with the trained model.

In [13]:
# Feature values describing the Sun, keyed in the same order as the training feature columns.
sun_data = {
    'Temperature (K)': 5778,
    'Luminosity(L/Lo)': 1,
    'Radius(R/Ro)': 1,
    'Absolute magnitude(Mv)': 4.83,
    # Look the colour code up by label instead of hard-coding 10: the codes are
    # numbered by the order in which labels appear in the data, so a literal
    # would silently point at a different colour if the CSV rows changed.
    'Star color': color_numeric['YELLOWISH'],
    # 6 is the code given to spectral class 'G' when that column was encoded.
    'Spectral Class': 6,
}

# One-row DataFrame so the classifier receives named feature columns.
sun = pd.DataFrame(sun_data, index=[0])

In [14]:
# Classify the Sun with the trained forest and show the predicted star type.
sun_pred = clf.predict(X=sun)
print(sun_pred)

[3]


### The model correctly predicted the Sun's star type. The Sun is a main sequence star, which this dataset encodes as star type 3.

# Now, we will find the row of data that is most similar to the Sun. We will do this using scikit-learn's `NearestNeighbors` class.

In [15]:
# Add the Sun's star type and arrange the columns in the order listed below,
# so the row has the same layout as the rows of df.
dataset_columns = [
    'Temperature (K)',
    'Luminosity(L/Lo)',
    'Radius(R/Ro)',
    'Absolute magnitude(Mv)',
    'Star type',
    'Star color',
    'Spectral Class',
]
sun = sun.assign(**{'Star type': [3]}).reindex(columns=dataset_columns)

In [16]:
from sklearn.neighbors import NearestNeighbors

# Neighbour search that returns the 10 closest rows, backed by a ball tree.
nn = NearestNeighbors(n_neighbors=10, algorithm='ball_tree')

# Index every row of the dataset. NearestNeighbors is unsupervised: fit() ignores
# its second argument, so the column labels previously passed there are dropped.
nn.fit(df)

# NOTE(review): distances are computed on the raw, unscaled columns, so columns
# with large magnitudes (e.g. temperature, in the thousands) likely dominate the
# notion of "similarity". Consider standardising the features before fitting —
# TODO confirm intent.

# Query with the Sun's row: `indices` holds the positions of the nearest rows
# (closest first) and `distances` the matching distances.
input_data = sun
distances, indices = nn.kneighbors(input_data)

print(indices) # positions of the closest data points

[[34 38 39 90 37 96 93 36 91 80]]


In [17]:
# Show the dataset row closest to the Sun. Its position is taken from the
# neighbour search result instead of hard-coding 34, and .iloc is used because
# kneighbors reports row positions rather than index labels.
closest_position = indices[0][0]
print(df.iloc[closest_position])

Temperature (K)           5800.00
Luminosity(L/Lo)             0.81
Radius(R/Ro)                 0.90
Absolute magnitude(Mv)       5.05
Star type                    3.00
Star color                   7.00
Spectral Class               4.00
Name: 34, dtype: float64


## We have found the star in the dataset that is most similar to the Sun: row 34.