In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale as normalize
from sklearn.metrics import confusion_matrix, accuracy_score

# Display options: plain (non-scientific) numpy output and untruncated pandas frames.
np.set_printoptions(suppress=True)
pd.set_option("display.max_columns", None, "display.max_rows", None)

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so pick whichever spelling this install provides.
for style_name in ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.rcParams['font.family']='serif'

In [None]:
# Read the merged table from the project's GitHub repository, then shuffle the rows.
# The shuffle is seeded so a fresh "Restart & Run All" reproduces the same row order.
df = pd.read_csv('https://github.com/thissop/MAXI-J1535/raw/main/data/processed/fixed_merged.csv')
df = df.sample(frac=1, random_state=42)

In [None]:
# pairplot
# Keep every column name unique: a repeated label (e.g. 'red_fit_stats' twice) makes
# df[...] return two same-named columns, which breaks sns.pairplot.
pair_cols = ['hardness','tins','disk_norm','gammas','nthcomp_norms','fit_stats','red_fit_stats','count_rates','bg_ratio','intensities','num_qpos']
pair_df = df[pair_cols]
print(pair_df)
sns.pairplot(data=pair_df)
plt.show()

In [None]:
# Correlation matrix plot
# Pairwise correlations between the selected columns, drawn as a heatmap.
corr_columns = [
    'hardness', 'tins', 'disk_norm', 'gammas', 'nthcomp_norms', 'fit_stats',
    'red_fit_stats', 'count_rates', 'bg_ratio', 'intensities', 'num_qpos',
]
corr_df = df[corr_columns].corr()

heatmap_ax = sns.heatmap(corr_df, cmap='Blues')

# NOTE(review): the figure quoted in the title is hard-coded rather than read from
# corr_df, and it is labelled r^2 while DataFrame.corr() returns Pearson r by
# default -- confirm the value and the label still match the data.
heatmap_ax.set_title('Correlation Matrix\nMost Correlated: disk_norm ('+r'$r^2=$'+'0.89)')

print(corr_df)

In [None]:
# X
# Each feature column is min-max scaled on its own.
# NOTE(review): the scaling uses the min/max of the full dataset, i.e. it happens
# before the train/test split below, so test rows influence the scaling.
arr_names =  ['hardness', 'tins', 'disk_norm', 'gammas', 'nthcomp_norms', 'intensities']
scaled_columns = [normalize(np.array(df[arr_name])) for arr_name in arr_names]
(hardness, tins, disk_norms, gammas, nthcomp_norms, intensities) = scaled_columns

# Y
# Kept as an (n_samples, 1) column vector; later cells flatten it where needed.
num_qpos = np.array(df['num_qpos']).reshape(len(df), 1)

## Stack x values
# One column per feature, one row per sample, built in a single call instead of
# growing the matrix one row at a time.
x_vals = np.column_stack(scaled_columns).astype(float)

# Train test split
# Seeded so the same rows land in the test set on every run.
X_train, X_test, y_train, y_test = train_test_split(x_vals, num_qpos, test_size=0.1, random_state=42)

In [None]:
def knn_predict(k, xtrain, ytrain, xtest, ytest): 
    """Fit a k-nearest-neighbours classifier and score it on held-out data.

    Parameters
    ----------
    k : int
        Number of neighbours given to ``KNeighborsClassifier``.
    xtrain, xtest : array-like
        Feature matrices used for fitting and for prediction.
    ytrain, ytest : array-like
        Labels for the two sets. They are flattened to 1-D before use, so
        both flat arrays and ``(n, 1)`` column vectors are accepted.

    Returns
    -------
    acc : float
        Value returned by ``accuracy_score`` for the predictions on ``xtest``.
    predictions : ndarray
        Labels predicted for ``xtest``.
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    # scikit-learn expects 1-D labels; passing a column vector makes fit() emit a
    # DataConversionWarning on every call.
    knn.fit(xtrain, np.ravel(ytrain))
    predictions = knn.predict(xtest)
    # accuracy_score's documented argument order is (y_true, y_pred).
    acc = accuracy_score(np.ravel(ytest), predictions)

    return acc, predictions

In [None]:
# Test-set accuracy for every k from 1 to 30 (inclusive).
ks = np.arange(1, 31, 1)

# Collect the scores in one pass and build the array once; calling np.append inside
# a loop copies the whole array on every iteration.
accs = np.array([knn_predict(k, X_train, y_train, X_test, y_test)[0] for k in ks])

In [None]:
# k vs acc plot
# The best score and the k values that reach it are read from the sweep results,
# so the title always matches the points that are actually plotted.
best_acc = accs.max()
best_ks = ks[np.isclose(accs, best_acc)]
best_ks_text = ', '.join(str(best_k) for best_k in best_ks)

plt.scatter(ks, accs)
plt.xlabel('k')
plt.ylabel('acc')
plt.title(f'K vs Accuracy\nMost Accurate: k={best_ks_text}; acc={best_acc:.2f}')

In [None]:
# k=3 confusion matrix
y_test = y_test.flatten()

acc, predictions = knn_predict(3, X_train, y_train, X_test, y_test)

predictions = predictions.flatten()

# Fix the class order explicitly so the heatmap ticks can show the real class
# values instead of positional indices.
class_labels = np.unique(np.concatenate([y_test, predictions]))
cm = confusion_matrix(y_test, predictions, labels=class_labels)

plt.rcParams['font.family']='serif'
fig, ax = plt.subplots(figsize=(10, 8))

# fmt='d' prints the counts as plain integers (the default format switches to
# scientific notation for larger counts).
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', linewidths=.5,
            xticklabels=class_labels, yticklabels=class_labels, ax=ax)

# confusion_matrix puts the true labels on the rows (heatmap y-axis) and the
# predicted labels on the columns (heatmap x-axis).
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')

ax.set_title('Confusion Matrix\nAccuracy: '+str(round(acc, 3)))
plt.show()