In [1]:
import numpy as np
import pandas as pd
import sklearn

In [2]:
df = pd.read_csv('https://gist.githubusercontent.com/urwinday/2c9901cff2cdb0c47a291b34a0db2805/raw/ebb0d489d6b0f78c81403c1a4d54ad54397e4d8c/Skyserver_SQL2_27_2018%25206_51_39%2520PM.csv')
df.head()

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,camcol,field,specobjid,class,redshift,plate,mjd,fiberid
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,4,267,3.72236e+18,STAR,-9e-06,3306,54922,491
1,1.23765e+18,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,4,267,3.63814e+17,STAR,-5.5e-05,323,51615,541
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,4,268,3.23274e+17,GALAXY,0.123111,287,52023,513
3,1.23765e+18,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752,301,4,269,3.72237e+18,STAR,-0.000111,3306,54922,510
4,1.23765e+18,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752,301,4,269,3.72237e+18,STAR,0.00059,3306,54922,512


In [3]:
df['class'] = df['class'].map({'STAR': 1, 'GALAXY': 2, 'QSO': 3})

In [4]:
labels = df['class'] # y  - целевой признак
features = df.drop('class', 1) # X -  матрица признаков
features.shape, labels.shape

((10000, 17), (10000,))

In [5]:
from sklearn.model_selection import train_test_split 
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state = 42)
features_train.shape, features_test.shape, labels_train.shape, labels_test.shape

((8000, 17), (2000, 17), (8000,), (2000,))

In [6]:
from sklearn.preprocessing import StandardScaler 

In [7]:
scale = StandardScaler()
features_train_scaled = scale.fit_transform(features_train)
features_test_scaled = scale.transform(features_test)

In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [9]:
scale_first_knn = KNeighborsClassifier()
scale_first_knn.fit(features_train_scaled, labels_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [10]:
predictions_scaled = scale_first_knn.predict(features_test_scaled)

In [11]:
from sklearn.metrics import accuracy_score
accuracy_scaled = accuracy_score(labels_test, predictions_scaled)
accuracy_scaled

0.8935

###  Использование sklearn.pipeline.Pipeline

In [12]:
import sklearn.pipeline

In [13]:
knn_pipeline = sklearn.pipeline.Pipeline([
    ('scaler',StandardScaler()),
    ('kNN', KNeighborsClassifier(n_jobs=-1))
])

In [14]:
knn_pipeline.fit(features_train,labels_train)

Pipeline(memory=None,
     steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kNN', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
           weights='uniform'))])

In [15]:
y_preds = knn_pipeline.predict(features_test)

In [16]:
accuracy_pipeline = accuracy_score(labels_test, y_preds)
accuracy_pipeline

0.8935

### 1.3. Подбор гиперпараметра KNeighborsClassifier (n_neighbors) с помощью GridSearchCV   
- вывод значение гиперпараметра 
- вывод метрики для наилучшей модели

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
first_knn = KNeighborsClassifier()
first_knn.fit(features_train_scaled, labels_train);

In [19]:
knn_params = {'n_neighbors': range(5,30,5)} 
knn_grid = GridSearchCV(first_knn, knn_params, cv =5)
knn_grid.fit( features_train_scaled, labels_train );

In [20]:
knn_grid.best_score_, knn_grid.best_params_

(0.9005, {'n_neighbors': 5})