In [90]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer

In [91]:
def get_probabilty(model, row):
    """Map the model's decision value for one object to a score between 0 and 1.

    The object is wrapped into a single-row batch, passed to
    ``model.decision_function`` and the first returned value is squashed
    with the logistic sigmoid ``1 / (1 + exp(-d))``.

    Args:
        model: object exposing ``decision_function(rows)``.
        row: feature values of one object, in the layout the model expects.

    Returns:
        The sigmoid of the decision value for ``row``.
    """
    margin = model.decision_function([row])[0]
    return 1.0 / (1.0 + np.exp(-margin))

Делим исходные данные на признаки и ответы, затем разбиваем их на обучающую и тестовую выборки

In [92]:
# Load the training table and recode the 0.0 class label as -1.0.
df = pd.read_csv('../assets/train.csv')
negative_rows = df['result'] == 0.0
df.loc[negative_rows, 'result'] = -1.0

# Features: every column up to and including 'pollutedPointsPercentage'.
# Target: the 'result' column.
X = df.loc[:, :'pollutedPointsPercentage']
y = df['result']

# Hold out 33% of the rows; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=1,
)

# Keep an untouched copy of the held-out features.
copy_of_x_test = x_test.copy()
df

Unnamed: 0,areasAmountPer100k,sportsAmountPer100k,areasSquarePer100k,subwayDistance,pollutedPointsPercentage,result
0,15.0,20.0,1.868616e+05,7353.397854,0.948230,-1.0
1,331.0,317.0,1.367124e+06,3165.050693,0.024175,1.0
2,300.0,77.0,3.322908e+05,6801.641570,0.173018,-1.0
3,127.0,42.0,6.703259e+04,8076.082570,0.891446,-1.0
4,405.0,148.0,1.026126e+06,3440.022194,0.275477,1.0
...,...,...,...,...,...,...
11995,261.0,450.0,1.181398e+06,501.172268,0.553239,1.0
11996,279.0,54.0,1.182710e+06,6757.492280,0.702291,-1.0
11997,19.0,48.0,2.609569e+05,7960.901843,0.990480,-1.0
11998,342.0,160.0,1.253940e+06,274.843711,0.465364,1.0


Стандартизация данных

In [93]:
scaler = StandardScaler()

# Fit on the raw training rows, looked up in X by the training index, so that
# re-running this cell never refits the scaler on already-standardized data
# (which would turn the exported scaler into a near-identity transform).
x_train = scaler.fit_transform(X.loc[y_train.index])

# Scale the held-out split with the training statistics instead of
# overwriting it with a hand-written sample.
x_test = scaler.transform(copy_of_x_test)

# Sanity check on one hand-written object; the five values are assumed to
# follow the column order of X.
sample_scaled = scaler.transform([[101, 158.7, 228583, 700, 0.05]])
print(sample_scaled)

[[-0.8972286  -0.14051444 -0.89518477 -1.4527039  -1.7624411 ]]




Объявление и обучение модели

In [94]:
# Linear-kernel SVM. probability=True makes scikit-learn run an additional
# internal cross-validation whose data shuffling is random, so the seed is
# pinned to keep the fitted model reproducible between runs.
model = svm.SVC(kernel='linear', C=1.0, probability=True, random_state=1)
model.fit(x_train, y_train)

SVC(kernel='linear', probability=True)

Применение модели

In [95]:
# Standardize one hand-written object with the fitted scaler.
raw_objects = np.array([[55.77658176583015, 111.5531635316603, 129439.50481135711, 500, 0.05]])
x_test = scaler.transform(raw_objects)

# Score every row and express the result as a percentage.
probabilities = np.array(
    [get_probabilty(model, list(features)) * 100 for features in x_test],
    dtype=float,
)



In [96]:
# Write the percentages to CSV, one value per line, using the '%10.5f' format.
np.savetxt(
    "../assets/final_result.csv",
    probabilities,
    fmt='%10.5f',
    delimiter=",",
)
# Scratch notes, not read by any code (presumably feature values of another
# candidate object - TODO confirm):
#   areasAmountPer100k         = 106.48354666492484
#   sportsObjectsAmountPer100k = 122.2588
#   areasSquarePer100k         = 567272.50481135711
#   dist                       = 500
#   (unlabelled)               = 0.05


Экспортирование масштабировщика (scaler) и модели в папку `../models`

In [97]:
# Create the target folder up front: writing the pickles fails when the
# directory is missing. `joblib` and `Path` come from the top import cell.
Path('../models').mkdir(parents=True, exist_ok=True)

# Persist the fitted scaler together with the model so that new objects can be
# standardized exactly as the training data was.
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(model, '../models/model.pkl')

['../models/model.pkl']