In [17]:
import joblib
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer

In [18]:
def get_probabilty(model, row):
    """Return a probability-like score that one object belongs to the "ideal" class.

    The signed score that ``model.decision_function`` reports for ``row`` is
    squashed with the logistic sigmoid ``1 / (1 + exp(-score))``. This is a
    monotone transform of the distance to the separating hyperplane, not a
    calibrated probability.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``decision_function``.
    row : sequence
        Feature values of a single object; it is wrapped into a one-row batch
        before being passed to the model.

    Returns
    -------
    float
        Sigmoid of the decision score, in the closed interval [0, 1].
    """
    score = model.decision_function([row])[0]
    # exp(-logaddexp(0, -score)) equals 1 / (1 + exp(-score)) but does not
    # overflow when the score is strongly negative.
    return np.exp(-np.logaddexp(0, -score))

Делим исходные данные на признаки и ответы (целевую переменную), а затем разбиваем их на обучающую и тестовую выборки

In [19]:
# Load the training table and recode the label 0.0 as -1.0.
df = pd.read_csv('../assets/train.csv')
is_zero_label = df['result'] == 0.0
df.loc[is_zero_label, 'result'] = -1.0

# 'result' is the target; every column up to and including
# 'pollutedPointsPercentage' is used as a feature.
y = df.loc[:, 'result']
X = df.loc[:, :'pollutedPointsPercentage']

# Hold out 33% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.33,
)
copy_of_x_test = x_test.copy()
df

Unnamed: 0,areasAmountPer100k,sportsAmountPer100k,areasSquarePer100k,subwayDistance,pollutedPointsPercentage,result
0,82.0,1.0,3.662551e+04,6748.109629,0.625161,-1.0
1,365.0,153.0,1.299719e+06,653.996848,0.328213,1.0
2,55.0,141.0,4.593430e+05,7392.874779,0.052311,-1.0
3,68.0,14.0,1.722896e+05,9807.668684,0.986872,-1.0
4,284.0,166.0,6.724441e+05,2070.658623,0.483438,1.0
...,...,...,...,...,...,...
11995,578.0,373.0,1.213305e+06,996.556086,0.518384,1.0
11996,132.0,272.0,1.212068e+06,348.477926,0.422045,-1.0
11997,74.0,39.0,2.098838e+05,5673.405192,0.941627,-1.0
11998,346.0,330.0,1.341419e+06,708.361666,0.382756,1.0


Стандартизация данных

In [20]:
# Learn the scaling parameters from the training split only, then replace
# x_train with its standardized version.
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

# Push one hand-written feature row through the fitted scaler and show it.
# NOTE: this rebinds x_test to that single scaled row.
sample_row = [[101, 158.7, 228583, 700, 0.05]]
x_test = scaler.transform(sample_row)
print(x_test)

[[-0.7755039  -0.14441707 -0.78541934 -1.46278748 -1.65749664]]




Объявление и обучение модели

In [21]:
# Linear-kernel SVM. probability=True makes SVC fit an additional internal
# probability calibration that shuffles the data; pinning random_state keeps
# predict_proba reproducible between runs. The decision function itself does
# not depend on it.
model = svm.SVC(kernel='linear', C=1.0, probability=True, random_state=1)
model.fit(x_train, y_train)

SVC(kernel='linear', probability=True)

Применение модели

In [22]:
# Scale one hand-written feature row with the already fitted scaler.
x_test = scaler.transform(np.array([[55.77658176583015, 111.5531635316603, 129439.50481135711, 500, 0.05]]))

# Score every row and express it in percent. Building the array once from a
# list avoids np.append, which allocates and copies a new array on each call.
probabilities = np.array([get_probabilty(model, list(x)) * 100 for x in x_test])



In [23]:
# Write the percentage scores to a CSV file, formatted as fixed-point numbers
# with 5 decimals in a 10-character field.
np.savetxt(
    "../assets/final_result.csv",
    probabilities,
    fmt='%10.5f',
    delimiter=",",
)
# Scratch notes: another set of input feature values (not used by the code).
#   areasAmountPer100k:         106.48354666492484
#   sportsObjectsAmountPer100k: 122.2588
#   areasSquarePer100k:         567272.50481135711
#   dist:                       500
#   (unlabelled, presumably pollutedPointsPercentage): 0.05


Экспорт обученной модели и скейлера в папку `../models`

In [24]:
# Persist the fitted scaler together with the classifier: new inputs must be
# scaled with the same parameters before they are passed to the model.
# joblib is imported in the notebook's top import cell.
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(model, '../models/model.pkl')

['../models/model.pkl']