# Нейросетевой анализ

Мы приведём два варианта нейросетевого анализа набора данных: первый — полносвязная нейронная сеть, второй — свёрточная нейронная сеть, построенная по изображениям. В этом разделе рассматривается второй вариант.

Импортируем инструменты

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, Dense, Flatten, Input, MaxPooling2D

Импортируем данные

In [2]:
# Load the flattened image table from the parquet file.
data = pd.read_parquet(path="images.parquet")

# Print the frame summary (index range, column count, dtypes, memory usage).
data.info()

# Preview the first five rows; as the last expression of the cell it is
# rendered as the cell output.
data.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55322 entries, 0 to 55321
Columns: 7501 entries, red_0_0 to target
dtypes: float64(7501)
memory usage: 3.1 GB


Unnamed: 0,red_0_0,red_0_1,red_0_2,red_0_3,red_0_4,red_0_5,red_0_6,red_0_7,red_0_8,red_0_9,...,blue_49_41,blue_49_42,blue_49_43,blue_49_44,blue_49_45,blue_49_46,blue_49_47,blue_49_48,blue_49_49,target
0,0.117676,0.117676,0.117676,0.117676,0.117676,0.117676,0.117676,0.117676,0.117676,0.117676,...,0.572754,0.554199,0.442627,0.434814,0.451172,0.470459,0.487793,0.505859,0.53418,1.0
1,0.846191,0.865234,0.97168,0.981934,0.987305,0.972656,0.990723,0.954102,0.883789,0.947266,...,0.035309,0.003922,0.027451,0.075989,0.111267,0.114197,0.103943,0.096069,0.0755,1.0
2,0.974121,0.951172,0.700195,0.699707,0.750977,0.737793,0.98291,0.921875,0.877441,0.683105,...,0.894043,0.851074,0.866699,0.886719,0.890137,0.890137,0.890137,0.890137,0.890137,1.0
3,0.723633,0.723633,0.650879,0.719727,0.74707,0.770508,0.780273,0.772461,0.735352,0.723633,...,0.519043,0.513184,0.508789,0.535156,0.304443,0.0233,0.03302,0.081177,0.2323,1.0
4,0.799316,0.810547,0.820312,0.804688,0.770996,0.742188,0.724609,0.712402,0.703613,0.702148,...,0.297607,0.471436,0.501465,0.48999,0.43335,0.37793,0.437744,0.484863,0.474854,1.0


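Изображения хранятся в развёрнутом (плоском) виде: каждому значению цветового канала каждого пикселя соответствует отдельный столбец. Чтобы использовать свёрточную нейронную сеть, изображения необходимо привести к исходной форме 50×50×3. Для этого столбцы нужно расположить в правильном порядке: быстрее всего должен меняться цветовой канал, затем второй индекс пикселя, затем первый — тогда последующий `reshape` корректно восстановит изображение.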

In [3]:
%%time

# Channel order inside one pixel; it has to match the last axis of the
# (50, 50, 3) reshape applied to the features later in the notebook.
color_names = ["red", "green", "blue"]

# Column names ordered so that the colour channel varies fastest, then the
# second pixel index, then the first one.
columns = [
    f"{color}_{i}_{j}"
    for i in range(50)
    for j in range(50)
    for color in color_names
]

# Feature matrix: one flattened image per row, columns in the order above.
X = data[columns].values

# Labels as an int32 column vector of shape (n_samples, 1).
y = data["target"].values.reshape(-1, 1).astype(np.int32)

X.shape, y.shape

Wall time: 3.56 s


((55322, 7500), (55322, 1))

Разбиваем выборку на обучающую и тестовую 

In [4]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

Стандартизируем данные

In [5]:
# Estimate the per-column mean and variance on the training part only, so
# that no statistics from the test part enter the preprocessing.
ss = StandardScaler()
ss.fit(X_train)

# Scale both parts with the training statistics and fold every flat row back
# into a 50x50 image with 3 channels.
X_train_scaled = np.reshape(ss.transform(X_train), (-1, 50, 50, 3))
X_test_scaled = np.reshape(ss.transform(X_test), (-1, 50, 50, 3))

X_train_scaled.shape, X_test_scaled.shape

((37065, 50, 50, 3), (18257, 50, 50, 3))

Определяем модель

In [6]:
# Small convolutional binary classifier for 50x50 three-channel images:
# three 3x3 convolutions (32 -> 16 -> 8 filters) with 2x2 max pooling after
# the first two, followed by a dense head ending in a single sigmoid unit.
model = Sequential(
    layers=[
        Conv2D(32, (3, 3), activation="relu", input_shape=(50, 50, 3)),
        MaxPooling2D((2, 2)),
        Conv2D(16, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Conv2D(8, (3, 3), activation="relu"),
        Flatten(),
        Dense(64, activation="relu"),
        Dense(8, activation="relu"),
        # The output layer is explicitly created with float64 dtype.
        Dense(1, activation="sigmoid", name="output_layer", dtype=np.float64),
    ],
    name="cnn",
)

# Binary cross-entropy matches the single sigmoid output; accuracy is
# tracked as an additional metric during training.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

Проводим обучение

In [7]:
# Train for 5 epochs in mini-batches of 32 samples. The returned History
# object is left as the value of the cell.
model.fit(x=X_train_scaled, y=y_train, batch_size=32, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x12f77c97808>

Проверяем качество

In [8]:
# Sigmoid outputs of the network for the test images, flattened into a
# one-dimensional array with one score per sample.
y_pred = model.predict(X_test_scaled)
y_pred = y_pred.flatten()

In [9]:
# Accuracy needs hard labels, so the predicted scores are thresholded at 0.5.
accuracy = accuracy_score(
    y_true=y_test.flatten(),
    y_pred=(y_pred > 0.5).astype(np.int16),
)

# ROC AUC is computed from the raw scores, without thresholding.
roc_auc = roc_auc_score(y_true=y_test.flatten(), y_score=y_pred)

print("accuracy:{:4.6f}\nroc_auc:{:4.6f}".format(accuracy, roc_auc))

accuracy:0.939859
roc_auc:0.985967


**Выводы:** лучший результат показала свёрточная нейронная сеть (accuracy 94%), на втором месте — полносвязная нейронная сеть (88%), на третьем — базовые модели (82%).