In [100]:
import pandas as pd
import numpy as np
import os
from copy import deepcopy

import matplotlib.pyplot as plt
import seaborn as sns

import pint
from pint import UnitRegistry

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

import torch
import torch.nn as nn

import torch.nn.functional as F
import torch.optim as optim

In [147]:
def to_CI(df):
    def celc(x):
        ureg = UnitRegistry()
        Q_ = ureg.Quantity
        home = Q_(x, ureg.degC)
        return home.to('degK')

    def ream(x):
        ureg = UnitRegistry()
        Q_ = ureg.Quantity
        home = Q_(x, ureg.degR)
        return home.to('degK')
    
    celc_features = ['ZT1AB', 'ZTNAC', 'ZTOIL', 'ZT1A', 'GEGTMC', 'ZTNAC_D']
    ream_features = ['ZTAMB']
    
    for cl in celc_features:
        if cl in df.columns.to_list():
            df[cl] = pd.Series(celc(df[cl].values))
            
    for cl in ream_features:
        if cl in df.columns.to_list():
            df[cl] = pd.Series(ream(df[cl].values))
    
    return df

def preprocess_file(df, corr_ther):
    def delete_corr(df, ther):
        was_corr = []
        corr = df.corr()
        for row in corr.iterrows():
            for v_ind in range(len(row[1])):
                if row[1][v_ind] > ther and row[1][v_ind] < 1: #ПЕРЕСЧИТАТЬ
                    if (row[0] not in was_corr) and (row[1].index[v_ind] not in was_corr):
                        print(row[0], row[1].index[v_ind], row[1][v_ind])
                        was_corr.append(row[0])
        print(len(was_corr))
        return was_corr
    #удалить лишние
    df = df.drop(['flight_datetime', 'engine_id'], axis = 1)
    
    #виды фичей
    to_categorical = ['number_blades', 'engine_position', 'engine_family', 
                  'manufacturer', 'aircraft_family', 'aircraft_type', 'aircraft_grp',
                  'ac_manufacturer']
    numerical = list(set(df.columns.to_list()) - set(to_categorical))
    
    #убрать те у которых больше 2/3 пропущены значения
    for cl in numerical:
        if (cl in df.columns.to_list()) and (len(df[cl]) * 2 / 3 < df[cl].isna().sum()):
            df = df.drop([cl], axis = 1)
    #обновить список числовых фияей
    numerical = list(set(df.columns.to_list()) - set(to_categorical))
    #заполнить оставшиеся пропуски нулями
    df = df.fillna(0)
    
    #перевести в систему СИ
    df = to_CI(df)

    #убрать скоррелированные фичи
    was_corr = delete_corr(df[numerical], corr_ther)
    df.drop(was_corr, axis = 1)
    #обновить список числовых фияей
    numerical = list(set(df.columns.to_list()) - set(to_categorical))
    
    #скалировать данные
    scaled_features = StandardScaler().fit_transform(df[numerical].values)
    scaled_features_df = pd.DataFrame(scaled_features, index=df[numerical].index, columns=df[numerical].columns)
    
    scaled_features_df
    #исправить типы данных категориальных фичей
    for cl in to_categorical:
        df[cl] = df[cl].astype(object)
        #One Hot Encoding
        one_hot = pd.get_dummies(df[cl])
        scaled_features_df = scaled_features_df.join(one_hot)
    
    return scaled_features_df

In [148]:
df = pd.read_csv("../data/takeoff_CFM56-7B27-B1.csv")       

to_categorical = ['number_blades', 'engine_position', 'engine_family', 
                  'manufacturer', 'aircraft_family', 'aircraft_type', 'aircraft_grp',
                  'ac_manufacturer', 'flight_datetime', 'engine_id']

numerical = list(set(df.columns.to_list()) - set(to_categorical))

df = preprocess_file(df, corr_ther = 0.95)

PCN12I ZPCN12 0.9970258301523474
ZXM CAS 0.9735949806805446
ZPCN12 PCN12 0.9961382830347263
EGTHDM_D SLOATL_D 0.9998878735220914
DELFN DELN1 0.9933118805746719
PCN1AR PCN1BR 0.9992539707243598
6


  data = np.array(data, copy=copy)
  one_hot = pd.get_dummies(df[cl])
  one_hot = pd.get_dummies(df[cl])


In [149]:
# Целевые переменные

y = pd.read_csv("../data/y.csv")
targets = y.columns.to_list()
tr = ["engine_id", "flight_datetime", "flight_phase"]
for t in tr:
    targets.remove(t)

tr = deepcopy(targets)
for t in tr:
    if t not in df.columns.to_list():
        targets.remove(t)
        
features = list(set(df.columns.to_list()) - set(targets))

In [150]:
targets

['BRAT',
 'DELFN',
 'DELN1',
 'EGTHDM',
 'EGTHDM_D',
 'PCN12',
 'PCN12I',
 'PCN1AR',
 'PCN1BR',
 'PCN1K',
 'SLOATL',
 'SLOATL_D',
 'ZPCN25_D',
 'ZT49_D']

### Train test

In [151]:
df[features].columns

Index([                  1,                   2,               'AGW',
                   'ZVB1F',           'CFM56-7',              'B737',
                      24.0,             'ZVB1R',       'n1_modifier',
                    'ZT49',              'ZT1A',               'ZXM',
                  'ZPCN12',               'IBP',             'ZVB2R',
                'B737-800',          'B737-83N', 'CFM INTERNATIONAL',
                   'IVS12',            'BOEING',               'IBE',
                     'CAS',              'IAIE',            'ZPCN25',
                   'ZVB2F',               'SAT',               'IAI',
                    'ZALT'],
      dtype='object')

In [205]:
X = df[features].values
y = df['ZPCN25'].values.reshape(y.shape[0], 1)

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [207]:
class MyDataset:
    def __init__(self, features, targets):
        self.features = features
        self.labels = targets

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        feat_ = torch.tensor(self.features[idx], dtype=torch.float)
        label_ = torch.tensor(self.labels[idx], dtype=torch.float)
        
        return feat_, label_

In [208]:
training_set = MyDataset(X_train, y_train)
training_loader = torch.utils.data.DataLoader(training_set, batch_size = 32)

val_set = MyDataset(X_test, y_test)
val_loader = torch.utils.data.DataLoader(val_set, batch_size = 32)

In [209]:
X_train.shape[1] * 1.5

42.0

### Модель

In [210]:
class NN(nn.Module):
    def __init__(self):
        super(NN,self).__init__()
        self.layer1=nn.Linear(X_train.shape[1], X_train.shape[1] * 3)
        self.layer2=nn.Linear(X_train.shape[1] * 3, X_train.shape[1] * 2)
        self.layer3=nn.Linear(X_train.shape[1] * 2, X_train.shape[1] // 2)
        self.layer4=nn.Linear(X_train.shape[1] // 2, y_train.shape[1])
        
    def forward(self,x):
        x=F.relu(self.layer1(x))
        x=F.relu(self.layer2(x))
        x=F.relu(self.layer3(x))
        x=self.layer4(x)
        return x       

In [218]:
model = NN()

optimizer = optim.SGD(model.parameters(), lr=3e-2)
loss_fn = nn.MSELoss()

In [219]:
epochs=10
for i in range(epochs):
    for x_batch, y_batch in training_loader:
        #initialize the model parameter
        optimizer.zero_grad(set_to_none = True)
        #calculate the loss
        output = model.forward(x_batch)
        loss = loss_fn(output, y_batch)
        #backpropagation
        loss.backward()
        #update the parameters
        optimizer.step()
    if(i % 5 == 0):
        print(f"epochs: {i}......loss:{loss}")

epochs: 0......loss:0.17102083563804626
epochs: 5......loss:0.08467797189950943


In [220]:
y_train_pred = model(torch.tensor(X_train,dtype=torch.float32,requires_grad=True))
y_test_pred = model(torch.tensor(X_test,dtype=torch.float32))

#convert to numpy array
y_train_pred = y_train_pred.detach().numpy()
y_test_pred = y_test_pred.detach().numpy()

In [221]:
test_metric = sqrt(mean_squared_error(y_test, y_test_pred))
train_metric = sqrt(mean_squared_error(y_test, y_test_pred))
print('train', round(train_metric, 3))
print('test', round(test_metric, 3))

train 0.163
test 0.163
