In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [3]:
df = pd.read_csv('G:\\diploma\\data.csv', index_col = 0)
df.head(3)

Unnamed: 0,Соотношение матрица-наполнитель,"Плотность, кг/м3","модуль упругости, ГПа","Количество отвердителя, м.%","Содержание эпоксидных групп,%_2","Температура вспышки, С_2","Поверхностная плотность, г/м2","Модуль упругости при растяжении, ГПа","Прочность при растяжении, МПа","Потребление смолы, г/м2","Угол нашивки, град",Шаг нашивки,Плотность нашивки
1,1.857143,2030.0,738.736842,50.0,23.75,284.615385,210.0,70.0,3000.0,220.0,0.0,4.0,60.0
3,1.857143,2030.0,738.736842,129.0,21.25,300.0,210.0,70.0,3000.0,220.0,0.0,5.0,47.0
4,2.771331,2030.0,753.0,111.86,22.267857,284.615385,210.0,70.0,3000.0,220.0,0.0,5.0,57.0


# Прогноз модуля упругости при растяжении

In [4]:
#подготовка выборок
mms = MinMaxScaler()
df_norm = pd.DataFrame(mms.fit_transform(df), columns=df.columns, index=df.index)
X = df_norm.drop('Модуль упругости при растяжении, ГПа', axis=1)
Y = df_norm['Модуль упругости при растяжении, ГПа']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)
print(f'Размер обучающей выборки: {X_train.shape[0]}')
print(f'Размер тестовой выборки: {X_test.shape[0]}')

Размер обучающей выборки: 645
Размер тестовой выборки: 277


Линейная регрессия

In [5]:
%%time
lnr = LinearRegression()
lnr.fit(X_train, y_train)
#y_pred = lnr.predict(X_test)

CPU times: total: 31.2 ms
Wall time: 20.5 ms


Случайный лес

In [6]:
%%time
rfr = RandomForestRegressor()
param_grid = {
   'n_estimators': [20, 40, 60],
   'max_features': [1.0, 'sqrt', 'log2'],
   'max_depth' : [3,4,5,6]
}
GSCV = GridSearchCV(estimator=rfr, param_grid=param_grid, cv=10)
GSCV.fit(X_train, y_train)
print(GSCV.best_params_)
rfr = GSCV.best_estimator_

{'max_depth': 4, 'max_features': 'log2', 'n_estimators': 20}
CPU times: total: 1min 33s
Wall time: 1min 34s


k-ближайших

In [7]:
%%time
knnr= KNeighborsRegressor()
param_grid = {
   'n_neighbors': range(1, 350, 2),
}
GSCV = GridSearchCV(estimator=knnr, param_grid=param_grid, cv=10)
GSCV.fit(X_train, y_train)
print(GSCV.best_params_)
knnr = GSCV.best_estimator_

{'n_neighbors': 61}
CPU times: total: 59.8 s
Wall time: 31.6 s


In [8]:
%%time
SVMr = SVR()
param_grid = {
    #'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'epsilon': np.arange(0.1, 3.0, 0.2),
}
GSCV = GridSearchCV(estimator=SVMr, param_grid=param_grid, cv=10)
GSCV.fit(X_train, y_train)
print(GSCV.best_params_)
SVMr = GSCV.best_estimator_

{'C': 0.01, 'epsilon': 0.1}
CPU times: total: 11.5 s
Wall time: 11.4 s


In [9]:
%%time
gbr = GradientBoostingRegressor()
param_grid = {
    #'n_estimators': range(50, 200, 5),
    #'max_depth': range(1, 10, 1),
    'n_estimators': [5, 30, 50],
    'max_depth': [1, 5, 10],
}
GSCV = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=10)
GSCV.fit(X_train, y_train)
print(GSCV.best_params_)
gbr = GSCV.best_estimator_

{'max_depth': 1, 'n_estimators': 50}
CPU times: total: 41 s
Wall time: 42.5 s


In [10]:
models = [lnr, rfr, knnr, SVMr, gbr]

In [11]:
for i in models:
    i.fit(X_train, y_train)

In [12]:
mae = []
mse = []
for i in models:
    mae.append(mean_absolute_error(i.predict(X_test), y_test))
    mse.append(mean_squared_error(i.predict(X_test), y_test))

In [13]:
res = {'Модель': ['Линейная регрессия', 'Случайный лес', 'k-ближайших', 'Метод опорных векторов', 'Градиентный бустинг'], 
       'MAE'   : [i for i in mae], 
       'MSE'   : [i for i in mse], 
       }
pd.DataFrame(res)

Unnamed: 0,Модель,MAE,MSE
0,Линейная регрессия,0.161937,0.039564
1,Случайный лес,0.161887,0.039612
2,k-ближайших,0.161696,0.039254
3,Метод опорных векторов,0.160573,0.039016
4,Градиентный бустинг,0.161973,0.039844


# Прогноз прочности при растяжении

In [14]:
X1 = df_norm.drop('Прочность при растяжении, МПа', axis=1)
Y1 = df_norm['Прочность при растяжении, МПа']
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, Y1, test_size=0.3, random_state=1)
print(f'Размер обучающей выборки: {X1_train.shape[0]}')
print(f'Размер тестовой выборки: {X1_test.shape[0]}')

Размер обучающей выборки: 645
Размер тестовой выборки: 277


In [15]:
%%time
lnr_1 = LinearRegression()
lnr_1.fit(X1_train, y1_train)
#y_pred = lnr.predict(X_test)

CPU times: total: 0 ns
Wall time: 7.81 ms


In [16]:
%%time
rfr_1 = RandomForestRegressor()
param_grid = {
   'n_estimators': [20, 40, 60],
   'max_features': [1.0, 'sqrt', 'log2'],
   'max_depth' : [3,4,5,6]
}
GSCV = GridSearchCV(estimator=rfr_1, param_grid=param_grid, cv=10)
GSCV.fit(X1_train, y1_train)
print(GSCV.best_params_)
rfr_1 = GSCV.best_estimator_

{'max_depth': 3, 'max_features': 'log2', 'n_estimators': 20}
CPU times: total: 1min 34s
Wall time: 1min 39s


In [17]:
%%time
knnr_1= KNeighborsRegressor()
param_grid = {
   'n_neighbors': range(1, 350, 2),
}
GSCV = GridSearchCV(estimator=knnr_1, param_grid=param_grid, cv=10)
GSCV.fit(X1_train, y1_train)
print(GSCV.best_params_)
knnr_1 = GSCV.best_estimator_

{'n_neighbors': 243}
CPU times: total: 59.4 s
Wall time: 32.2 s


In [18]:
%%time
SVMr_1 = SVR()
param_grid = {
    #'kernel' : ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'epsilon': np.arange(0.1, 3.0, 0.2),
}
GSCV = GridSearchCV(estimator=SVMr_1, param_grid=param_grid, cv=10)
GSCV.fit(X1_train, y1_train)
print(GSCV.best_params_)
SVMr_1 = GSCV.best_estimator_

{'C': 0.01, 'epsilon': 0.30000000000000004}
CPU times: total: 11.3 s
Wall time: 11.5 s


In [19]:
%%time
gbr_1 = GradientBoostingRegressor()
param_grid = {
    #'n_estimators': range(50, 200, 5),
    #'max_depth': range(1, 10, 1),
    'n_estimators': [5, 30, 50],
    'max_depth': [1, 5, 10],
}
GSCV = GridSearchCV(estimator=gbr_1, param_grid=param_grid, cv=10)
GSCV.fit(X1_train, y1_train)
print(GSCV.best_params_)
gbr_1 = GSCV.best_estimator_

{'max_depth': 5, 'n_estimators': 5}
CPU times: total: 42.3 s
Wall time: 43.7 s


In [20]:
models_1 = [lnr_1, rfr_1, knnr_1, SVMr_1, gbr_1]

In [21]:
for i in models_1:
    i.fit(X1_train, y1_train)

In [22]:
mae_1 = []
mse_1 = []
for i in models_1:
    mae_1.append(mean_absolute_error(i.predict(X1_test), y1_test))
    mse_1.append(mean_squared_error(i.predict(X1_test), y1_test))

In [23]:
res_1 = {'Модель': ['Линейная регрессия', 'Случайный лес', 'k-ближайших', 'Метод опорных векторов', 'Градиентный бустинг'], 
         'MAE'   : [i for i in mae_1],
         'MSE'   : [i for i in mse_1],
       }
pd.DataFrame(res_1)

Unnamed: 0,Модель,MAE,MSE
0,Линейная регрессия,0.149937,0.034523
1,Случайный лес,0.150998,0.034852
2,k-ближайших,0.152226,0.0353
3,Метод опорных векторов,0.151112,0.034843
4,Градиентный бустинг,0.152481,0.035355


In [24]:
import pickle

In [28]:
pickle.dump(lnr, open('lnr_model.pkl', 'wb'))
pickle.dump(lnr_1, open('lnr_1_model.pkl', 'wb'))