In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import joblib


In [13]:
# Columns to discard right after loading; the list is empty, so nothing is
# removed at the moment (errors='ignore' also tolerates names that are absent).
to_delete = []

# Load the raw training sheet. decimal=',' makes pandas parse numbers written
# with a comma as the decimal separator.
df = (
    pd.read_excel("raw/train.xlsx", decimal=',')
    .drop(columns=to_delete, errors='ignore')
)

df.head()

Unnamed: 0,I1,I2,I3,gx,gy,gz,ax,ay,az,V1real,V2real,V3real,N1,N2,N3,Type
0,1.126697,0.021116,0.927601,0.712941,-0.252941,1.081765,0.043529,0.003174,0.011661,-249.352941,4.764706,265.235294,-5070,30,5239,1
1,1.039215,0.015083,0.953243,0.229412,0.276471,0.485294,0.013672,-0.005572,0.012164,-260.470588,0.0,263.647059,-5320,-9,5391,1
2,0.911011,0.004525,0.94721,1.170588,0.222941,0.664118,0.011704,0.00372,0.005744,-268.411765,0.0,268.411765,-4829,6,4786,1
3,0.692308,1.156862,0.550528,1.165294,0.698235,-3.421765,0.001321,-0.010311,-0.001522,141.352941,-306.529412,162.0,2670,-5662,2875,1
4,0.529412,1.024133,0.453997,0.808235,0.09,-1.449412,0.006218,-0.010986,-0.006247,155.647059,-314.470588,169.941176,2841,-5824,2967,1


In [14]:
# Turn the multi-valued 'Type' column into a binary target: 1 where Type == 2,
# 0 otherwise. pop() removes 'Type' from the frame and hands it back, so the
# new 'is_type_2' column ends up as the last column.
# NOTE(review): this cell is not re-runnable — once 'Type' is gone a second
# execution raises KeyError.
df['is_type_2'] = df.pop('Type').eq(2).astype(int)
df.head(n=5)

Unnamed: 0,I1,I2,I3,gx,gy,gz,ax,ay,az,V1real,V2real,V3real,N1,N2,N3,is_type_2
0,1.126697,0.021116,0.927601,0.712941,-0.252941,1.081765,0.043529,0.003174,0.011661,-249.352941,4.764706,265.235294,-5070,30,5239,0
1,1.039215,0.015083,0.953243,0.229412,0.276471,0.485294,0.013672,-0.005572,0.012164,-260.470588,0.0,263.647059,-5320,-9,5391,0
2,0.911011,0.004525,0.94721,1.170588,0.222941,0.664118,0.011704,0.00372,0.005744,-268.411765,0.0,268.411765,-4829,6,4786,0
3,0.692308,1.156862,0.550528,1.165294,0.698235,-3.421765,0.001321,-0.010311,-0.001522,141.352941,-306.529412,162.0,2670,-5662,2875,0
4,0.529412,1.024133,0.453997,0.808235,0.09,-1.449412,0.006218,-0.010986,-0.006247,155.647059,-314.470588,169.941176,2841,-5824,2967,0


In [7]:
# Numeric input columns that get standardised (zero mean, unit variance).
v1_features = [
    'I1', 'I2', 'I3',
    'gx', 'gy', 'gz',
    'ax', 'ay', 'az',
    'V1real', 'V2real', 'V3real',
    'N1', 'N2', 'N3',
]

# NOTE(review): the scaler is fitted on every row of df, while the train/test
# split only happens in a later cell, so held-out rows contribute to the
# mean/std used for scaling. Consider fitting on the training split only.
# NOTE(review): df is overwritten in place, so re-running this cell refits the
# scaler on already-scaled values and overwrites the pickle below with those
# statistics. Run it exactly once per fresh kernel.
scaler = StandardScaler()
scaler.fit(df[v1_features])
df[v1_features] = scaler.transform(df[v1_features])

# Persist the fitted scaler so the same transformation can be reapplied later.
joblib.dump(scaler, 'scaler_v1_features.pkl')
print(df.head())

     V1real    V2real  is_type_2
0 -1.323802 -0.278875          0
1 -1.359267 -0.292213          0
2 -1.384598 -0.292213          0
3 -0.077482 -1.150340          0
4 -0.031885 -1.172571          0


In [8]:
# Non-null observations per column: a quick check for missing values and for
# the number of rows available.
print(df.count(axis='index'))

V1real       176
V2real       176
is_type_2    176
dtype: int64


In [9]:
# Target and the two features used by the model.
y = df['is_type_2']
X = df[['V1real', 'V2real']]

# Stratified hold-out split: class proportions of y are kept in both parts.
split_parts = train_test_split(X, y, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Oversample the minority class with SMOTE on the training part only, so the
# test part contains no synthetic rows.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Class counts before ("До SMOTE") and after ("После SMOTE") oversampling.
class_counts_before = y_train.value_counts().to_dict()
class_counts_after = pd.Series(y_resampled).value_counts().to_dict()
print("До SMOTE:", class_counts_before)
print("После SMOTE:", class_counts_after)

До SMOTE: {0: 105, 1: 27}
После SMOTE: {0: 105, 1: 105}


In [12]:
# --- Noise augmentation of the SMOTE-balanced training set -------------------
# Every balanced training row is kept and additionally copied `copies_per_row`
# times with independent Gaussian noise N(0, noise_scale) added to each feature.
# The result is shuffled and written to train_augmented.xlsx.
#
# Changes compared with the previous row-by-row loop:
#   * the noise comes from a seeded generator, so the saved file is
#     reproducible (every other stochastic step here already uses seed 42);
#   * the copies are built in one vectorised step instead of nested Python
#     loops over iterrows();
#   * the label keeps its integer dtype (iterrows() upcast it to float, which
#     is why the counts used to print as 0.0 / 1.0).
copies_per_row = 5   # noisy copies generated per balanced row
noise_scale = 0.05   # std-dev of the noise, in units of the standardised features
noise_seed = 42      # seed for the augmentation noise

feature_cols = ['V1real', 'V2real']
label_col = 'is_type_2'

df_resampled = pd.DataFrame(X_resampled, columns=feature_cols)
df_resampled[label_col] = y_resampled

# Repeat each row `copies_per_row` times, copies of one row staying adjacent
# (same ordering the nested loop produced).
df_augmented = (
    df_resampled
    .loc[df_resampled.index.repeat(copies_per_row)]
    .reset_index(drop=True)
)

# One independent noise draw per (copy, feature); labels are left untouched.
noise_rng = np.random.default_rng(noise_seed)
feature_noise = noise_rng.normal(
    0.0, noise_scale, size=(len(df_augmented), len(feature_cols))
)
df_augmented[feature_cols] = df_augmented[feature_cols].to_numpy() + feature_noise

# Originals + noisy copies, shuffled with a fixed seed, then persisted.
df_final = pd.concat([df_resampled, df_augmented], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
df_final.to_excel("train_augmented.xlsx", index=False)

print(f"Итоговый размер датасета: {df_final.shape}")
print(df_final['is_type_2'].value_counts())

Итоговый размер датасета: (1260, 3)
is_type_2
0.0    630
1.0    630
Name: count, dtype: int64
