In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import joblib


In [3]:
df = pd.read_excel("raw/train.xlsx", decimal=',')
to_delete = ['I1', 'I2','I3','gx','gy','gz','ax','ay','az','V3real','N1','N2','N3']
df.drop(columns=to_delete, inplace=True, errors='ignore')

df.head()

Unnamed: 0,V1real,V2real,Type
0,-249.352941,4.764706,1
1,-260.470588,0.0,1
2,-268.411765,0.0,1
3,141.352941,-306.529412,1
4,155.647059,-314.470588,1


In [4]:
df['is_type_2'] = (df['Type'] == 2).astype(int)
df.drop(columns=['Type'], inplace=True)
print(df.head())

       V1real      V2real  is_type_2
0 -249.352941    4.764706          0
1 -260.470588    0.000000          0
2 -268.411765    0.000000          0
3  141.352941 -306.529412          0
4  155.647059 -314.470588          0


In [7]:
v1_features = ['V1real', 'V2real']
scaler = StandardScaler()
df[v1_features] = scaler.fit_transform(df[v1_features])
joblib.dump(scaler, 'scaler_v1_features.pkl')
print(df.head())

     V1real    V2real  is_type_2
0 -1.323802 -0.278875          0
1 -1.359267 -0.292213          0
2 -1.384598 -0.292213          0
3 -0.077482 -1.150340          0
4 -0.031885 -1.172571          0


In [8]:
print(df.count())

V1real       176
V2real       176
is_type_2    176
dtype: int64


In [9]:
X = df[['V1real', 'V2real']]
y = df['is_type_2']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
print("До SMOTE:", y_train.value_counts().to_dict())
print("После SMOTE:", pd.Series(y_resampled).value_counts().to_dict())

До SMOTE: {0: 105, 1: 27}
После SMOTE: {0: 105, 1: 105}


In [12]:
copies_per_row = 5
noise_scale = 0.05

df_resampled = pd.DataFrame(X_resampled, columns=['V1real', 'V2real'])
df_resampled['is_type_2'] = y_resampled

augmented_rows = []

for _, row in df_resampled.iterrows():
    for _ in range(copies_per_row):
        noisy_v1 = row['V1real'] + np.random.normal(0, noise_scale)
        noisy_v2 = row['V2real'] + np.random.normal(0, noise_scale)
        augmented_rows.append([noisy_v1, noisy_v2, row['is_type_2']])

df_augmented = pd.DataFrame(augmented_rows, columns=['V1real', 'V2real', 'is_type_2'])
df_final = pd.concat([df_resampled, df_augmented], ignore_index=True)
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)
df_final.to_excel("train_augmented.xlsx", index=False)

print(f"Итоговый размер датасета: {df_final.shape}")
print(df_final['is_type_2'].value_counts())

Итоговый размер датасета: (1260, 3)
is_type_2
0.0    630
1.0    630
Name: count, dtype: int64
