In [467]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

## Dataset overview

In [468]:
# Attribute names are supplied explicitly; because `names` is given, read_csv
# treats the first line of the file as data rather than as a header row.
columns = [
    'seismic', 'seismoacoustic', 'shift',
    'genergy', 'gpuls', 'gdenergy', 'gdpuls',
    'ghazard',
    'nbumps', 'nbumps2', 'nbumps3', 'nbumps4', 'nbumps5', 'nbumps6', 'nbumps7', 'nbumps89',
    'energy', 'maxenergy',
    'class',
]
data = pd.read_csv('seismic-bumps.arff', header=None, names=columns)
# Preview the first five rows to confirm the columns line up with the names above.
data.head()

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy,class
0,a,a,N,15180,48,-72,-72,a,0,0,0,0,0,0,0,0,0,0,0
1,a,a,N,14720,33,-70,-79,a,1,0,1,0,0,0,0,0,2000,2000,0
2,a,a,N,8050,30,-81,-78,a,0,0,0,0,0,0,0,0,0,0,0
3,a,a,N,28820,171,-23,40,a,1,0,1,0,0,0,0,0,3000,3000,0
4,a,a,N,12640,57,-63,-52,a,0,0,0,0,0,0,0,0,0,0,0


## Dataset preprocessing

In [469]:
# Count missing cells in the whole frame: the first sum() gives per-column
# NaN counts, the second adds those up into a single total.
data.isna().sum().sum()

0

In [470]:
# Print the per-column dtype and non-null count to confirm that the numeric
# attributes were parsed as integers and only the coded attributes are `object`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2584 entries, 0 to 2583
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   seismic         2584 non-null   object
 1   seismoacoustic  2584 non-null   object
 2   shift           2584 non-null   object
 3   genergy         2584 non-null   int64 
 4   gpuls           2584 non-null   int64 
 5   gdenergy        2584 non-null   int64 
 6   gdpuls          2584 non-null   int64 
 7   ghazard         2584 non-null   object
 8   nbumps          2584 non-null   int64 
 9   nbumps2         2584 non-null   int64 
 10  nbumps3         2584 non-null   int64 
 11  nbumps4         2584 non-null   int64 
 12  nbumps5         2584 non-null   int64 
 13  nbumps6         2584 non-null   int64 
 14  nbumps7         2584 non-null   int64 
 15  nbumps89        2584 non-null   int64 
 16  energy          2584 non-null   int64 
 17  maxenergy       2584 non-null   int64 
 18  class   

In [471]:
# Count rows that exactly repeat an earlier row (duplicated() leaves the
# first occurrence unmarked, so only the extra copies are counted).
data.duplicated().sum()

6

In [472]:
# Remove the repeated rows, keeping the first occurrence of each.
# The surviving rows keep their original index labels (the index is not renumbered here).
data = data.drop_duplicates()

In [473]:
def form_test_dataset(data: pd.DataFrame, test_percent: float, random_state=None) -> 'tuple[pd.DataFrame, pd.DataFrame]':
    """Split ``data`` into a small class-balanced test set and the remaining rows.

    Rows are shuffled separately within class 1 and class 0. The first
    ``int(len(data) * test_percent)`` rows of *each* class go to the test set,
    so the test set holds up to twice that many rows (fewer if a class has
    fewer rows than the slice size). Everything else goes to the second frame.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with a ``'class'`` column holding the values 0 and 1.
    test_percent : float
        Fraction of ``len(data)`` taken from each class for the test set.
    random_state : int, numpy.random.RandomState or None, default None
        Seed or generator for all shuffles. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    tuple of (pd.DataFrame, pd.DataFrame)
        ``(test_samples, validation_samples)``. Each frame gets a fresh
        0..n-1 index and is then shuffled, so the index labels are not in
        positional order.
    """
    # One shared generator gives independent draws for every shuffle while
    # still being fully determined by the seed.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    test_slice = int(len(data) * test_percent)

    data_class_1 = data[data['class'] == 1].sample(frac=1, random_state=random_state)
    data_class_0 = data[data['class'] == 0].sample(frac=1, random_state=random_state)

    test_samples = (
        pd.concat([data_class_1[:test_slice], data_class_0[:test_slice]], axis=0)
        .reset_index(drop=True)
        .sample(frac=1, random_state=random_state)
    )
    validation_samples = (
        pd.concat([data_class_1[test_slice:], data_class_0[test_slice:]], axis=0)
        .reset_index(drop=True)
        .sample(frac=1, random_state=random_state)
    )

    return test_samples, validation_samples

In [474]:
# Seed NumPy's global generator before splitting: the shuffles inside
# form_test_dataset use DataFrame.sample(), which draws from that global state
# when no random_state is passed. Without a seed every run produces a different
# split, so a fixed sample_id further down would point at a different row each time.
np.random.seed(42)
test_samples, validation_samples = form_test_dataset(data, 0.005)
# The message below is runtime output and reads "Number of test observations".
print(f'Количество тестовых наблюдений: {len(test_samples)}')

Количество тестовых наблюдений: 24


In [475]:
def count_sample_score(test_samples: pd.DataFrame, validation_samples: pd.DataFrame, epsilon: float, sample_id: int,
                       top_n: int = 20) -> pd.DataFrame:
    """Find the validation rows that are most similar to one test observation.

    Similarity is a count of matching attributes (0..18):

    * each of the four coded attributes scores 1 when the values are equal;
    * each of the fourteen numeric attributes scores 1 when the min-max scaled
      values differ by less than ``epsilon`` (the scaler is fitted on the test
      and validation rows together).

    The ``'class'`` label is never used for scoring.

    Parameters
    ----------
    test_samples : pd.DataFrame
        Candidate query rows, including a ``'class'`` column.
    validation_samples : pd.DataFrame
        Rows to search for neighbours, including a ``'class'`` column.
    epsilon : float
        Maximum absolute difference (on the 0..1 scaled values) for two
        numeric attributes to count as matching.
    sample_id : int
        Zero-based *position* of the query row in ``test_samples``
        (index labels are ignored).
    top_n : int, default 20
        Number of best-scoring validation rows to return.

    Returns
    -------
    pd.DataFrame
        Row 0 is the query observation with its true class and a score of 0;
        the following rows are the ``top_n`` best matches in descending score
        order, with original (unscaled) values. Ties keep the order in which
        the rows appear in ``validation_samples``.

    Raises
    ------
    IndexError
        If ``sample_id`` is not a valid position in ``test_samples``.
    ValueError
        If ``top_n`` is negative.
    """
    categorical_columns = ['seismic', 'seismoacoustic', 'shift', 'ghazard']
    numeric_columns = ['genergy', 'gpuls', 'gdenergy', 'gdpuls', 'nbumps', 'nbumps2', 'nbumps3', 'nbumps4',
                       'nbumps5', 'nbumps6', 'nbumps7', 'nbumps89', 'energy', 'maxenergy']

    n_test = len(test_samples)
    if not 0 <= sample_id < n_test:
        raise IndexError(f'sample_id {sample_id} is out of range for {n_test} test samples')
    if top_n < 0:
        raise ValueError('top_n must be non-negative')

    # Remember the query's label, then hide the label column from the test rows
    # (dropped by name, so it does not have to be the last column).
    query_class = test_samples['class'].iloc[sample_id]
    test_features = test_samples.drop(columns='class')

    # After the re-index, positions 0..n_test-1 are the test rows and the rest are validation rows.
    original = pd.concat([test_features, validation_samples], axis=0).reset_index(drop=True)

    # Scale a copy so that the unscaled values stay available for the result.
    scaled = original.copy()
    scaled[numeric_columns] = MinMaxScaler().fit_transform(original[numeric_columns])

    query = scaled.iloc[sample_id]
    candidates = scaled.iloc[n_test:]

    # Columns are addressed by name; integer keys on a label-indexed Series are
    # a deprecated positional fallback in pandas.
    categorical_hits = candidates[categorical_columns].eq(query[categorical_columns], axis='columns').sum(axis=1)
    numeric_gap = np.abs(
        candidates[numeric_columns].to_numpy(dtype=float) - query[numeric_columns].to_numpy(dtype=float)
    )
    numeric_hits = (numeric_gap < epsilon).sum(axis=1)
    scores = (categorical_hits + numeric_hits).astype(int)

    # A stable sort makes the order of equally scored rows deterministic.
    nearest = scores.sort_values(ascending=False, kind='mergesort').head(top_n)

    score_df = original.loc[[sample_id, *nearest.index]].copy()
    # The query row itself is not scored; it carries 0 as a placeholder.
    score_df['score'] = [0, *nearest.tolist()]

    # Restore the query's true label (it is NaN after the concat) so the column can be cast to int.
    score_df.loc[sample_id, 'class'] = query_class
    int_columns = numeric_columns + ['class']
    score_df[int_columns] = score_df[int_columns].astype(int)

    return score_df.reset_index(drop=True)

In [476]:
# Put the number of the observation to inspect into sample_id. In the output the first row is
# the observation itself and the following 20 rows are the records closest to it.
# The epsilon parameter can be tuned as well.

sample_id = 15
a = count_sample_score(test_samples, validation_samples, epsilon=0.01, sample_id=sample_id)
a

Unnamed: 0,seismic,seismoacoustic,shift,genergy,gpuls,gdenergy,gdpuls,ghazard,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy,class,score
0,b,b,W,93280,747,273,49,a,3,0,2,1,0,0,0,0,16000,10000,0,0
1,b,b,W,105810,958,-27,-26,a,3,1,2,0,0,0,0,0,14900,8000,0,13
2,b,b,W,46430,737,3,46,a,4,2,2,0,0,0,0,0,15000,9000,0,13
3,b,b,W,886800,2519,-29,44,a,3,0,2,1,0,0,0,0,49000,40000,0,13
4,a,b,W,48250,573,9,44,a,3,1,2,0,0,0,0,0,12600,8000,0,12
5,b,a,W,92450,777,15,24,a,4,1,2,1,0,0,0,0,27600,10000,0,12
6,b,b,W,439210,2259,-21,-2,a,6,3,2,1,0,0,0,0,18000,10000,0,12
7,b,a,W,65490,735,60,72,a,3,0,2,1,0,0,0,0,72000,60000,0,12
8,b,a,W,68920,1004,-59,-51,a,3,0,2,1,0,0,0,0,66000,60000,0,12
9,a,c,W,83180,744,-32,-19,a,3,1,2,0,0,0,0,0,14600,9000,1,12
