In [1]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

### Датасет

In [2]:
df = pd.read_csv("./features.csv", sep="|")
df.head()

Unnamed: 0,Button.left_dwell,Button.left_interval,Button.left_flight,Button.right_dwell,Button.right_interval,Button.right_flight,Key.esc_dwell,Key.esc_interval,Key.esc_flight,Key.tab_dwell,...,год_dwell_second,год_dwell_third,год_interval_first,год_interval_second,год_flight_first,год_flight_second,год_up_to_up_first,год_up_to_up_second,год_latency,is_insider
0,82.18,472.27,469.34,0.0,0.0,0.0,0.0,0.0,0.0,86.31,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,75.87,479.03,460.12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,94.25,477.76,422.48,0.0,0.0,0.0,0.0,0.0,0.0,70.5,...,60.5,81.0,72.5,133.5,135.0,194.0,133.0,214.5,410.0,0
3,10.33,556.47,514.17,0.0,0.0,0.0,0.0,0.0,0.0,175.98,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,27.35,464.8,492.16,0.0,0.0,0.0,0.0,0.0,0.0,153.42,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [3]:
x = df.loc[:, df.columns != 'is_insider']
y = df['is_insider']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.3)

### Метрики 

In [4]:
def report(clf, x_train, y_train, x_test, y_test):
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print("accuracy train:", clf.score(x_train,y_train), "accuracy test", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

### Логистическая регрессия

In [5]:
clf = LogisticRegression(solver='lbfgs', random_state=0).fit(x_train, y_train)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.6363636363636364
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.67      0.67      0.67         6

    accuracy                           0.64        11
   macro avg       0.63      0.63      0.63        11
weighted avg       0.64      0.64      0.64        11



### Naive Bayes Classifier 

In [6]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 0.88 accuracy test 0.6363636363636364
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.67      0.67      0.67         6

    accuracy                           0.64        11
   macro avg       0.63      0.63      0.63        11
weighted avg       0.64      0.64      0.64        11



### Support Vector Classification 

In [7]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.45454545454545453
              precision    recall  f1-score   support

           0       0.45      1.00      0.62         5
           1       0.00      0.00      0.00         6

    accuracy                           0.45        11
   macro avg       0.23      0.50      0.31        11
weighted avg       0.21      0.45      0.28        11



  'precision', 'predicted', average, warn_for)


### Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, n_estimators = 200, random_state=0)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.9090909090909091
              precision    recall  f1-score   support

           0       0.83      1.00      0.91         5
           1       1.00      0.83      0.91         6

    accuracy                           0.91        11
   macro avg       0.92      0.92      0.91        11
weighted avg       0.92      0.91      0.91        11



### AdaBoost 

In [9]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.6363636363636364
              precision    recall  f1-score   support

           0       0.57      0.80      0.67         5
           1       0.75      0.50      0.60         6

    accuracy                           0.64        11
   macro avg       0.66      0.65      0.63        11
weighted avg       0.67      0.64      0.63        11



### Gradient Boosting Classifier 

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=1, random_state=0)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.7272727272727273
              precision    recall  f1-score   support

           0       0.67      0.80      0.73         5
           1       0.80      0.67      0.73         6

    accuracy                           0.73        11
   macro avg       0.73      0.73      0.73        11
weighted avg       0.74      0.73      0.73        11

