In [1]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

### Датасет

In [2]:
df = pd.read_csv("./features.csv", sep="|")
df.head()

Unnamed: 0,Button.left_dwell,Button.left_interval,Button.left_flight,Key.tab,Key.tab_dwell,Key.tab.1,Key.tab_interval,Key.tab.2,Key.tab_flight,Key.shift,...,тор_dwell_second,тор_dwell_third,тор_interval_first,тор_interval_second,тор_flight_first,тор_flight_second,тор_up_to_up_first,тор_up_to_up_second,тор_latency,is_insider
0,82.18,472.27,469.34,16,86.31,16,478.56,16,502.38,215,...,115.0,110.75,372.5,563.25,233.25,178.25,237.5,174.0,522.25,0
1,75.87,479.03,460.12,0,0.0,0,0.0,0,0.0,59,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,94.25,477.76,422.48,2,70.5,2,414.5,2,485.0,79,...,149.0,92.0,984.0,975.0,120.0,124.0,133.0,67.0,336.0,0
3,10.33,556.47,514.17,165,175.98,165,477.89,165,496.3,131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,27.35,464.8,492.16,12,153.42,12,510.17,12,413.58,16,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [3]:
x = df.loc[:, df.columns != 'is_insider']
y = df['is_insider']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.25)

### Метрики 

In [4]:
def report(clf, x_train, y_train, x_test, y_test):
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print("accuracy train:", clf.score(x_train,y_train), "accuracy test", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

### Логистическая регрессия

In [5]:
clf = LogisticRegression(solver='lbfgs', random_state=0).fit(x_train, y_train)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.9090909090909091
              precision    recall  f1-score   support

           0       1.00      0.67      0.80         3
           1       0.89      1.00      0.94         8

    accuracy                           0.91        11
   macro avg       0.94      0.83      0.87        11
weighted avg       0.92      0.91      0.90        11



### Naive Bayes Classifier 

In [6]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 0.9354838709677419 accuracy test 0.7272727272727273
              precision    recall  f1-score   support

           0       0.50      0.33      0.40         3
           1       0.78      0.88      0.82         8

    accuracy                           0.73        11
   macro avg       0.64      0.60      0.61        11
weighted avg       0.70      0.73      0.71        11



### Support Vector Classification 

In [7]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.2727272727272727
              precision    recall  f1-score   support

           0       0.27      1.00      0.43         3
           1       0.00      0.00      0.00         8

    accuracy                           0.27        11
   macro avg       0.14      0.50      0.21        11
weighted avg       0.07      0.27      0.12        11



  'precision', 'predicted', average, warn_for)


### Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, n_estimators = 200, random_state=0)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.8181818181818182
              precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.80      1.00      0.89         8

    accuracy                           0.82        11
   macro avg       0.90      0.67      0.69        11
weighted avg       0.85      0.82      0.78        11



### AdaBoost 

In [9]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.8181818181818182
              precision    recall  f1-score   support

           0       0.67      0.67      0.67         3
           1       0.88      0.88      0.88         8

    accuracy                           0.82        11
   macro avg       0.77      0.77      0.77        11
weighted avg       0.82      0.82      0.82        11



### Gradient Boosting Classifier 

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=1, random_state=0)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.6363636363636364
              precision    recall  f1-score   support

           0       0.33      0.33      0.33         3
           1       0.75      0.75      0.75         8

    accuracy                           0.64        11
   macro avg       0.54      0.54      0.54        11
weighted avg       0.64      0.64      0.64        11

