In [1]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

### Датасет

In [2]:
df = pd.read_csv("./features.csv", sep="|")
df.head()

Unnamed: 0,Button.left_dwell,Button.left_interval,Button.left_flight,Key.tab,Key.tab_dwell,Key.tab.1,Key.tab_interval,Key.tab.2,Key.tab_flight,Key.shift,...,Key.enter_interval,Key.enter.2,Key.enter_flight,Key.backspace,Key.backspace_dwell,Key.backspace.1,Key.backspace_interval,Key.backspace.2,Key.backspace_flight,is_insider
0,82.18,472.27,469.34,16,86.31,16,478.56,16,502.38,215,...,448.26,50,479.96,373,65.12,373,190.68,373,247.75,0
1,75.87,479.03,460.12,0,0.0,0,0.0,0,0.0,59,...,10.0,2,52.0,74,67.15,74,203.36,74,243.49,0
2,94.25,477.76,422.48,2,70.5,2,414.5,2,485.0,79,...,405.86,36,446.53,139,65.89,139,174.32,139,225.83,0
3,10.33,556.47,514.17,165,175.98,165,477.89,165,496.3,131,...,417.72,50,368.06,221,139.16,221,290.35,221,393.31,0
4,27.35,464.8,492.16,12,153.42,12,510.17,12,413.58,16,...,454.0,2,496.0,4,73.25,4,360.0,4,433.25,0


In [3]:
x = df.loc[:, df.columns != 'is_insider']
y = df['is_insider']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 0, test_size = 0.25)

### Метрики 

In [4]:
def report(clf, x_train, y_train, x_test, y_test):
    y_pred = clf.fit(x_train, y_train).predict(x_test)
    print("accuracy train:", clf.score(x_train,y_train), "accuracy test", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

### Логистическая регрессия

In [5]:
clf = LogisticRegression(solver='lbfgs', random_state=0).fit(x_train, y_train)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.6363636363636364
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.67      0.67      0.67         6

    accuracy                           0.64        11
   macro avg       0.63      0.63      0.63        11
weighted avg       0.64      0.64      0.64        11





### Naive Bayes Classifier 

In [6]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 0.96 accuracy test 0.6363636363636364
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.67      0.67      0.67         6

    accuracy                           0.64        11
   macro avg       0.63      0.63      0.63        11
weighted avg       0.64      0.64      0.64        11



### Support Vector Classification 

In [7]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.45454545454545453
              precision    recall  f1-score   support

           0       0.45      1.00      0.62         5
           1       0.00      0.00      0.00         6

    accuracy                           0.45        11
   macro avg       0.23      0.50      0.31        11
weighted avg       0.21      0.45      0.28        11



  'precision', 'predicted', average, warn_for)


### Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=2, n_estimators = 200, random_state=0)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 0.96 accuracy test 0.8181818181818182
              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.75      1.00      0.86         6

    accuracy                           0.82        11
   macro avg       0.88      0.80      0.80        11
weighted avg       0.86      0.82      0.81        11



### AdaBoost 

In [9]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                         algorithm="SAMME",
                         n_estimators=200)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.7272727272727273
              precision    recall  f1-score   support

           0       0.75      0.60      0.67         5
           1       0.71      0.83      0.77         6

    accuracy                           0.73        11
   macro avg       0.73      0.72      0.72        11
weighted avg       0.73      0.73      0.72        11



### Gradient Boosting Classifier 

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=1, random_state=0)
report(clf, x_train, y_train, x_test, y_test)

accuracy train: 1.0 accuracy test 0.6363636363636364
              precision    recall  f1-score   support

           0       0.60      0.60      0.60         5
           1       0.67      0.67      0.67         6

    accuracy                           0.64        11
   macro avg       0.63      0.63      0.63        11
weighted avg       0.64      0.64      0.64        11

