### Import Modules

In [1]:
from os import listdir
from os.path import isfile,join,splitext
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
#from sklearn.linear_model import SGDClassifier
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import LinearSVC
#from sklearn.neural_network import MLPClassifier
#from sklearn.neighbors import KNeighborsClassifier
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.ensemble import AdaBoostClassifier


### Given code

In [10]:
"""
Script python pour ouvrir les fichiers de traces de clavier

"""

import matplotlib.pyplot as plt
import numpy as np
import time
import scipy as sp
from sklearn.preprocessing import StandardScaler

def read_int(f):
    ba = bytearray(4)
    f.readinto(ba)
    prm = np.frombuffer(ba, dtype=np.int32)
    return prm[0]
    
def read_double(f):
    ba = bytearray(8)
    f.readinto(ba)
    prm = np.frombuffer(ba, dtype=np.double)
    return prm[0]

def read_double_tab(f, n):
    ba = bytearray(8*n)
    nr = f.readinto(ba)
    if nr != len(ba):
        return []
    else:
        prm = np.frombuffer(ba, dtype=np.double)
        return prm
    
def get_pics_from_file(filename):
    f_pic = open(filename, "rb")
    info = dict()
    info["nb_pics"] = read_int(f_pic)
    info["freq_sampling_khz"] = read_double(f_pic)
    info["freq_trame_hz"] = read_double(f_pic)
    info["freq_pic_khz"] = read_double(f_pic)
    info["norm_fact"] = read_double(f_pic)
    tab_pics = []
    pics = read_double_tab(f_pic, info["nb_pics"])
    nb_trames = 1
    while len(pics) > 0:
        nb_trames = nb_trames+1
        tab_pics.append(pics)
        pics = read_double_tab(f_pic, info["nb_pics"])
    f_pic.close()
    return tab_pics, info

### Getting files and pre-processing

In [11]:
fichiers = [f for f in listdir("data/") if isfile(join("data/", f))]

In [12]:
char_mapping = []
data = []
cpt = 0
for file in fichiers:
    char_mapping.append((splitext(file)[0])[5:])
    d, _ = get_pics_from_file("data/" + file)
    for i in range(len(d)):
        d[i] = np.append(d[i], cpt)
    cpt += 1
    data.append(d)
    
print(char_mapping)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'CTRL', 'D', 'E', 'ENTER', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'NOKEY', 'O', 'P', 'Q', 'R', 'S', 'SHIFT', 'SPACE', 'SUPPR', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']


In [13]:
for t in range(len(data)):
    data[t] = data[t][0:6963]
    
data = np.array(data)
print(data.shape)

(42, 6963, 18)


In [14]:
data = data.reshape(42*6963, 18)
d = pd.DataFrame(data)
d.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.968018,1.348877,0.230713,0.941162,0.811157,0.838623,0.671997,0.506592,0.765991,0.2771,0.477295,0.569458,0.371704,0.802002,0.634155,0.509033,0.567017,0.0
1,0.754395,1.149902,0.137939,1.062012,0.5896,0.745239,0.734253,0.904541,0.669556,0.085449,0.650635,0.791626,0.339355,0.53833,0.848999,0.687256,0.356445,0.0
2,0.765991,1.292725,0.22583,1.033325,0.750122,0.550537,0.760498,0.45166,0.767822,0.393066,0.463867,0.661011,0.431519,0.609741,0.845947,0.637207,0.558472,0.0
3,0.57373,0.880127,1.459351,0.375977,1.184692,0.697021,0.614014,0.910645,1.012573,0.670776,0.264282,0.765991,0.631714,0.355835,0.656128,1.054688,0.786133,0.0
4,0.428467,0.922241,1.361694,0.296631,1.135254,0.856934,0.629883,0.707397,0.797729,0.759888,0.233154,0.765991,0.765991,0.623169,0.587158,0.908813,0.946045,0.0


In [15]:
data.shape

(292446, 18)

### Split data set into a training set and a test one

In [16]:
X_train, X_test, y_train, y_test = train_test_split(data[:, 0:17], data[:, 17], test_size=0.33)

In [17]:
# Way of defining a good combinaisons of params for the EXTRATREECLASSIFIER algo.
'''
params = [{'min_samples_leaf': range(2,20,5),
          'min_samples_split': range(2,20,5),
         'criterion': ('gini', 'entropy')}]
clf = GridSearchCV(ExtraTreesClassifier(n_estimators=100), params)
clf.fit(X_train, y_train)
print(pd.DataFrame.from_dict(clf.cv_results_).loc[:,["params","mean_test_score"]])
clf.best_estimator_
print(clf.best_score_, clf.best_params_)
'''

'\nparams = [{\'min_samples_leaf\': range(2,20,5),\n          \'min_samples_split\': range(2,20,5),\n         \'criterion\': (\'gini\', \'entropy\')}]\nclf = GridSearchCV(ExtraTreesClassifier(n_estimators=100), params)\nclf.fit(X_train, y_train)\nprint(pd.DataFrame.from_dict(clf.cv_results_).loc[:,["params","mean_test_score"]])\nclf.best_estimator_\nprint(clf.best_score_, clf.best_params_)\n'

### Training of the model.

In [18]:
# Decrease n_estimators if no sufficient memory.
# The more n_estimators is high, the more the IA will be accurate.
clf = ExtraTreesClassifier(n_estimators=120)
clf.fit(X_train, y_train)
res = clf.predict(X_test)
accuracy_score(y_test, res)

0.55049322335972151

### Detection of login/password

In [21]:
def keylogger(): 
    frames, info = get_pics_from_file('pics_LOGINMDP.bin')
    histo = [0 for j in range(42)]
    result = []
    res = clf.predict(frames)
    res2 = []
    index = 0
    for c in res:
        index += 1
        histo[int(c)] += 1
        if index % 50 == 0:
            res2.append(char_mapping[np.argmax(histo)])
            histo = [0 for j in range(42)]
    return res2

keylogger()

['NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'CTRL',
 'CTRL',
 'CTRL',
 'CTRL',
 'CTRL',
 'CTRL',
 'CTRL',
 'SUPPR',
 'SUPPR',
 'SUPPR',
 'SUPPR',
 'SUPPR',
 'SUPPR',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'NOKEY',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT',
 'SHIFT