In [244]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [245]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Fáza 2 - Predspracovanie údajov

## 2.1 Realizácia predspracovania dát

### A - Rozdelenia dát na trénovaciu a testovaciu množinu

#### zlúčime potrebné dáta do jedného datasetu 

In [246]:
# loading datasets
df_processes    = pd.read_csv("processes.csv", sep='\t')
df_connections  = pd.read_csv("connections.csv", sep='\t')

#### vyberieme si dáta, ktoré sme identifikovali v EDA

In [247]:
df_processes    = df_processes[["imei", "ts", "mwra", "p.android.gm", "p.android.documentsui", "p.system", "p.android.externalstorage", "p.android.settings", "p.android.chrome"]]
df_connections  = df_connections[["imei", "ts", "mwra", "c.katana", "c.dogalize", "c.android.chrome", "c.android.gm"]]

#### spojíme ich pomocou "ts" a "imei"

In [248]:
df = df_connections.merge(df_processes, on=['imei', 'ts'], how='outer')

#### Rozdelíme dáta v pomere 80:20 (trénovacia/testovacia množina). Tento pomer vychádza z predpokladu, že pri dostatočne veľkom množstve dát, máme pri rozdelení dostatok dát na trénovanie a efektívne učenie modelu, zatiaľ čo testovacia množina bude dostatočne veľká pre overenie modelu. Tento pomer je bežný a vhodný pre väčšinu úloh strojového učenia.

In [249]:
TRAIN = 0.8

train_df, test_df = train_test_split(df, train_size=TRAIN, random_state=42)

train_df.head()

Unnamed: 0,imei,ts,mwra_x,c.katana,c.dogalize,c.android.chrome,c.android.gm,mwra_y,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
4229,863033069630348354,2018-05-07 06:23:00,1.0,8.78526,12.97475,11.62351,14.41864,1.0,9.23972,10.43492,13.82551,9.86109,8.95921,11.39909
12572,8630330696303481545,2018-05-11 14:57:00,1.0,11.81607,14.66263,10.73155,14.21554,1.0,10.37842,10.4245,14.23429,14.15057,9.07541,11.8781
12930,8630330696303481669,2018-05-06 20:06:00,0.0,12.49268,13.38094,10.93373,14.18849,0.0,11.95766,6.4704,12.71239,9.86456,7.39561,10.2285
14497,8630330696303482196,2018-05-10 16:15:00,1.0,12.27976,13.40483,9.23485,15.58463,1.0,8.52524,13.52572,13.66325,10.38706,9.61267,9.97641
1347,359043379931766437,2018-05-14 23:22:00,0.0,13.1706,14.3461,9.63767,6.44279,0.0,12.3716,12.12072,13.74927,11.34104,9.44807,5.71597


#### ďalej pracujeme s trénovacím datasetom

## B - Transformujeme dáta na vhodný formát pre ML

In [250]:
# better info()
def analyze_dataframe(the_df):
    table = pd.DataFrame({
        'Missing Values': the_df.isnull().sum(),
        'Data Type': the_df.dtypes
    })

    rows, columns = the_df.shape
    duplicates = the_df.duplicated().sum()

    print(f"Shape: {rows} rows, {columns} columns")
    print(f"Duplicates: {duplicates}")

    display(table)
    
analyze_dataframe(train_df)

Shape: 12331 rows, 14 columns
Duplicates: 378


Unnamed: 0,Missing Values,Data Type
imei,0,int64
ts,0,object
mwra_x,0,float64
c.katana,0,float64
c.dogalize,0,float64
c.android.chrome,0,float64
c.android.gm,0,float64
mwra_y,0,float64
p.android.gm,0,float64
p.android.documentsui,0,float64


In [251]:
matches = (train_df['mwra_x'] == train_df['mwra_y']).sum()
not_matches = len(train_df) - matches

print(f"Matched: {matches}, Not matched: {not_matches}")

Matched: 12331, Not matched: 0


#### môžeme vymazať dáta, pomocou ktorých sme spájali datasety do jedného, t.j. nepotrebné stĺpce: ts, mrwa_y + premenujeme mwra_x na mwra

In [252]:
train_df = train_df.drop(columns=['ts', 'mwra_y'], errors='ignore')
train_df = train_df.rename(columns={'mwra_x': 'mwra'})

train_df.head()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
4229,863033069630348354,1.0,8.78526,12.97475,11.62351,14.41864,9.23972,10.43492,13.82551,9.86109,8.95921,11.39909
12572,8630330696303481545,1.0,11.81607,14.66263,10.73155,14.21554,10.37842,10.4245,14.23429,14.15057,9.07541,11.8781
12930,8630330696303481669,0.0,12.49268,13.38094,10.93373,14.18849,11.95766,6.4704,12.71239,9.86456,7.39561,10.2285
14497,8630330696303482196,1.0,12.27976,13.40483,9.23485,15.58463,8.52524,13.52572,13.66325,10.38706,9.61267,9.97641
1347,359043379931766437,0.0,13.1706,14.3461,9.63767,6.44279,12.3716,12.12072,13.74927,11.34104,9.44807,5.71597


In [253]:
train_df.describe()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
count,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0
mean,3.879585e+18,0.625497,10.016101,11.181063,11.023363,10.554946,9.983216,9.391937,11.163586,11.713024,9.893613,9.894593
std,3.327328e+18,0.484014,2.575195,2.654335,2.710526,2.719858,2.76354,2.365531,2.310111,2.393699,2.484082,2.569458
min,3.590434e+17,0.0,1.38479,1.80246,1.26488,1.73024,0.0,2.5514,3.05045,3.46351,2.01049,0.87927
25%,8.630331e+17,0.0,8.142165,9.318645,9.125685,8.56491,8.058725,7.78591,9.436415,10.088205,8.05881,8.04182
50%,3.590434e+18,1.0,9.86091,11.28809,11.07872,10.43777,9.87305,9.19426,11.32955,11.8158,9.81752,9.92211
75%,8.630331e+18,1.0,11.811615,13.05715,12.950765,12.51171,11.911765,10.726015,12.90069,13.3964,11.804595,11.82047
max,8.630331e+18,1.0,19.06465,21.52206,20.71065,20.88365,20.56361,20.22066,18.99915,20.06016,18.16209,17.91057


In [254]:
def remove_outlier(the_df, the_column):
    if the_column in ['imei', 'mwra']:
        return the_df
    
    lower_bound = the_df[the_column].quantile(0.05)
    upper_bound = the_df[the_column].quantile(0.95)

    outlier_mask = (the_df[the_column] > upper_bound) | (the_df[the_column] < lower_bound)

    the_df.loc[outlier_mask, the_column] = np.where(
        the_df.loc[outlier_mask, the_column] < lower_bound,
        lower_bound,
        upper_bound
    )
    return the_df

for column in train_df.columns:
    train_df = remove_outlier(train_df, column)

train_df.describe()


Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
count,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0
mean,3.879585e+18,0.625497,10.013604,11.181681,11.02457,10.552272,9.992341,9.3676,11.16249,11.71367,9.899836,9.902847
std,3.327328e+18,0.484014,2.384562,2.459327,2.493326,2.518698,2.538359,2.130225,2.161509,2.216309,2.326487,2.40104
min,3.590434e+17,0.0,6.027955,6.67659,6.556425,6.30733,5.58809,5.86954,7.26438,7.619215,5.901755,5.647485
25%,8.630331e+17,0.0,8.142165,9.318645,9.125685,8.56491,8.058725,7.78591,9.436415,10.088205,8.05881,8.04182
50%,3.590434e+18,1.0,9.86091,11.28809,11.07872,10.43777,9.87305,9.19426,11.32955,11.8158,9.81752,9.92211
75%,8.630331e+18,1.0,11.811615,13.05715,12.950765,12.51171,11.911765,10.726015,12.90069,13.3964,11.804595,11.82047
max,8.630331e+18,1.0,14.45873,15.382915,15.342235,15.10848,14.61765,13.772555,14.6901,15.49458,13.88949,13.9629


#### aplikovali sme vymazanie odľahlích hodnôt na všetky stĺpce v trénovacej množine, pričom sme použili percentyl 5 a 95, z dôvodu zachovania normálnych hodnôt, no vyradzujeme extrémne hodnoty na oboch stranách

In [255]:
def handle_missing_values(the_df):
    for column in the_df.columns:
        if the_df[column].isnull().sum() > 0:
            median_value = the_df[column].median()
            the_df[column].fillna(median_value, inplace=True)
            # df.dropna(inplace=True)
    return the_df

#### vytvorili sme funkciu na nahradenie missing values pre každý riadok

In [256]:
def normalize_data(the_df):    
    scaler = StandardScaler()
    numeric_columns = the_df.select_dtypes(include=['float64', 'int64']).columns
    the_df[numeric_columns] = scaler.fit_transform(the_df[numeric_columns])
    return the_df

#### takisto pre normalizáciu dát

In [257]:
def remove_duplicates(the_df):
    the_df = the_df.drop_duplicates()
    return the_df

#### a aj na odstránenie duplikátov, ktoré potom aplikujeme na trénovaciu množinu

In [258]:
train_df    = handle_missing_values(train_df)
train_df    = normalize_data(train_df)
train_df    = remove_duplicates(train_df)

analyze_dataframe(train_df)

Shape: 11953 rows, 12 columns
Duplicates: 0


Unnamed: 0,Missing Values,Data Type
imei,0,float64
mwra,0,float64
c.katana,0,float64
c.dogalize,0,float64
c.android.chrome,0,float64
c.android.gm,0,float64
p.android.gm,0,float64
p.android.documentsui,0,float64
p.system,0,float64
p.android.externalstorage,0,float64


## Pokračovanie Transformácie dát C

### Transformation

#### Power Transformation

In [259]:
from sklearn.preprocessing import PowerTransformer

power_transformer = PowerTransformer(method='yeo-johnson')  # Use 'box-cox' if your data is strictly positive
train_df_power_transformed = power_transformer.fit_transform(train_df)

# Convert transformed data back to DataFrame
train_df_power_transformed_df = pd.DataFrame(train_df_power_transformed, columns=train_df.columns)

In [260]:
train_df_power_transformed_df.head()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
0,-0.954377,0.77579,-0.482436,0.712906,0.225964,1.493153,-0.272124,0.567127,1.266232,-0.84331,-0.3955,0.609592
1,1.321544,0.77579,0.777242,1.449533,-0.132051,1.419115,0.179337,0.562652,1.481076,1.113335,-0.345003,0.813172
2,1.321544,-1.289009,1.037235,0.887975,-0.051382,1.409237,0.783372,-1.467547,0.695241,-0.841839,-1.078798,0.116642
3,1.321544,0.77579,0.956008,0.898319,-0.721982,1.743,-0.562348,1.77801,1.181675,-0.618857,-0.112153,0.011499
4,-1.198231,-1.289009,1.292617,1.30966,-0.564303,-1.692595,0.93854,1.251883,1.22645,-0.202346,-0.183376,-1.718902


In [261]:
train_df_power_transformed_df.describe()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
count,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0
mean,4.755578e-18,-2.044898e-16,-2.14001e-17,-3.566683e-18,-2.199455e-17,2.5561230000000003e-17,2.793902e-17,-2.4372340000000003e-17,-2.199455e-17,-3.418071e-17,1.2780610000000001e-17,2.585845e-17
std,1.000042,1.000042,1.000042,1.000042,1.000042,1.000042,1.000042,1.000042,1.000042,1.000042,1.000042,1.000042
min,-1.198231,-1.289009,-1.768827,-1.769685,-1.755643,-1.751712,-1.794814,-1.824993,-1.716732,-1.77041,-1.736851,-1.746227
25%,-0.9543775,-1.289009,-0.7716488,-0.7732359,-0.7701543,-0.7819825,-0.7523285,-0.7214916,-0.8179971,-0.7565417,-0.7862342,-0.784797
50%,0.1445602,0.7757895,-0.0124549,0.01016486,0.005133686,-0.01033706,-0.01933947,0.009882102,0.02847897,0.006395047,-0.02672437,-0.008024477
75%,1.321544,0.7757895,0.7756376,0.7518943,0.7691216,0.7928679,0.7670034,0.6912314,0.7885644,0.7487969,0.8156643,0.795191
max,1.321544,0.7757895,1.766184,1.77044,1.760508,1.743,1.763893,1.867281,1.723541,1.777698,1.70745,1.708762


#### Quantile Transformation

In [270]:
from sklearn.preprocessing import QuantileTransformer

quantile_transformer = QuantileTransformer(output_distribution='normal')  
train_df_quantile_transformed = quantile_transformer.fit_transform(train_df)

train_df_quantile_transformed_df = pd.DataFrame(train_df_quantile_transformed, columns=train_df.columns)

In [271]:
train_df_quantile_transformed_df.head()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
0,-0.628724,5.199338,-0.423745,0.628426,0.18632,1.369255,-0.230782,0.548925,1.148812,-0.741406,-0.319263,0.504864
1,5.199338,5.199338,0.672391,1.321571,-0.11344,1.290107,0.159727,0.544835,1.379133,1.001526,-0.272284,0.695598
2,5.199338,-5.199338,0.905891,0.799586,-0.05128,1.280525,0.685308,-1.334413,0.589901,-0.740299,-0.962143,0.112346
3,5.199338,5.199338,0.834811,0.80945,-0.621116,5.199338,-0.507564,1.574159,1.064151,-0.546547,-0.069932,0.022632
4,-5.199338,-5.199338,1.145292,1.195074,-0.482463,-1.596712,0.817375,1.139058,1.107254,-0.194993,-0.129597,-1.619385


In [272]:
train_df_quantile_transformed_df.describe()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
count,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0
mean,0.426276,1.292331,-0.001532,0.001263,0.00218,0.003451,-0.003228,-0.003682,0.002458,-0.000157,-0.002424,0.011499
std,3.660016,5.036379,1.803583,1.803177,1.809212,1.799671,1.804043,1.803652,1.807301,1.809757,1.802536,1.813266
min,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338,-5.199338
25%,-0.628724,-5.199338,-0.677946,-0.666074,-0.669562,-0.669571,-0.687346,-0.664474,-0.679128,-0.673717,-0.674804,-0.667364
50%,0.099274,5.199338,-0.005634,0.004196,0.001544,-0.002197,-0.007156,0.003801,0.001806,0.001488,0.00021,0.008411
75%,5.199338,5.199338,0.670508,0.669427,0.674804,0.667624,0.671099,0.670689,0.670818,0.676381,0.675098,0.681568
max,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338,5.199338


### Scaling

#### Standard Scaling

In [262]:
import numpy as np

scaler_standard = StandardScaler()
train_df_standard_scaled = scaler_standard.fit_transform(train_df)

# Shift the data to make all values non-negative
train_df_standard_scaled_shifted = train_df_standard_scaled - np.min(train_df_standard_scaled)

print("Shifted Standard Scaled Data:")
print(pd.DataFrame(train_df_standard_scaled_shifted, columns=train_df.columns).head())


Shifted Standard Scaled Data:
       imei      mwra  c.katana  c.dogalize  c.android.chrome  c.android.gm  \
0  0.936006  2.618623  1.327342    2.568217          2.084049      3.375251   
1  3.269443  2.618623  2.598707    3.254710          1.727084      3.294606   
2  3.269443  0.553825  2.882532    2.733422          1.807997      3.283865   
3  3.269443  2.618623  2.793216    2.743139          1.128099      3.649168   
4  0.784599  0.553825  3.166906    3.125971          1.289309      0.208261   

   p.android.gm  p.android.documentsui  p.system  p.android.externalstorage  \
0      1.544718               2.342561  3.073149                   1.010128   
1      1.993280               2.337669  3.262571                   2.942852   
2      2.615381               0.481100  2.557346                   1.011692   
3      1.263267               3.793785  2.997960                   1.247116   
4      2.778442               3.134095  3.037820                   1.676954   

   p.android.setting

In [263]:
train_df_standard_scaled_shifted = pd.DataFrame(train_df_standard_scaled_shifted, columns=train_df.columns)
train_df.describe()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
count,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0
mean,0.000601,-0.002517,0.000226,0.003892,-0.001521,0.002811,0.00164,0.001412,0.003683,-0.00202,-0.002872,0.004383
std,1.000493,1.000691,0.999805,0.999827,1.002249,0.999977,1.000162,0.999878,0.998474,1.001475,0.997091,1.002022
min,-1.058112,-1.292364,-1.671506,-1.831913,-1.792114,-1.68544,-1.735148,-1.642175,-1.803494,-1.847496,-1.718575,-1.772372
25%,-0.906636,-1.292364,-0.783678,-0.755599,-0.767335,-0.788051,-0.759995,-0.74144,-0.794209,-0.743771,-0.789448,-0.776919
50%,-0.086906,0.773776,-0.063009,0.048454,0.020207,-0.041854,-0.047194,-0.075375,0.084315,0.041828,-0.038427,0.011097
75%,1.427854,0.773776,0.754188,0.766029,0.774407,0.783201,0.757146,0.638245,0.802643,0.759185,0.807888,0.805114
max,1.427854,0.773776,1.864203,1.708355,1.731759,1.809027,1.822239,2.06792,1.632079,1.706018,1.714953,1.691025


#### Min-Max Scaling

In [264]:
from sklearn.preprocessing import MinMaxScaler

scaler_minmax = MinMaxScaler(feature_range=(0,5))
train_df_minmax = scaler_minmax.fit_transform(train_df)

print("\nMin-Max Scaled Data:")
print(pd.DataFrame(train_df_minmax, columns=train_df.columns).head())


Min-Max Scaled Data:
           imei  mwra  c.katana  c.dogalize  c.android.chrome  c.android.gm  \
0  3.046622e-01   5.0  1.635262    3.617003          2.883675      4.608097   
1  5.000000e+00   5.0  3.432730    4.586344          2.376062      4.492714   
2  5.000000e+00   0.0  3.834004    3.850276          2.491122      4.477347   
3  5.000000e+00   5.0  3.707729    3.863995          1.524290      5.000000   
4  4.440892e-16   0.0  4.236055    4.404562          1.753535      0.076956   

   p.android.gm  p.android.documentsui  p.system  p.android.externalstorage  \
0      2.022042               2.888379  4.417841                   1.423347   
1      2.652582               2.881786  4.693087                   4.146700   
2      3.527066               0.380146  3.668338                   1.425550   
3      1.626408               4.843835  4.308586                   1.757280   
4      3.756279               3.954934  4.366506                   2.362954   

   p.android.settings  p.and

In [265]:
train_df_minmax_scaled = pd.DataFrame(train_df_minmax, columns=train_df.columns)
train_df_minmax_scaled.describe()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
count,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0,11953.0
mean,2.12938,3.121392,2.364069,2.592748,2.540661,2.415605,2.441101,2.21502,2.630095,2.596692,2.498454,2.565046
std,2.012283,2.421645,1.413868,1.412078,1.422085,1.4308,1.405753,1.34751,1.453141,1.409134,1.451992,1.446589
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.304662,0.0,1.255516,1.520102,1.454052,1.284014,1.370604,1.213897,1.468874,1.553004,1.353021,1.437105
50%,1.953378,5.0,2.274646,2.655684,2.57149,2.351698,2.372463,2.111536,2.747444,2.658388,2.446679,2.574739
75%,5.0,5.0,3.430281,3.669131,3.641619,3.532215,3.502984,3.073264,3.792871,3.667751,3.679106,3.721038
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
