In [292]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [293]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import RFE
from sklearn.svm import SVR

# Fáza 2 - Predspracovanie údajov

## 2.1 Realizácia predspracovania dát

### A - Rozdelenie dát na trénovaciu a testovaciu množinu

#### zlúčime potrebné dáta do jedného datasetu 

In [294]:
# Load the processes and connections datasets from their tab-separated files.
df_processes, df_connections = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in ("data/raw/processes.csv", "data/raw/connections.csv")
)

#### vyberieme si dáta, ktoré sme identifikovali v EDA

In [295]:
# Keep the merge keys (imei, ts), the mwra label and the attributes picked during EDA.
PROCESS_COLUMNS = [
    "imei", "ts", "mwra",
    "p.android.gm", "p.android.documentsui", "p.system",
    "p.android.externalstorage", "p.android.settings", "p.android.chrome",
]
CONNECTION_COLUMNS = [
    "imei", "ts", "mwra",
    "c.katana", "c.dogalize", "c.android.chrome", "c.android.gm",
]

df_processes = df_processes[PROCESS_COLUMNS]
df_connections = df_connections[CONNECTION_COLUMNS]

#### spojíme ich pomocou "ts" a "imei"

In [296]:
# Outer-join both tables on imei and ts. The shared "mwra" column is not a join
# key, so it appears twice: mwra_x (from connections) and mwra_y (from processes).
df = pd.merge(df_connections, df_processes, how='outer', on=['imei', 'ts'])

#### Rozdelíme dáta v pomere 80:20 (trénovacia/testovacia množina). Tento pomer vychádza z predpokladu, že pri dostatočne veľkom množstve dát zostane po rozdelení dostatok dát na efektívne trénovanie modelu, zatiaľ čo testovacia množina bude dostatočne veľká na overenie modelu. Tento pomer je bežný a vhodný pre väčšinu úloh strojového učenia.

In [297]:
# Share of rows that goes into the training set; the remainder becomes the test set.
TRAIN = 0.8

# Shuffled split with a fixed seed so the same rows land in each part on every run.
train_df, test_df = train_test_split(
    df,
    train_size=TRAIN,
    shuffle=True,
    random_state=42,
)

train_df.head()

Unnamed: 0,imei,ts,mwra_x,c.katana,c.dogalize,c.android.chrome,c.android.gm,mwra_y,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
4229,3590433799317662410,2018-05-08 05:47:00,1.0,7.37476,6.84145,14.51324,8.56085,1.0,10.00362,14.20752,4.78716,10.20058,12.70029,8.13225
12572,8630330696303481594,2018-05-13 20:05:00,1.0,7.58574,12.51114,10.92101,15.39765,1.0,11.05524,8.31027,13.02334,11.1051,7.75706,13.55727
12930,359043379931766650,2018-05-14 02:00:00,0.0,11.17412,12.19228,7.55783,4.73609,0.0,9.57015,7.87946,6.4443,8.625,4.71937,9.00245
14497,3590433799317661156,2018-05-15 03:13:00,0.0,12.71124,17.73296,10.25499,9.71869,0.0,11.8928,9.72879,11.67246,9.45534,4.74844,13.72474
1347,863033069630348750,2018-05-06 07:24:00,1.0,6.97742,13.08809,14.43611,10.57469,1.0,9.81065,5.95691,13.1329,11.79101,7.60488,13.76573


#### ďalej pracujeme s trénovacím datasetom

### B - Transformujeme dáta na vhodný formát pre ML

In [298]:
def analyze_dataframe(the_df):
    """Print a compact overview of a DataFrame (a richer alternative to ``info()``).

    Prints the number of rows and columns and the number of fully duplicated
    rows, then shows a table with one row per column holding its count of
    missing values and its dtype.

    Args:
        the_df: DataFrame to inspect; it is only read, never modified.

    Returns:
        None. The summary is printed and rendered with ``display``, which is
        the helper IPython/Jupyter injects into the notebook namespace.
    """
    n_rows, n_cols = the_df.shape
    n_duplicates = the_df.duplicated().sum()

    summary = pd.DataFrame({
        'Missing Values': the_df.isna().sum(),
        'Data Type': the_df.dtypes,
    })

    print(f"Shape: {n_rows} rows, {n_cols} columns")
    print(f"Duplicates: {n_duplicates}")

    display(summary)
    
# Overview of the training split right after the train/test split.
analyze_dataframe(the_df=train_df)

Shape: 12331 rows, 14 columns
Duplicates: 393


Unnamed: 0,Missing Values,Data Type
imei,0,int64
ts,0,object
mwra_x,0,float64
c.katana,0,float64
c.dogalize,0,float64
c.android.chrome,0,float64
c.android.gm,0,float64
mwra_y,0,float64
p.android.gm,0,float64
p.android.documentsui,0,float64


In [299]:
# Sanity check: count the rows where both copies of the label agree after the merge.
labels_agree = train_df['mwra_x'].eq(train_df['mwra_y'])
matches = labels_agree.sum()
not_matches = len(train_df) - matches

print(f"Matched: {matches}, Not matched: {not_matches}")

Matched: 12331, Not matched: 0


#### môžeme vymazať stĺpce, ktoré už nepotrebujeme: ts (slúžil len na spojenie datasetov) a mwra_y (zhoduje sa s mwra_x) + premenujeme mwra_x na mwra

In [300]:
# Drop the timestamp and the redundant second label copy, then give the remaining
# label its plain name. errors='ignore' keeps the cell re-runnable once the
# columns are already gone.
train_df = (
    train_df
    .drop(columns=['ts', 'mwra_y'], errors='ignore')
    .rename(columns={'mwra_x': 'mwra'})
)

train_df.head()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
4229,3590433799317662410,1.0,7.37476,6.84145,14.51324,8.56085,10.00362,14.20752,4.78716,10.20058,12.70029,8.13225
12572,8630330696303481594,1.0,7.58574,12.51114,10.92101,15.39765,11.05524,8.31027,13.02334,11.1051,7.75706,13.55727
12930,359043379931766650,0.0,11.17412,12.19228,7.55783,4.73609,9.57015,7.87946,6.4443,8.625,4.71937,9.00245
14497,3590433799317661156,0.0,12.71124,17.73296,10.25499,9.71869,11.8928,9.72879,11.67246,9.45534,4.74844,13.72474
1347,863033069630348750,1.0,6.97742,13.08809,14.43611,10.57469,9.81065,5.95691,13.1329,11.79101,7.60488,13.76573


In [301]:
# Summary statistics of the training columns before the outlier handling below.
train_df.describe()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
count,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0
mean,3.838281e+18,0.629389,9.998403,11.147842,11.048739,10.537921,9.970223,9.403644,11.15206,11.715285,9.925758,9.88888
std,3.318556e+18,0.482988,2.585729,2.65839,2.706215,2.715853,2.743161,2.373004,2.319367,2.369038,2.471026,2.567799
min,3.590434e+17,0.0,1.38479,1.44836,1.26488,1.81127,0.0,1.61318,3.05045,3.46351,2.01049,0.87927
25%,8.630331e+17,0.0,8.125205,9.301445,9.16412,8.54286,8.05538,7.79128,9.418615,10.13498,8.10504,8.05774
50%,3.590434e+18,1.0,9.82827,11.24298,11.1035,10.416,9.86639,9.21447,11.33374,11.81994,9.85524,9.90088
75%,8.630331e+18,1.0,11.81316,13.037845,12.97691,12.49305,11.883565,10.73959,12.890415,13.362925,11.820695,11.816795
max,8.630331e+18,1.0,20.03602,21.52206,20.71065,20.88365,20.56361,20.22066,19.38558,20.06016,18.16209,17.99659


In [302]:
def remove_outlier(the_df, the_column, lower_quantile=0.05, upper_quantile=0.95,
                   skip_columns=('imei', 'mwra')):
    """Cap the extreme values of one column at two quantiles of that column.

    Despite the name, no rows are removed: values below the lower quantile are
    replaced by the lower quantile and values above the upper quantile by the
    upper quantile. Missing values are left as they are.

    Args:
        the_df: DataFrame holding the column. It is not modified.
        the_column: Name of the column to cap.
        lower_quantile: Quantile (0..1) used as the lower cap, 0.05 by default.
        upper_quantile: Quantile (0..1) used as the upper cap, 0.95 by default.
        skip_columns: Column names that must never be capped; by default the
            ``imei`` and ``mwra`` columns.

    Returns:
        ``the_df`` itself when the column is skipped or is not numeric,
        otherwise a copy of ``the_df`` with the capped column.

    Raises:
        ValueError: If the quantiles are outside 0..1 or in the wrong order.
    """
    if not 0 <= lower_quantile <= upper_quantile <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= lower_quantile <= upper_quantile <= 1"
        )

    if the_column in (skip_columns or ()):
        return the_df

    values = the_df[the_column]
    # Quantiles are undefined for text-like columns and meaningless for booleans;
    # leave such columns alone instead of failing in the middle of a column loop.
    if not pd.api.types.is_numeric_dtype(values) or pd.api.types.is_bool_dtype(values):
        return the_df

    lower_bound = values.quantile(lower_quantile)
    upper_bound = values.quantile(upper_quantile)

    # Work on a copy so the caller's frame is never changed behind its back, and
    # replace the whole column so an integer column can take the float bounds.
    capped_df = the_df.copy()
    capped_df[the_column] = values.clip(lower=lower_bound, upper=upper_bound)
    return capped_df

# Cap every column in turn; remove_outlier itself leaves 'imei' and 'mwra' untouched.
for column in list(train_df.columns):
    train_df = remove_outlier(train_df, the_column=column)

train_df.describe()

Unnamed: 0,imei,mwra,c.katana,c.dogalize,c.android.chrome,c.android.gm,p.android.gm,p.android.documentsui,p.system,p.android.externalstorage,p.android.settings,p.android.chrome
count,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0,12331.0
mean,3.838281e+18,0.629389,9.99359,11.150422,11.050504,10.534418,9.975835,9.380079,11.15052,11.715741,9.932958,9.896109
std,3.318556e+18,0.482988,2.388854,2.463512,2.49272,2.517747,2.514924,2.135578,2.175924,2.192739,2.312348,2.397355
min,3.590434e+17,0.0,6.002805,6.65621,6.561885,6.316785,5.623535,5.86077,7.201735,7.63846,5.96703,5.63383
25%,8.630331e+17,0.0,8.125205,9.301445,9.16412,8.54286,8.05538,7.79128,9.418615,10.13498,8.10504,8.05774
50%,3.590434e+18,1.0,9.82827,11.24298,11.1035,10.416,9.86639,9.21447,11.33374,11.81994,9.85524,9.90088
75%,8.630331e+18,1.0,11.81316,13.037845,12.97691,12.49305,11.883565,10.73959,12.890415,13.362925,11.820695,11.816795
max,8.630331e+18,1.0,14.431685,15.35379,15.3746,15.097205,14.56645,13.775495,14.68506,15.466515,13.907715,13.953235


#### na všetky stĺpce v trénovacej množine (okrem imei a mwra) sme aplikovali orezanie odľahlých hodnôt na 5. a 95. percentil: hodnoty mimo tohto intervalu nevymazávame, ale nahrádzame príslušnou hraničnou hodnotou, čím zachováme bežné hodnoty a obmedzíme vplyv extrémov na oboch stranách

In [303]:
def handle_missing_values(the_df):
    """Fill missing values in numeric columns with the median of that column.

    Non-numeric columns and columns that contain no numbers at all are left
    unchanged, because no median can be computed for them.

    Args:
        the_df: DataFrame to impute. It is not modified.

    Returns:
        A new DataFrame in which every missing value of a numeric column is
        replaced by that column's median.
    """
    # One median per numeric column; all-NaN columns have no median and are dropped.
    medians = the_df.median(numeric_only=True).dropna()
    if medians.empty:
        return the_df.copy()

    # Filling by column name in a single call avoids the chained
    # `df[col].fillna(..., inplace=True)` pattern, which pandas deprecates and
    # which no longer updates the frame under Copy-on-Write.
    return the_df.fillna(medians)

#### vytvorili sme funkciu, ktorá chýbajúce hodnoty v každom stĺpci nahradí mediánom daného stĺpca

In [304]:
def normalize_data(the_df, exclude=('mwra',)):
    """Standardize numeric columns to zero mean and unit variance.

    Every numeric column except those listed in ``exclude`` is passed through
    a freshly fitted ``StandardScaler``. The label column ``mwra`` is excluded
    by default so that it keeps its original 0/1 values.

    NOTE(review): the scaler is fitted on whatever frame is passed in and is
    then discarded. If the test split should be scaled with the training
    statistics, the fitted scaler has to be kept by the caller - confirm how
    the test set is handled.

    Args:
        the_df: DataFrame to scale. It is not modified.
        exclude: Column name or iterable of column names to leave unscaled.

    Returns:
        A new DataFrame with the selected numeric columns standardized
        (those columns become float), all other columns unchanged.
    """
    if isinstance(exclude, str):
        exclude = (exclude,)
    excluded = set(exclude or ())

    # 'number' covers every integer/float width, not only float64 and int64.
    numeric_columns = the_df.select_dtypes(include='number').columns
    columns_to_scale = [name for name in numeric_columns if name not in excluded]

    scaled_df = the_df.copy()
    # StandardScaler raises on input without rows or columns; nothing to do then.
    if not columns_to_scale or scaled_df.empty:
        return scaled_df

    scaler = StandardScaler()
    scaled_df[columns_to_scale] = scaler.fit_transform(scaled_df[columns_to_scale])
    return scaled_df

#### takisto pre normalizáciu dát (štandardizácia pomocou StandardScaler na nulový priemer a jednotkový rozptyl)

In [305]:
def remove_duplicates(the_df):
    """Return the frame without fully duplicated rows.

    Uses the pandas defaults: the first occurrence of each row is kept and the
    surviving rows keep their original index labels. The input is not modified.
    """
    return the_df.drop_duplicates()

#### a aj na odstránenie duplikátov, ktoré potom aplikujeme na trénovaciu množinu

In [306]:
# Clean the training split: fill missing values, scale, then drop duplicate rows.
train_df = (
    train_df
    .pipe(handle_missing_values)
    .pipe(normalize_data)
    .pipe(remove_duplicates)
)

analyze_dataframe(train_df)

Shape: 11938 rows, 12 columns
Duplicates: 0


Unnamed: 0,Missing Values,Data Type
imei,0,float64
mwra,0,float64
c.katana,0,float64
c.dogalize,0,float64
c.android.chrome,0,float64
c.android.gm,0,float64
p.android.gm,0,float64
p.android.documentsui,0,float64
p.system,0,float64
p.android.externalstorage,0,float64


In [310]:
def get_top_correlations(the_df, target="mwra", top_n=5):
    """Rank numeric columns by the strength of their correlation with a target.

    Args:
        the_df: DataFrame containing the target and the candidate columns.
            Only numeric columns are considered.
        target: Name of the numeric column to correlate against.
        top_n: How many of the strongest columns to return.

    Returns:
        A Series indexed by column name holding the absolute value of the
        correlation with ``target``, sorted from strongest to weakest and cut
        to ``top_n`` entries. The sign of the correlation is not preserved.

    Raises:
        KeyError: If ``target`` is missing from the frame or is not numeric.
    """
    numeric_df = the_df.select_dtypes(include=[np.number])
    if target not in numeric_df.columns:
        raise KeyError(f"target column {target!r} is missing or not numeric")

    # Correlation of every numeric column with the target, without the target itself.
    target_correlations = numeric_df.corr()[target].drop(target)
    return target_correlations.abs().sort_values(ascending=False).head(top_n)

# Strongest absolute correlations with the mwra label on the cleaned training data.
features1 = get_top_correlations(the_df=train_df)
print(features1)

c.katana                 0.573300
c.android.chrome         0.548809
p.android.gm             0.537223
p.system                 0.302558
p.android.documentsui    0.301447
Name: mwra, dtype: float64


In [314]:
def get_top_rfe(X, y, n_features_to_select=5, step=1, estimator=None):
    """Select features with recursive feature elimination (RFE).

    Args:
        X: DataFrame of candidate features; its column names are used to
            report the selection.
        y: Target values aligned with the rows of ``X``.
        n_features_to_select: Number of features to keep.
        step: Number of features removed in each elimination round.
        estimator: Estimator whose weights drive the elimination. When omitted,
            a linear-kernel ``SVR`` is used.

    Returns:
        List with the names of the selected columns, in the column order of ``X``.
    """
    if estimator is None:
        estimator = SVR(kernel="linear")

    selector = RFE(estimator, n_features_to_select=n_features_to_select, step=step)
    selector.fit(X, y)

    # support_ is a boolean mask over the columns of X marking the kept features.
    return X.columns[selector.support_].tolist()

# Candidate features for RFE: everything except the label and 'imei'. 'imei' is
# the key the two source tables were joined on, not a measurement, so it must
# not compete with the real attributes during feature selection.
X = train_df.drop(columns=['imei', 'mwra'])
y = train_df['mwra']

features2 = get_top_rfe(X, y)
print("Top 5 selected features:", features2)


Top 5 selected features: ['c.katana', 'c.android.chrome', 'c.android.gm', 'p.android.gm', 'p.system']
