In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
import statsmodels.stats.api as sms
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif, chi2, f_classif
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Načítanie dát

V predchádzajúcej fáze sme zistili, že dôležité dáta sú len v tabuľkách connections a processes. Nedáva preto zmysel znova pracovať so všetkými dátami. Načítame teda len tabuľky connections a processes a vytvoríme pre ne novú pipeline.

In [59]:
# Both tables are tab-separated; malformed rows are reported with a warning
# and skipped instead of aborting the load.
connections = pd.read_csv(
    'data/081/connections.csv',
    sep='\t',
    on_bad_lines='warn',
)
processes = pd.read_csv(
    'data/081/processes.csv',
    sep='\t',
    on_bad_lines='warn',
)

In [60]:
# Inner-join the two tables on the columns they share (label, device id, timestamp).
join_keys = ['mwra', 'imei', 'ts']
data = connections.merge(processes, on=join_keys)

In [61]:
# The join keys `ts` and `imei` are dropped here, so they are not used as features.
data = data.drop(['ts', 'imei'], axis=1)

In [62]:
# `mwra` is the target label; every remaining column is a feature.
y = data['mwra']
X = data.drop(columns=['mwra'])

Teraz si vytvoríme pipeline na predspracovanie našich dát.

Definujeme si, ktoré stĺpce pripomínajú normálnu distribúciu.

In [63]:
# Columns whose distribution resembles a normal one.
normal = [
    'c.android.youtube',
    'c.android.chrome',
    'c.android.gm',
    'c.dogalize',
    'c.katana',
    'c.UCMobile.x86',
    'c.updateassist',
    'p.android.packageinstaller',
    'p.android.documentsui',
    'p.system',
    'p.android.externalstorage',
    'p.android.chrome',
    'p.android.settings',
    'p.android.gm',
    'p.inputmethod.latin',
    'p.process.gapps',
    'p.notifier',
]

# Columns that do not resemble a normal distribution.
non_normal = [
    'c.android.vending',
    'c.UCMobile.intl',
    'c.raider',
    'p.android.vending',
    'p.katana',
    'p.google',
    'p.android.defcontainer',
    'p.simulator',
    'p.android.gms',
    'p.dogalize',
    'p.gms.persistent',
    'p.browser.provider',
    'p.olauncher',
]


Zdroj: https://stackoverflow.com/questions/52346725/can-i-add-outlier-detection-and-removal-to-scikit-learn-pipeline

In [64]:
class OutlierRemover(TransformerMixin):
    """Drop rows that LocalOutlierFactor scores as outliers.

    Parameters
    ----------
    neg_conf_val : float, default -10.0
        Rows whose ``negative_outlier_factor_`` is not strictly greater than
        this threshold are removed.
    **kwargs
        Every other keyword argument is forwarded unchanged to
        ``LocalOutlierFactor``.

    Notes
    -----
    A new ``LocalOutlierFactor`` is fitted on whatever ``X`` is passed to
    ``transform``; ``fit`` itself learns nothing. Because rows are removed,
    the matching ``y`` must be filtered with the same mask. A scikit-learn
    ``Pipeline`` passes only ``X`` to ``transform``, so call this on the
    training data before the pipeline rather than using it as a pipeline step.
    """

    def __init__(self, **kwargs):
        self.threshold = kwargs.pop('neg_conf_val', -10.0)

        self.kwargs = kwargs

    def transform(self, X, y=None):
        """Return ``X`` (and ``y``, when given) without the outlier rows.

        Parameters
        ----------
        X : array-like
            Feature matrix; converted with ``np.asarray``.
        y : array-like, optional
            Labels aligned with the rows of ``X``.

        Returns
        -------
        numpy.ndarray or tuple of numpy.ndarray
            ``X_kept`` when ``y`` is ``None``, otherwise ``(X_kept, y_kept)``.

        Raises
        ------
        ValueError
            If ``y`` is given and its length differs from the row count of ``X``.
        """
        X = np.asarray(X)
        if y is not None:
            y = np.asarray(y)
            if y.shape[0] != X.shape[0]:
                raise ValueError(
                    f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
                )

        lof = LocalOutlierFactor(**self.kwargs)
        lof.fit(X)
        # negative_outlier_factor_ is close to -1 for inliers and decreases
        # for outliers, so keep the rows scoring above the threshold.
        keep = lof.negative_outlier_factor_ > self.threshold

        if y is None:
            return X[keep]
        return X[keep], y[keep]

    def fit(self, *args, **kwargs):
        """No-op; present only to satisfy the transformer interface."""
        return self

In [65]:
# Columns resembling a normal distribution: Yeo-Johnson power transform, then standard scaling.
normal_preprocessor = Pipeline([
    ("power_transformer", PowerTransformer(method='yeo-johnson', standardize=True)),
    ("scaler", StandardScaler()),
])

# Remaining columns: map onto a normal distribution via quantiles, then standard scaling.
non_normal_preprocessor = Pipeline([
    (
        "quantile",
        QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=0),
    ),
    ("scaler", StandardScaler()),
])

In [66]:
# Route each column group through its own preprocessing pipeline.
# Columns named in neither list are dropped (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ("normal", normal_preprocessor, normal),
    ("non_normal", non_normal_preprocessor, non_normal),
])

In [None]:
# The outlier-removal step stays disabled. NOTE(review): a Pipeline step only
# transforms X, so a step that drops rows would presumably leave y misaligned.
preprocessing_pipeline = Pipeline([
    #('outlier_removal', OutlierRemover()),
    ('preprocessor', preprocessor),
])

In [68]:
# Hold out 20 % of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train

Unnamed: 0,c.android.youtube,c.android.chrome,c.android.gm,c.dogalize,c.katana,c.UCMobile.x86,c.updateassist,c.android.vending,c.UCMobile.intl,c.raider,...,p.notifier,p.katana,p.google,p.android.defcontainer,p.simulator,p.android.gms,p.dogalize,p.gms.persistent,p.browser.provider,p.olauncher
10796,12.35901,12.76123,15.20486,14.77364,15.65686,67.65864,48.48560,15.96998,84.97930,72.59362,...,50.77216,25.04440,4.71404,53.30486,93.04972,64.77069,55.01637,14.95486,94.29597,6.03221
13945,9.83714,12.68158,9.33423,15.67938,9.77754,87.35159,57.64731,70.41426,37.32368,71.72083,...,41.89550,59.90736,73.23543,85.00504,53.38429,41.03637,72.70814,56.57483,21.78810,21.31181
3369,13.67615,6.84622,7.37962,12.09579,14.50501,92.70666,51.34238,34.01909,91.75063,8.77940,...,39.15761,42.52790,5.82917,44.95482,66.92792,96.95918,53.06914,16.41730,31.54435,40.87545
14417,10.67743,9.47139,11.59696,14.25648,7.98076,47.10978,54.80350,99.83519,53.01982,51.51468,...,58.29419,25.08306,58.16017,29.86004,52.96125,18.31087,87.08361,40.32422,55.02039,48.36611
8515,12.75614,16.11729,9.74631,13.87215,10.28276,53.97291,53.33532,88.93986,36.22287,84.88589,...,42.37053,80.01012,16.41476,68.23980,45.42676,85.62236,21.03286,79.53227,79.14723,39.85409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,13.89253,7.61448,11.13107,9.06339,11.84964,29.65455,43.13637,33.80155,29.59525,34.38697,...,39.96529,92.82518,20.66890,82.20455,74.62085,78.63652,31.45058,82.32749,8.27562,62.78223
13418,17.18382,15.39026,16.05019,12.30027,10.90511,68.51113,30.21832,14.79709,98.72985,0.68415,...,44.46964,24.23300,90.27077,44.54679,87.77730,28.39800,74.74521,71.36401,66.29039,16.48311
5390,15.30801,12.57503,17.35740,11.18722,12.95948,58.81747,48.23569,66.58496,47.40380,38.52421,...,47.81309,94.12542,96.30961,83.52491,24.62257,74.08663,35.27001,90.95571,92.85199,18.76147
860,9.61260,5.85636,8.03812,16.08970,12.85816,69.88633,46.82327,80.31574,67.90777,72.23798,...,50.51472,5.57560,78.61007,24.20796,22.95974,98.12792,35.33916,11.36623,3.86262,33.73278


# 3.1

## 3.1 A
V tejto časti naimplementujeme jednoduchý ID3 klasifikátor.

# 3.2

## 3.2 A

Ako stromový algoritmus použijeme Random Forest.

In [69]:
# Keep the 8 features with the best ANOVA F-score, then fit a depth-limited random forest.
model_pipeline = Pipeline([
    ('feature_selection', SelectKBest(score_func=f_classif, k=8)),
    (
        'classifier',
        RandomForestClassifier(criterion='entropy', max_depth=8, random_state=0),
    ),
])

# Full chain: column preprocessing followed by feature selection + classifier.
final_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', model_pipeline),
])

In [70]:
final_pipeline.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = final_pipeline.predict(X_train)
y_pred = final_pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_report = classification_report(y_test, y_pred)

print("Accuracy(test):", test_accuracy)
print("Accuracy(train):", train_accuracy)
print("\nClassification Report:\n", test_report)

Accuracy(test): 0.8908212560386474
Accuracy(train): 0.9083514536522509

Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.82      0.85      1132
         1.0       0.90      0.93      0.92      1973

    accuracy                           0.89      3105
   macro avg       0.89      0.88      0.88      3105
weighted avg       0.89      0.89      0.89      3105



## 3.2 B

In [71]:
# Keep the 10 features with the best ANOVA F-score, then fit a logistic regression
# with scikit-learn's default settings.
model_pipeline = Pipeline([
    ('feature_selection', SelectKBest(score_func=f_classif, k=10)),
    ('classifier', LogisticRegression()),
])

# Full chain: column preprocessing followed by feature selection + classifier.
final_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', model_pipeline),
])

In [72]:
final_pipeline.fit(X_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_pred_train = final_pipeline.predict(X_train)
y_pred = final_pipeline.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
train_accuracy = accuracy_score(y_train, y_pred_train)
test_report = classification_report(y_test, y_pred)

print("Accuracy(test):", test_accuracy)
print("Accuracy(train):", train_accuracy)
print("\nClassification Report:\n", test_report)

Accuracy(test): 0.8933977455716586
Accuracy(train): 0.8969960537972135

Classification Report:
               precision    recall  f1-score   support

         0.0       0.87      0.84      0.85      1132
         1.0       0.91      0.93      0.92      1973

    accuracy                           0.89      3105
   macro avg       0.89      0.88      0.88      3105
weighted avg       0.89      0.89      0.89      3105

