# Импорт библиотек

In [1]:
import os
import joblib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats import pointbiserialr
from lightgbm import LGBMClassifier

In [2]:
# Single seed reused by the train/test split and by every model below.
RANDOM_STATE = 42

# Загрузка данных

In [3]:
# Dataset location: <parent directory>/data/TUANDROMD.csv.
data_path = os.path.join(
    os.path.join(os.pardir, "data"),
    "TUANDROMD.csv",
)

In [4]:
# Read the raw CSV into a DataFrame.
data = pd.read_csv(filepath_or_buffer=data_path)

# EDA

In [5]:
# Preview the first five rows.
data.head(n=5)

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
# Summary of the frame: row count, column count, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4465 entries, 0 to 4464
Columns: 242 entries, ACCESS_ALL_DOWNLOADS to Label
dtypes: float64(242)
memory usage: 8.2 MB


В датасете 242 бинарные колонки

In [7]:
# Show every row that has at least one missing value.
data[data.isna().any(axis=1)]

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
2533,,,,,,,,,,,...,,,,,,,,,,


1 строка с NaN - удалим её

In [8]:
# Drop rows that contain any missing value.
data = data.dropna(axis=0, how="any")

In [9]:
# Count rows that exactly repeat an earlier row.
data.duplicated(keep="first").sum()

3802

In [10]:
# Smallest of the per-column maxima (0.0 means some column never takes the value 1).
data.max().min()

0.0

In [11]:
# Largest of the per-column maxima (1.0 means no column exceeds 1).
data.max().max()

1.0

Мы видим 3802 полных дубликата, однако они могут быть оправданы особенностями данных, которые состоят только из бинарных колонок

## Таргет

In [12]:
# Class balance of the target column.
data.loc[:, "Label"].value_counts()

Label
1.0    3565
0.0     899
Name: count, dtype: int64

Видим, что присутствует дисбаланс классов, будем учитывать это при выборе метрик и лучшей модели

### Корреляции

Проверим, какие колонки коррелируют с таргетом больше всего. Учитывая, что колонки бинарные, будем использовать точечно-бисериальный коэффициент корреляции (`scipy.stats.pointbiserialr`)

In [13]:
# Cast the target to integer class labels before correlating.
data['Label'] = data['Label'].astype(int)

# Point-biserial correlation of every feature column with the target;
# only the coefficient is kept, the p-value is discarded.
correlations = {
    column: pointbiserialr(data[column], data['Label'])[0]
    for column in data.columns
    if column != 'Label'
}

# One row per feature, ranked by the magnitude of the correlation.
correlation_df = pd.DataFrame.from_dict(
    correlations,
    orient='index',
    columns=['correlation'],
)
correlation_df['abs_correlation'] = correlation_df['correlation'].abs()

ranked_correlations = correlation_df.sort_values('abs_correlation', ascending=False)
top_correlations = ranked_correlations.head(10)



In [14]:
# Ten features with the largest absolute correlation with the target.
top_correlations

Unnamed: 0,correlation,abs_correlation
RECEIVE_BOOT_COMPLETED,0.760417,0.760417
Ljava/net/URL;->openConnection,-0.712211,0.712211
Landroid/location/LocationManager;->getLastK0wnLocation,-0.653875,0.653875
GET_TASKS,0.563214,0.563214
Ljava/lang/System;->load,-0.486265,0.486265
WAKE_LOCK,0.472105,0.472105
Ljava/lang/System;->loadLibrary,-0.47079,0.47079
Ldalvik/system/DexClassLoader;->loadClass,-0.456847,0.456847
KILL_BACKGROUND_PROCESSES,0.438312,0.438312
Landroid/telephony/TelephonyManager;->getSimOperatorName,-0.396342,0.396342


# Train test split

In [15]:
# Fraction of rows reserved for the test set (same 0.2 the split below uses).
test_size = 0.2

In [16]:
# Hold out `test_size` of the rows for evaluation. The fraction comes from the
# `test_size` variable defined above rather than a duplicated literal, so
# changing it in one place actually changes the split.
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider stratify=data["Label"] (this would change the recorded outputs).
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["Label"]),
    data["Label"],
    test_size=test_size,
    shuffle=True,
    random_state=RANDOM_STATE,
)

# Modeling

Возьмем 3 различных модели:
- Логистическая регрессия
- Случайный лес
- Градиентный бустинг (в реализации LightGBM)

Построим их с дефолтными параметрами и посмотрим на метрики

## Logistic Regression

In [17]:
# Logistic regression with library defaults; only the seed is pinned.
log_reg = LogisticRegression(random_state=RANDOM_STATE).fit(X_train, y_train)
log_reg

## Random Forest

In [18]:
# Random forest with library defaults; only the seed is pinned.
rand_forest = RandomForestClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)
rand_forest

## Gradient Boosting (LGBM)

In [19]:
# LightGBM gradient boosting with library defaults; only the seed is pinned.
lgbm = LGBMClassifier(random_state=RANDOM_STATE).fit(X_train, y_train)
lgbm

[LightGBM] [Info] Number of positive: 2846, number of negative: 725
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001364 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 3571, number of used features: 180
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.796976 -> initscore=1.367498
[LightGBM] [Info] Start training from score 1.367498


## Сравнение моделей

In [20]:
# Fitted estimators to compare, keyed by display name.
models = {
    'Logistic Regression': log_reg,
    'Random Forest': rand_forest,
    'LightGBM': lgbm,
}
model_performance = {}

# Score each model on the held-out test set.
for model_name, model in models.items():
    report = classification_report(y_test, model.predict(X_test), output_dict=True)
    weighted = report['weighted avg']
    # The second probability column is passed to ROC AUC as the score.
    positive_proba = model.predict_proba(X_test)[:, 1]
    model_performance[model_name] = {
        'Accuracy': report['accuracy'],
        'Precision': weighted['precision'],
        'Recall': weighted['recall'],
        'F1-Score': weighted['f1-score'],
        'AUC-ROC': roc_auc_score(y_test, positive_proba),
    }

# One row per model, one column per metric.
model_performance_df = pd.DataFrame(
    data=list(model_performance.values()),
    index=list(model_performance.keys()),
)

In [21]:
# Display the metrics table built in the previous cell instead of rebuilding it.
model_performance_df

Unnamed: 0,Accuracy,Precision,Recall,F1-Score,AUC-ROC
Logistic Regression,0.984323,0.984658,0.984323,0.984423,0.999061
Random Forest,0.994401,0.994393,0.994401,0.994395,0.999656
LightGBM,0.992161,0.992145,0.992161,0.992135,0.999496


Видим, что метрики всех моделей очень хорошие (несмотря на дисбаланс классов), что вызывает подозрения на утечку данных (data leak). Однако без описания данных проверить это не сможем

Всё же чуть лучше метрики у модели Random Forest. Посмотрим, какие признаки больше всего влияют на прогноз

In [22]:
# Pull feature names and their importances from the fitted random forest.
feature_names = rand_forest.feature_names_in_
feature_importances = rand_forest.feature_importances_

# Pair each feature with its importance and keep the ten most important.
features = pd.DataFrame(
    {
        'Feature': feature_names,
        'Importance': feature_importances,
    }
)
ranked_features = features.sort_values(by='Importance', ascending=False)
top_features = ranked_features.head(10)

In [23]:
# Ten features with the highest random-forest importance.
top_features

Unnamed: 0,Feature,Importance
149,RECEIVE_BOOT_COMPLETED,0.154621
220,Ljava/net/URL;->openConnection,0.095576
228,Landroid/location/LocationManager;->getLastK0w...,0.072026
86,GET_TASKS,0.060979
191,WAKE_LOCK,0.058397
100,KILL_BACKGROUND_PROCESSES,0.03784
217,Ljava/lang/System;->load,0.032456
138,READ_PHONE_STATE,0.029641
219,Ljava/lang/System;->loadLibrary,0.023947
218,Ldalvik/system/DexClassLoader;->loadClass,0.022712


Признаки похожи на те, что были в топе по корреляции. Особенно выделяется признак RECEIVE_BOOT_COMPLETED

Сохраним модели с помощью joblib

In [24]:
# Persist each fitted model under its own file name.
joblib.dump(log_reg, "logreg.joblib")
joblib.dump(rand_forest, "rf.joblib")
# Fixed: this previously dumped `rand_forest` again, so "lgbm.joblib" held a
# second copy of the random forest and the LightGBM model was never saved.
joblib.dump(lgbm, "lgbm.joblib")

['lgbm.joblib']