# Autokeras

In [1]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.metrics import AUC, Precision, Recall

import autokeras as ak

2024-02-09 08:59:13.565882: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-09 08:59:13.611635: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-09 08:59:13.611669: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-09 08:59:13.612835: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-09 08:59:13.620164: I tensorflow/core/platform/cpu_feature_guar

Using TensorFlow backend


In [2]:
def read_csv_to_dataframe(csv_file_path, delimiter=","):
    """ """
    try:
        # Use pandas to read the CSV file
        dataframe = pd.read_csv(csv_file_path, delimiter=delimiter)
        return dataframe
    except Exception as e:
        print(f"An error occurred while reading the CSV file: {e}")
        return None

In [3]:
features_file_path = "interview.X.csv"
target_file_path = "interview.y.csv"

features_df = read_csv_to_dataframe(features_file_path)
target_df = read_csv_to_dataframe(target_file_path)

merged_df = features_df.merge(target_df, on="uid", how="left")

del features_df
del target_df

In [4]:
# Убрать все значения где tag == None
merged_df.dropna(subset=["tag"], inplace=True)
merged_df.shape

(15392, 11)

In [5]:
# Убираем tag и оставляем только столбец для классификации
merged_df["is_fclick"] = (merged_df["tag"] == "fclick").astype(int)
merged_df.drop("tag", axis=1, inplace=True)

In [6]:
merged_df.drop_duplicates(inplace=True)
merged_df.reset_index(drop=True, inplace=True)

In [7]:
merged_df.shape

(12054, 11)

In [8]:
X = merged_df.drop(columns=["uid", "is_fclick"])
y = merged_df["is_fclick"]
del merged_df

In [9]:
X["reg_time"] = pd.to_datetime(X["reg_time"])

X["year"] = X["reg_time"].dt.year
X["month"] = X["reg_time"].dt.month
X["day"] = X["reg_time"].dt.day
X["weekday"] = X["reg_time"].dt.weekday

X = X.drop("reg_time", axis=1)

In [10]:
X['site_id'] = X['site_id'].str.replace('www.', '')

  X['site_id'] = X['site_id'].str.replace('www.', '')


In [11]:
def encode_categories_to_int(dataframe, categoricals):
    """Encodes categorical features contained in `categoricals` list."""

    encoder = {}

    for col in categoricals:
        encoder[col] = {}
        categories = sorted(list(dataframe[col].dropna().astype(str).unique()), key=str)

        for idx, cat in enumerate(categories):
            encoder[col][cat] = idx

        dataframe[col] = dataframe[col].fillna(-1).apply(lambda val: encoder[col].get(str(val), -1))

    return encoder

def decode_int_to_original_category(encoder, dataframe, columns):
    """Decodes integer encoded categorical features."""

    decoded_cols = []

    for col in columns:
        temp_series = []

        for int_val in dataframe[col]:
            temp_series.append(encoder[col].get(int_val, 'Unknown'))

        decoded_col = pd.Series(temp_series, name=col)
        decoded_cols.append(decoded_col)

    decoded_df = pd.concat(decoded_cols, axis=1)
    return decoded_df

In [12]:
X

Unnamed: 0,fc_imp_chk,fc_time_chk,utmtr,mm_dma,osName,model,hardware,site_id,year,month,day,weekday
0,0,7,6,517,Android,SM-N960U,Mobile Phone,dotesports.com,2021,7,21,2
1,0,7,6,505,Android,Pixel 4a,Mobile Phone,online.seterra.com,2021,7,21,2
2,0,7,6,501,iOS,iPhone,Mobile Phone,hiphopdx.com,2021,7,21,2
3,0,7,6,517,Android,SM-G991U,Mobile Phone,vivaelbirdos.com,2021,7,21,2
4,0,7,6,517,Android,SM-N975U,Mobile Phone,landgrantholyland.com,2021,7,21,2
...,...,...,...,...,...,...,...,...,...,...,...,...
12049,0,7,5,556,iOS,iPhone,Mobile Phone,digitalspy.com,2021,8,6,4
12050,0,7,5,556,iOS,iPhone,Mobile Phone,digitalspy.com,2021,8,6,4
12051,1,6,5,0,Android,SM-G975U,Mobile Phone,digitalspy.com,2021,8,6,4
12052,3,6,5,567,iOS,iPhone,Mobile Phone,whatculture.com,2021,8,6,4


In [13]:
categoricals = ['osName', 'model', 'hardware', 'site_id']
encodings = encode_categories_to_int(X, categoricals)
encodings

{'osName': {'Android': 0,
  'Linux': 1,
  'Symbian': 2,
  'Windows 10': 3,
  'Windows 7': 4,
  'iOS': 5},
 'model': {'1930': 0,
  '3632A': 1,
  '5005R': 2,
  '5007Z': 3,
  '5032W': 4,
  '5049S': 5,
  '5062W': 6,
  '5062Z': 7,
  '7': 8,
  '7.2': 9,
  '7A': 10,
  'A3': 11,
  'A501DL': 12,
  'A502DL': 13,
  'A507DL': 14,
  'A574BL': 15,
  'A6003': 16,
  'A6013': 17,
  'ALP-L09': 18,
  'Alpha 20': 19,
  'Armor 9': 20,
  'BE2012': 21,
  'BE2015': 22,
  'BE2025': 23,
  'BE2028': 24,
  'C2 Tennen': 25,
  'CLT-L29': 26,
  'CP3705A': 27,
  'Chrome - Linux': 28,
  'Chrome - Windows': 29,
  'E6910': 30,
  'E71': 31,
  'EA1002': 32,
  'ELE-L29': 33,
  'EML-L29': 34,
  'Edge': 35,
  'Edge +': 36,
  'F1': 37,
  'G011A': 38,
  'G011C': 39,
  'G013C': 40,
  'G61': 41,
  'G710': 42,
  'GM1900': 43,
  'GM1901': 44,
  'GM1915': 45,
  'GM1917': 46,
  'Generic Android Mobile': 47,
  'Generic Android Tablet': 48,
  'H8314': 49,
  'HD1905': 50,
  'HD1907': 51,
  'HD1910': 52,
  'HD1925': 53,
  'HMA-L29': 54,

In [14]:
scaler = MinMaxScaler()
X[["fc_imp_chk", "fc_time_chk", "utmtr", "mm_dma"]] = scaler.fit_transform(
    X[["fc_imp_chk", "fc_time_chk", "utmtr", "mm_dma"]]
)

In [15]:
from imblearn.over_sampling import ADASYN

# Предположим, у вас есть матрица признаков X и вектор целевых значений y
# X - это ваши признаки (например, числовые или категориальные)
# y - это целевая переменная (например, бинарная классификация: 0 или 1)

# Создаем экземпляр ADASYN
adasyn = ADASYN(random_state=42)

# Применяем ADASYN к данным
X_resampled, y_resampled = adasyn.fit_resample(X, y)

# Теперь X_resampled и y_resampled содержат сбалансированные данные
# Вы можете использовать их для обучения модели машинного обучения
X_resampled.shape, y_resampled.shape

((16082, 12), (16082,))

In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.3, random_state=42, shuffle=True
)

In [17]:
X

Unnamed: 0,fc_imp_chk,fc_time_chk,utmtr,mm_dma,osName,model,hardware,site_id,year,month,day,weekday
0,0.00,1.0,0.857143,0.586833,0,286,2,340,2021,7,21,2
1,0.00,1.0,0.857143,0.573212,0,158,2,913,2021,7,21,2
2,0.00,1.0,0.857143,0.568672,5,341,2,548,2021,7,21,2
3,0.00,1.0,0.857143,0.586833,0,258,2,1340,2021,7,21,2
4,0.00,1.0,0.857143,0.586833,0,290,2,677,2021,7,21,2
...,...,...,...,...,...,...,...,...,...,...,...,...
12049,0.00,1.0,0.714286,0.631101,5,341,2,321,2021,8,6,4
12050,0.00,1.0,0.714286,0.631101,5,341,2,321,2021,8,6,4
12051,0.25,0.0,0.714286,0.000000,0,247,2,321,2021,8,6,4
12052,0.75,0.0,0.714286,0.643587,5,341,2,1372,2021,8,6,4


In [18]:
clf = ak.StructuredDataClassifier(
    column_names=[
        "fc_imp_chk",
        "fc_time_chk",
        "utmtr",
        "mm_dma",
        "osName",
        "hardware",
        "model",
        "site_id",
        "year",
        "month",
        "day",
        "weekday",
    ],
    column_types={
        "fc_imp_chk":"numerical",
        "fc_time_chk":"numerical",
        "utmtr":"numerical",
        "mm_dma":"numerical",
        "osName":"numerical",
        "hardware": "numerical",
        "model":"numerical",
        "site_id": "numerical",
        "year":"numerical",
        "month":"numerical",
        "day":"numerical",
        "weekday":"numerical"
    },
    max_trials=50,
    overwrite=False,
    tuner="greedy",
)

Reloading Tuner from ./structured_data_classifier/tuner0.json


In [19]:
clf.fit(X_train, y_train, epochs=100, validation_data=(X_test, y_test))

2024-02-09 08:59:18.918161: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-09 08:59:18.967635: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-02-09 08:59:18.971307: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:901] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-

Epoch 1/100


2024-02-09 08:59:24.647225: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f6bab009b30 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-09 08:59:24.647262: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2024-02-09 08:59:24.653450: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-02-09 08:59:24.673181: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
I0000 00:00:1707469164.769392  113853 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

INFO:tensorflow:Assets written to: ./structured_data_classifier/best_model/assets


<keras.src.callbacks.History at 0x7f6bd9f01720>

In [20]:
predictions = clf.predict(X_test)
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.64      0.68      2456
           1       0.67      0.75      0.71      2369

    accuracy                           0.69      4825
   macro avg       0.70      0.70      0.69      4825
weighted avg       0.70      0.69      0.69      4825



In [21]:
model = clf.export_model()
model.save("model_autokeras", save_format="tf")

INFO:tensorflow:Assets written to: model_autokeras/assets


INFO:tensorflow:Assets written to: model_autokeras/assets


## Catboost

In [23]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Initialize CatBoostClassifier
model = CatBoostClassifier(
    iterations=50000, learning_rate=0.1, depth=6, loss_function="Logloss", verbose=1000
)

# Fit model
model.fit(X_train, y_train)

# Make predictions
predictions = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

# Save model
model.save_model("catboost_classifier.model")

0:	learn: 0.6770006	total: 4.8ms	remaining: 4m
1000:	learn: 0.3206980	total: 4.06s	remaining: 3m 18s
2000:	learn: 0.2806091	total: 8.25s	remaining: 3m 18s
3000:	learn: 0.2583342	total: 12.3s	remaining: 3m 12s
4000:	learn: 0.2440839	total: 16.4s	remaining: 3m 8s
5000:	learn: 0.2337920	total: 20.5s	remaining: 3m 4s
6000:	learn: 0.2263383	total: 24.6s	remaining: 3m
7000:	learn: 0.2207898	total: 28.8s	remaining: 2m 56s
8000:	learn: 0.2163618	total: 32.9s	remaining: 2m 52s
9000:	learn: 0.2128685	total: 37.1s	remaining: 2m 49s
10000:	learn: 0.2099961	total: 41.1s	remaining: 2m 44s
11000:	learn: 0.2075583	total: 45.1s	remaining: 2m 40s
12000:	learn: 0.2054234	total: 49.2s	remaining: 2m 35s
13000:	learn: 0.2037671	total: 53.1s	remaining: 2m 31s
14000:	learn: 0.2023012	total: 57s	remaining: 2m 26s
15000:	learn: 0.2010717	total: 1m	remaining: 2m 22s
16000:	learn: 0.2000215	total: 1m 4s	remaining: 2m 17s
17000:	learn: 0.1991814	total: 1m 8s	remaining: 2m 13s
18000:	learn: 0.1984288	total: 1m 12s	