In [21]:
import pandas as ps
import numpy as np
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Single seed reused for the NumPy shuffle of the data and for the
# random_state argument of the estimators created in this notebook.
RANDOM_SEED = 142

In [22]:
# Load the table of pairwise features; the path is relative to the notebook's
# working directory.
data = ps.read_csv(filepath_or_buffer="data/table-cluster-le25.csv")

In [23]:
# Preview only the first rows instead of rendering the whole frame.
data.head(10)

Unnamed: 0,id1,id2,NAME,A,ADV,ADVPRO,ANUM,APRO,COM,CONJ,INTJ,NUM,PART,PR,S,SPRO,V,is_dup
0,3-52-ZCE391B1DE09F8C63,3-52-ZD4737DFB4F69CE03,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0
1,3-45-ZB8AD126A3E474428,3-45-ZE6427F3B9B3CEDF4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3-45-Z524AC620A947C1E3,3-45-ZB8AD126A3E474428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,3-45-Z25D451AFE808F357,3-45-ZB8AD126A3E474428,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,1.0
4,3-45-ZB8AD126A3E474428,3-45-ZC3DE99C24D879239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0
5,3-45-Z215ADCF3196E929C,3-45-ZB8AD126A3E474428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,0.0
6,3-45-Z5FA182AACD4BA4ED,3-45-ZB8AD126A3E474428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
7,3-45-Z252BA9B11344CBC2,3-45-ZB8AD126A3E474428,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
8,3-45-ZA83686D8AB8A5F2D,3-45-ZB8AD126A3E474428,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
9,3-45-ZB8AD126A3E474428,3-45-ZE45222EE37072AB7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,1.0,0.0


In [24]:
# Shuffle the rows with a fixed seed so the split below is repeatable.
# NOTE(review): `data` is overwritten with its own permutation, so re-running
# this cell alone permutes the already shuffled frame again.
np.random.seed(RANDOM_SEED)
row_order = np.random.permutation(data.shape[0])
data = data.iloc[row_order]

## Разделение на обучающую и тестовую выборки

In [25]:
# Separate the rows by label: is_dup == 1 are positives, is_dup == 0 negatives.
all_pos = data.loc[data["is_dup"] == 1]
all_neg = data.loc[data["is_dup"] == 0]

# The first 30% of the positives go to training, together with the same number
# of negatives, so the training set has equally many rows of each class.
train_pos = all_pos.head(int(len(all_pos) * 0.3))
train_neg = all_neg.head(len(train_pos))
train = ps.concat([train_pos, train_neg], axis=0)
print("Размер обучающей выборки:", len(train))

Размер обучающей выборки: 10610


In [29]:
# Every positive that was not taken for training goes to the test set.
test_pos = all_pos.iloc[train_pos.shape[0]:]
# Negatives: keep the first 200000, then skip the ones already used for training.
test_neg = all_neg.head(200000).iloc[train_neg.shape[0]:]
test = ps.concat([test_pos, test_neg], axis=0)
print("Размер тестовой выборки:", len(test))

Размер тестовой выборки: 207075


## Обучение классификатора

In [30]:
# Feature matrices: drop the pair identifiers and the target column.
# "Unnamed: 0" is dropped as well - it is the nameless first CSV column that
# holds a running row number (0, 1, 2, ... in the preview), not a property of
# the pair, so it must not be fed to the classifiers. errors="ignore" keeps
# the cell working if the file is ever loaded without that column.
train_X = (train.drop(["id1", "id2", "is_dup"], axis=1)
                .drop(["Unnamed: 0"], axis=1, errors="ignore"))
train_y = train["is_dup"]
test_X = (test.drop(["id1", "id2", "is_dup"], axis=1)
              .drop(["Unnamed: 0"], axis=1, errors="ignore"))
test_y = test["is_dup"]

In [31]:
# Polynomial-kernel SVM (degree 6, C = 0.7) fitted on the training split and
# scored on the test split.
model = SVC(kernel='poly', degree=6, C=0.7, random_state=RANDOM_SEED)
test_p = model.fit(train_X, train_y).predict(test_X)
print(classification_report(y_true=test_y, y_pred=test_p, digits=4))

             precision    recall  f1-score   support

        0.0     0.9801    0.9523    0.9660    194695
        1.0     0.4815    0.6965    0.5694     12380

avg / total     0.9503    0.9370    0.9423    207075



In [11]:
# Same polynomial-kernel SVM with a much smaller C (0.05).
# NOTE(review): the report recorded under this cell lists different class
# support than the report of the C = 0.7 cell, so it was produced on a
# different train/test split - re-run before comparing the two.
model = SVC(kernel='poly', degree=6, C=0.05, random_state=RANDOM_SEED)
test_p = model.fit(train_X, train_y).predict(test_X)
print(classification_report(y_true=test_y, y_pred=test_p, digits=4))

             precision    recall  f1-score   support

        0.0     0.9757    0.9732    0.9744    193499
        1.0     0.6687    0.6908    0.6796     15170

avg / total     0.9534    0.9526    0.9530    208669



## Сравнение классификаторов

In [28]:
def all_models(seed = RANDOM_SEED):
    '''
        Build every candidate model configuration that is to be compared.

        Arguments:
            seed - pseudo-random generator seed, passed as random_state to
                   the estimators that accept one (KNeighborsClassifier is
                   created without it)
        Returns: list of dicts, one per configuration, each holding
                 "model" (an unfitted estimator) and "txt" (a readable label
                 of its hyper-parameters)
    '''
    models = []

    # Logistic regression: both penalties, tol = 0.01 .. 0.30.
    for pen in ("l1", "l2"):
        for hundredths in range(1, 31):
            # Dividing (instead of multiplying by 0.01) yields the closest
            # float to the intended decimal, so labels stay short.
            tol = hundredths / 100
            # liblinear is named explicitly because it handles both "l1" and
            # "l2"; the current default solver rejects the "l1" penalty.
            models.append({"model": LogisticRegression(penalty=pen, tol=tol,
                                                       solver="liblinear",
                                                       random_state=seed),
                "txt": "LogReg (pen: {}, tol: {})".format(pen, tol)})

    # SVM: C = 0.1 .. 2.0 for each of the four kernels.
    for tenths in range(1, 21):
        C = tenths / 10
        for kernel in ('linear', 'poly', 'rbf', 'sigmoid'):
            models.append({"model": SVC(C=C, kernel=kernel, random_state=seed),
                "txt": "SVC (C: {}, kernel: {})".format(C, kernel)})

    # k-nearest neighbours: both weighting schemes, k = 1, 4, ..., 37.
    # Hyper-parameters are passed by keyword: recent scikit-learn releases
    # accept them only that way.
    for weights in ("uniform", "distance"):
        for n in range(1, 40, 3):
            models.append({"model": KNeighborsClassifier(n_neighbors=n, weights=weights),
                "txt": "KNeighbors (n: {}, weights: {})".format(n, weights)})

    # Random forest: 3 .. 60 trees, depth 1 .. 7 or unlimited, both criteria.
    for n in range(3, 61, 3):
        for depth in tuple(range(1, 8)) + (None,):
            for crit in ("gini", "entropy"):
                models.append({"model": RandomForestClassifier(n_estimators=n,
                                                               criterion=crit,
                                                               max_depth=depth,
                                                               random_state=seed),
                    "txt": "RFC (n: {}, crit: {}, depth: {})".format(
                        n, crit, depth)})
    return models

In [34]:
models = all_models()
for mdl in models:
    mdl["model"].fit(train_X, train_y)
    test_p = mdl["model"].predict(test_X)
    res = ps.DataFrame({"y": test_y, "p": test_p}, index=None)

    # Precision: share of true duplicates among the rows predicted as 1.
    # Recall: share of rows predicted as 1 among the true duplicates.
    prec = res.loc[res["p"] == 1, "y"].mean()
    recall = res.loc[res["y"] == 1, "p"].mean()

    # The mean of an empty selection is NaN (e.g. a model that never predicts
    # class 1). Score such a model as 0 instead, otherwise NaN propagates
    # into f1 and makes any later comparison of f1 values unreliable.
    mdl["prec"] = 0.0 if np.isnan(prec) else prec
    mdl["recall"] = 0.0 if np.isnan(recall) else recall
    if mdl["prec"] > 0 and mdl["recall"] > 0:
        # Harmonic mean of precision and recall.
        mdl["f1"] = 2/(1/mdl["prec"] + 1/mdl["recall"])
    else:
        mdl["f1"] = 0.0

    print("{}: f1 {:.4f}, prec {:.4f}, recall {:.4f}".format(
        mdl["txt"], mdl["f1"], mdl["prec"], mdl["recall"]))

LogReg (pen: l1, tol: 0.01): f1 0.6227, prec 0.4758, recall 0.9010
LogReg (pen: l1, tol: 0.02): f1 0.6192, prec 0.4702, recall 0.9061
LogReg (pen: l1, tol: 0.03): f1 0.6147, prec 0.4651, recall 0.9063
LogReg (pen: l1, tol: 0.04): f1 0.6147, prec 0.4651, recall 0.9063
LogReg (pen: l1, tol: 0.05): f1 0.6147, prec 0.4651, recall 0.9063
LogReg (pen: l1, tol: 0.06): f1 0.5878, prec 0.4316, recall 0.9211
LogReg (pen: l1, tol: 0.07): f1 0.5878, prec 0.4316, recall 0.9211
LogReg (pen: l1, tol: 0.08): f1 0.6050, prec 0.4541, recall 0.9061
LogReg (pen: l1, tol: 0.09): f1 0.6050, prec 0.4541, recall 0.9061
LogReg (pen: l1, tol: 0.1): f1 0.6050, prec 0.4541, recall 0.9061
LogReg (pen: l1, tol: 0.11): f1 0.6050, prec 0.4541, recall 0.9061
LogReg (pen: l1, tol: 0.12): f1 0.6050, prec 0.4541, recall 0.9061
LogReg (pen: l1, tol: 0.13): f1 0.6050, prec 0.4541, recall 0.9061
LogReg (pen: l1, tol: 0.14): f1 0.6050, prec 0.4541, recall 0.9061
LogReg (pen: l1, tol: 0.15): f1 0.6050, prec 0.4541, recall 0.9

In [35]:
# For every estimator class pick the configuration with the highest f1.
classes = set(mdl["model"].__class__ for mdl in models)
# A set of classes has no stable iteration order between kernel sessions, so
# iterate in class-name order to keep the printed report reproducible.
for cls in sorted(classes, key=lambda c: c.__name__):
    best = max([mdl for mdl in models if mdl["model"].__class__ == cls],
        key=lambda x: x["f1"])
    print("Лучшая модель: {} (f1 {:.4f}, prec {:.4f}, recall {:.4f})".format(
        best["txt"], best["f1"], best["prec"], best["recall"]))

Лучшая модель: RFC (n: 24, crit: gini, depth: 1) (f1 0.5963, prec 0.4612, recall 0.8433)
Лучшая модель: SVC (C: 0.7000000000000001, kernel: poly) (f1 0.6642, prec 0.5404, recall 0.8618)
Лучшая модель: KNeighbors (n: 34, wieghts: uniform) (f1 0.6242, prec 0.4786, recall 0.8974)
Лучшая модель: LogReg (pen: l2, tol: 0.03) (f1 0.6263, prec 0.4808, recall 0.8981)


In [37]:
# Sweep the polynomial degree (1..9) of the SVM at C = 0.7 and print
# f1 / precision / recall of class 1 on the test split for each degree.
for degree in range(1, 10):
    mdl = dict(
        model=SVC(C=0.7, kernel="poly", degree=degree, random_state=RANDOM_SEED),
        txt="SVC (C: 0.7, kernel: poly, degree: {})".format(degree),
    )

    test_p = mdl["model"].fit(train_X, train_y).predict(test_X)
    res = ps.DataFrame({"y": test_y, "p": test_p}, index=None)

    # Precision: mean true label over rows predicted as 1;
    # recall: mean prediction over rows whose true label is 1.
    mdl["prec"] = res.loc[res["p"] == 1, "y"].mean()
    mdl["recall"] = res.loc[res["y"] == 1, "p"].mean()
    mdl["f1"] = 2/(1/mdl["prec"] + 1/mdl["recall"])

    print("{}: f1 {:.4f}, prec {:.4f}, recall {:.4f}".format(
        *(mdl[key] for key in ("txt", "f1", "prec", "recall"))))

SVC (C: 0.7, kernel: poly, degree: 1): f1 0.6099, prec 0.4576, recall 0.9139
SVC (C: 0.7, kernel: poly, degree: 2): f1 0.6386, prec 0.4963, recall 0.8955
SVC (C: 0.7, kernel: poly, degree: 3): f1 0.6642, prec 0.5404, recall 0.8618
SVC (C: 0.7, kernel: poly, degree: 4): f1 0.6772, prec 0.5706, recall 0.8329
SVC (C: 0.7, kernel: poly, degree: 5): f1 0.6822, prec 0.5983, recall 0.7935
SVC (C: 0.7, kernel: poly, degree: 6): f1 0.6841, prec 0.6190, recall 0.7644
SVC (C: 0.7, kernel: poly, degree: 7): f1 0.6822, prec 0.6338, recall 0.7386
SVC (C: 0.7, kernel: poly, degree: 8): f1 0.6809, prec 0.6443, recall 0.7219
SVC (C: 0.7, kernel: poly, degree: 9): f1 0.6778, prec 0.6517, recall 0.7061


In [39]:
# Repeat the degree sweep (3..8) with a ten times larger C (7) and print
# f1 / precision / recall of class 1 on the test split for each degree.
for degree in range(3, 9):
    mdl = dict(
        model=SVC(C=7, kernel="poly", degree=degree, random_state=RANDOM_SEED),
        txt="SVC (C: 7, kernel: poly, degree: {})".format(degree),
    )

    test_p = mdl["model"].fit(train_X, train_y).predict(test_X)
    res = ps.DataFrame({"y": test_y, "p": test_p}, index=None)

    # Precision: mean true label over rows predicted as 1;
    # recall: mean prediction over rows whose true label is 1.
    mdl["prec"] = res.loc[res["p"] == 1, "y"].mean()
    mdl["recall"] = res.loc[res["y"] == 1, "p"].mean()
    mdl["f1"] = 2/(1/mdl["prec"] + 1/mdl["recall"])

    print("{}: f1 {:.4f}, prec {:.4f}, recall {:.4f}".format(
        *(mdl[key] for key in ("txt", "f1", "prec", "recall"))))

SVC (C: 7, kernel: poly, degree: 3): f1 0.6557, prec 0.5259, recall 0.8705
SVC (C: 7, kernel: poly, degree: 4): f1 0.6679, prec 0.5524, recall 0.8446
SVC (C: 7, kernel: poly, degree: 5): f1 0.6733, prec 0.5735, recall 0.8152
SVC (C: 7, kernel: poly, degree: 6): f1 0.6763, prec 0.5914, recall 0.7898
SVC (C: 7, kernel: poly, degree: 7): f1 0.6741, prec 0.5988, recall 0.7711
SVC (C: 7, kernel: poly, degree: 8): f1 0.6730, prec 0.6134, recall 0.7454
