In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv('train.csv')
data = data[list(data.columns[:1]) + list(data.columns[2:]) + list(data.columns[1:2])]
data.head()

Unnamed: 0,id,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216,label
0,1,7,0,3,0,2,3,0,6,0,...,4,2,2,0,13,0,11,1,3,1
1,2,0,11,0,0,10,1,0,0,4,...,2,0,0,2,8,1,13,0,4,1
2,3,9,0,3,0,1,3,0,4,0,...,11,2,0,0,4,0,2,0,0,0
3,4,0,9,3,2,25,0,4,0,0,...,14,1,0,0,0,3,0,17,1,0
4,5,0,0,0,0,2,5,0,0,0,...,12,0,3,0,4,0,24,4,0,0


Не забудем нормализовать данные - это важно для линейных моделей.

In [3]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1:]
X = (X - X.mean()) / (X.max() - X.min())
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

Сначала я решил сделать логистическую регрессию.

Чтобы она работала получше, я стал выделять главные фичи с помощью CatBoost.

In [211]:
clf = CatBoostClassifier(random_seed=42)
clf = clf.fit(X_train, y_train, verbose=False)

In [212]:
best_features = np.where(np.array(clf.feature_importances_) >= 0.1)[0]

In [213]:
X_train = X_train.iloc[:,best_features]
X_val = X_val.iloc[:,best_features]

In [214]:
ones = np.where(np.array(y_train['label']) == 1)[0]
zeros = np.where(np.array(y_train['label']) == 0)[0]

In [215]:
pairs = []
for i in range(50):
    picked_ones = np.random.choice(len(ones), min(len(ones), len(zeros)))
    picked_zeros = np.random.choice(len(zeros), min(len(ones), len(zeros)))
    picked_ones = np.append(picked_ones, picked_zeros)
    pairs.append(picked_ones)
pairs = np.array(pairs)

In [216]:
predictions = []
for i in range(50):
    logreg = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=-1, random_state=42)
    logreg.fit(X_train.iloc[pairs[i],], np.array(y_train.iloc[pairs[i],]).ravel())
    predictions.append(logreg.predict(X_val))

In [217]:
prediction = []
for i in np.sum(predictions, axis=0):
    if i >= 25:
        prediction.append(1)
    else:
        prediction.append(0)

In [218]:
accuracy_score(prediction, y_val)

0.8844444444444445

In [223]:
X_test = pd.read_csv('test.csv')
X_test = (X_test - X_test.mean()) / (X_test.max() - X_test.min())
X_test = X_test.iloc[:,best_features]
X_test.head()

Unnamed: 0,id,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
0,-0.5,0.099432,-0.034105,-0.006592,-0.072983,0.009825,-0.050858,0.002548,-0.052363,-0.032955,...,-0.006671,-0.031845,0.056615,-0.009628,-0.053083,-0.029066,0.002257,-0.075729,-0.04611,-0.039003
1,-0.499889,0.051813,-0.016561,-0.059224,-0.072983,-0.054005,-0.050858,-0.059952,0.106728,0.036011,...,0.076662,-0.088988,-0.043385,-0.031367,0.013584,0.143773,-0.033457,0.168457,-0.04611,-0.097827
2,-0.499779,-0.091044,-0.069193,-0.059224,0.127017,-0.032728,0.107037,-0.059952,-0.07509,-0.067438,...,-0.027505,0.225298,-0.043385,-0.031367,-0.053083,-0.10314,-0.033457,-0.087357,-0.0705,-0.078219
3,-0.499668,-0.091044,-0.069193,-0.059224,-0.072983,-0.075282,-0.103489,-0.059952,0.061273,-0.067438,...,0.097495,-0.046131,-0.043385,-0.031367,-0.053083,0.057354,-0.033457,0.063806,-0.0705,-0.039003
4,-0.499558,-0.043425,-0.034105,0.019724,-0.072983,-0.032728,-0.103489,-0.059952,0.197637,-0.067438,...,-0.048338,-0.046131,-0.043385,0.012111,-0.053083,0.131428,-0.033457,0.052178,-0.0705,0.039428


In [224]:
predictions = []
for i in range(50):
    logreg = LogisticRegressionCV(cv=5, solver='liblinear', n_jobs=-1, random_state=42)
    logreg.fit(X_train.iloc[pairs[i],], np.array(y_train.iloc[pairs[i],]).ravel())
    predictions.append(logreg.predict(X_test))

In [225]:
prediction = []
for i in np.sum(predictions, axis=0):
    if i >= 25:
        prediction.append(1)
    else:
        prediction.append(0)

In [226]:
csv = pd.DataFrame()
csv["id"] = range(1, 9051)
csv["label"] = pd.DataFrame(prediction).astype(int)
csv.to_csv('logreg_02_25.csv', index=None)

Score в соревновании вышел так себе - надо что-то другое.

Теперь я решил попробовать SVM.

In [50]:
data = pd.read_csv('train.csv')
data = data[list(data.columns[:1]) + list(data.columns[2:]) + list(data.columns[1:2])]
data.head()

Unnamed: 0,id,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216,label
0,1,7,0,3,0,2,3,0,6,0,...,4,2,2,0,13,0,11,1,3,1
1,2,0,11,0,0,10,1,0,0,4,...,2,0,0,2,8,1,13,0,4,1
2,3,9,0,3,0,1,3,0,4,0,...,11,2,0,0,4,0,2,0,0,0
3,4,0,9,3,2,25,0,4,0,0,...,14,1,0,0,0,3,0,17,1,0
4,5,0,0,0,0,2,5,0,0,0,...,12,0,3,0,4,0,24,4,0,0


In [51]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1:]
X = (X - X.mean()) / (X.max() - X.min())
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

In [52]:
parameters = {'C': np.logspace(-4, 2, 20),
              'gamma': np.logspace(-4, 2, 20)}
clf = GridSearchCV(SVC(class_weight='balanced'), parameters, scoring='accuracy')
clf = clf.fit(X_train, np.array(y_train).ravel())
accuracy_score(y_val, clf.predict(X_val))

0.8933333333333333

In [53]:
clf.best_params_

{'C': 23.357214690901213, 'gamma': 0.14384498882876628}

In [54]:
X_test = pd.read_csv('test.csv')
X_test = (X_test - X_test.mean()) / (X_test.max() - X_test.min())
X_test.head()

Unnamed: 0,id,feat1,feat2,feat3,feat4,feat5,feat6,feat7,feat8,feat9,...,feat207,feat208,feat209,feat210,feat211,feat212,feat213,feat214,feat215,feat216
0,-0.5,0.099432,-0.034105,-0.006592,-0.072983,0.009825,-0.050858,0.002548,-0.052363,-0.032955,...,-0.006671,-0.031845,0.056615,-0.009628,-0.053083,-0.029066,0.002257,-0.075729,-0.04611,-0.039003
1,-0.499889,0.051813,-0.016561,-0.059224,-0.072983,-0.054005,-0.050858,-0.059952,0.106728,0.036011,...,0.076662,-0.088988,-0.043385,-0.031367,0.013584,0.143773,-0.033457,0.168457,-0.04611,-0.097827
2,-0.499779,-0.091044,-0.069193,-0.059224,0.127017,-0.032728,0.107037,-0.059952,-0.07509,-0.067438,...,-0.027505,0.225298,-0.043385,-0.031367,-0.053083,-0.10314,-0.033457,-0.087357,-0.0705,-0.078219
3,-0.499668,-0.091044,-0.069193,-0.059224,-0.072983,-0.075282,-0.103489,-0.059952,0.061273,-0.067438,...,0.097495,-0.046131,-0.043385,-0.031367,-0.053083,0.057354,-0.033457,0.063806,-0.0705,-0.039003
4,-0.499558,-0.043425,-0.034105,0.019724,-0.072983,-0.032728,-0.103489,-0.059952,0.197637,-0.067438,...,-0.048338,-0.046131,-0.043385,0.012111,-0.053083,0.131428,-0.033457,0.052178,-0.0705,0.039428


In [55]:
clf = SVC(class_weight='balanced', C=23.357214690901213, gamma=0.14384498882876628)
clf = clf.fit(X_train, np.array(y_train).ravel())
prediction = clf.predict(X_test)

In [56]:
csv = pd.DataFrame()
csv["id"] = range(1, 9051)
csv["label"] = pd.DataFrame(prediction).astype(int)
csv.to_csv('svm_11_17.csv', index=None)

На private вышло неплохо - я доволен.