In [1]:
%pylab inline
import numpy as np
import warnings

from scipy import sparse

from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.svm import SVC

In [3]:
from sklearn.model_selection import cross_val_score

In [4]:
from sklearn.linear_model import Lasso

In [5]:
import matplotlib.pyplot as plt
# Default size (width, height in inches) for every figure drawn in this notebook
plt.rcParams.update({"figure.figsize": (15, 8)})

In [6]:
def read_data_from_file(filename, shape):
    """Load a sparse matrix stored as ``row,col,value`` triplets in a CSV file.

    The first line of the file is treated as a header and skipped. Row and
    column indices in the file are 1-based and are shifted to 0-based.
    Blank lines are ignored.

    Parameters
    ----------
    filename : str
        Path to the CSV file.
    shape : tuple of int
        ``(n_rows, n_cols)`` of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of the requested shape holding the values as floats.

    Raises
    ------
    ValueError
        If a non-blank data line does not consist of exactly three
        comma-separated numeric fields.
    """
    values = []
    rows = []
    cols = []

    # The context manager guarantees the handle is closed even if parsing fails
    with open(filename) as infile:
        next(infile, None)  # skip the header line (no-op for an empty file)
        for line in infile:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line at the end of the file
                continue
            row, col, value = line.split(',')
            rows.append(int(row) - 1)
            cols.append(int(col) - 1)
            values.append(float(value))

    return sparse.csr_matrix((values, (rows, cols)), shape=shape)

In [7]:
# Read the train and test feature matrices (both 15000 x 30000) as float sparse matrices
X_train, X_test = (
    read_data_from_file(path, (15000, 30000)).astype(float)
    for path in ('X_train.csv', 'X_test.csv')
)
print(X_train.shape, X_test.shape)

(15000, 30000) (15000, 30000)


In [8]:
from sklearn.preprocessing import scale

# Scale the features on the stacked train + test rows so both parts share the
# same per-feature scaling; with_mean=False disables centering.
n_train = X_train.shape[0]  # number of training rows = split point in the stacked matrix
X_all = scale(sparse.vstack([X_train, X_test]), with_mean=False)
X_train = X_all[:n_train, :]
X_test = X_all[n_train:, :]

# The stacked copy is no longer needed; release the reference
del X_all

In [9]:
def read_labels_from_file(filename, shape):
    """Load a multilabel indicator matrix from a CSV file.

    The first line is treated as a header and skipped. Every other line has
    the form ``row,labels`` where ``labels`` is a whitespace-separated list
    of label numbers (possibly empty). Row and label numbers in the file are
    1-based and are shifted to 0-based. Blank lines are ignored.

    Parameters
    ----------
    filename : str
        Path to the CSV file.
    shape : tuple of int
        ``(n_rows, n_labels)`` of the resulting matrix.

    Returns
    -------
    numpy.ndarray
        Integer array of the given shape with 1 where the label is set
        for the row and 0 elsewhere.
    """
    labels = np.zeros(shape, dtype=int)

    # The context manager guarantees the handle is closed even if parsing fails
    with open(filename) as infile:
        next(infile, None)  # skip the header line (no-op for an empty file)
        for line in infile:
            line = line.strip()
            if not line:
                continue
            row, indices = line.split(',')
            row = int(row) - 1
            indices = [int(x) - 1 for x in indices.split()]
            labels[row, indices] = 1

    return labels

In [10]:
# Training targets: 15000 rows x 98 label columns
y_train = read_labels_from_file(filename='y_train.csv', shape=(15000, 98))


In [11]:
def write_labels_to_file(labels, filename):
    """Write a label indicator matrix to a CSV file.

    The output starts with the header ``Id,Labels``. Each following line is
    ``<row number>,<label numbers>`` where the label numbers are the
    positions of the non-zero entries of the row, separated by spaces.
    Row and label numbers are written 1-based.

    Parameters
    ----------
    labels : iterable of 1-D array-like
        One row of label indicators per sample (e.g. a 2-D numpy array).
    filename : str
        Path of the file to create or overwrite.
    """
    # The context manager flushes and closes the file, so the output is
    # complete on disk as soon as the function returns
    with open(filename, 'w') as outfile:
        print("Id,Labels", file=outfile)
        for i, line in enumerate(labels):
            # np.nonzero is used explicitly instead of relying on the bare
            # name injected into the namespace by %pylab
            elements = [str(x) for x in np.nonzero(line)[0] + 1]
            print("%d,%s" % (i + 1, ' '.join(elements)), file=outfile)

## Моё решение (бейзлайн): отдельная логистическая регрессия для каждой метки

In [33]:
def classify_one_label(X_train, y_train, X_test):
    """Train a default ``LogisticRegression`` on one target and predict the test rows.

    Parameters
    ----------
    X_train : feature matrix used for fitting.
    y_train : target values for the rows of ``X_train``.
    X_test : feature matrix to predict.

    Returns
    -------
    The output of ``LogisticRegression.predict`` for ``X_test``.
    """
    # ``fit`` returns the estimator itself, so it can be chained
    model = LogisticRegression().fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions

In [14]:
# Empty (n_test_rows, 0) float array for the per-label prediction columns
y_test = np.empty((X_test.shape[0], 0))

In [36]:
# Predict every label independently and assemble the prediction matrix in one
# step. Rebuilding y_test from scratch keeps the cell idempotent (re-running it
# cannot add duplicate columns) and avoids re-copying the whole matrix on every
# np.append call. The result is cast to float, the dtype y_test had before.
y_test = np.column_stack([
    classify_one_label(X_train, y_train[:, i], X_test)
    for i in range(y_train.shape[1])
]).astype(float)

In [40]:
# Write the predicted label matrix to 'y_test.csv' (overwrites an existing file)
write_labels_to_file(labels=y_test, filename='y_test.csv')

## Пара функций для поиска по сетке (GridSearchCV) — пока не работают

In [18]:
def make_svc(X_train, y_train, X_test):
    """Grid-search the SVC regularisation strength ``C`` and plot the mean CV F1 score.

    Runs a 5-fold ``GridSearchCV`` over ``C`` in ``(0.01, 0.1)`` and plots the
    mean test score for every candidate on a logarithmic x axis.

    Parameters
    ----------
    X_train : feature matrix used for the cross-validated search.
    y_train : either a 2-D multilabel indicator matrix or a 1-D vector of
        0/1 labels (assumes the positive class is 1 — verify against caller).
    X_test : unused; kept so the signature stays unchanged.

    Returns
    -------
    None. The result is shown as a plot.
    """
    grid = np.array((0.01, 0.1))

    # SVC accepts only a 1-D target, while F1 averaged over 'samples' is
    # defined only for multilabel targets, so the two cannot be combined
    # directly. A 2-D indicator matrix is handled with one SVC per label
    # (one-vs-rest); a single label column falls back to binary F1.
    if np.ndim(y_train) == 2:
        algorithm = OneVsRestClassifier(SVC())
        parameters = {'estimator__C': grid}
        scorer = make_scorer(f1_score, average='samples')
    else:
        algorithm = SVC()
        parameters = {'C': grid}
        scorer = make_scorer(f1_score, average='binary')

    classifier = GridSearchCV(estimator=algorithm, param_grid=parameters, cv=5,
                              scoring=scorer, n_jobs=-1)
    classifier.fit(X_train, y_train)

    means = classifier.cv_results_['mean_test_score']

    plt.figure()
    # The tuned parameter is C, not alpha
    plt.title('choose C')
    plt.xscale('log')
    plt.plot(grid, means, label='mean values of score')
    plt.legend()
    plt.show()

In [79]:
def make_lasso(X_train, y_train, X_test):
    """Grid-search the Lasso penalty ``alpha`` and plot the mean CV ROC AUC.

    Runs a 5-fold ``GridSearchCV`` over ``alpha`` in ``(0.01, 0.1)`` scored
    with ``'roc_auc'`` and plots the mean test score for every candidate on
    a logarithmic x axis.

    Parameters
    ----------
    X_train : feature matrix used for the cross-validated search.
    y_train : targets passed to ``GridSearchCV.fit``.
    X_test : unused by this function.

    Returns
    -------
    None. The result is shown as a plot.
    """
    # Wider candidate grid, currently disabled:
    # (0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1., 2., 5., 10.)
    alphas = np.array((0.01, 0.1))

    search = GridSearchCV(
        estimator=Lasso(),
        param_grid={'alpha': alphas},
        cv=5,
        scoring='roc_auc',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    mean_scores = search.cv_results_['mean_test_score']

    plt.figure()
    plt.title('choose alpha')
    plt.xscale('log')
    plt.plot(alphas, mean_scores, label='mean values of score')
    plt.legend()
    plt.show()

## Здесь мы стараемся сократить количество признаков с помощью Лассо.

In [37]:
# Empty (n_test_rows, 0) float array for the per-label prediction columns
y_test = np.empty((X_test.shape[0], 0))

In [48]:
def classify_one_label_lasso(X_train, y_train, X_test):
    """Predict one label with logistic regression on Lasso-selected features.

    A ``Lasso(alpha=0.01)`` model is fitted first; only the features with a
    non-zero coefficient are passed on to a default ``LogisticRegression``.
    If Lasso leaves no feature at all, every feature is used.

    Parameters
    ----------
    X_train : feature matrix used for fitting.
    y_train : target values for the rows of ``X_train``.
    X_test : feature matrix to predict.

    Returns
    -------
    The output of ``LogisticRegression.predict`` for the selected columns
    of ``X_test``.
    """
    selector = Lasso(alpha=0.01)
    selector.fit(X_train, y_train)
    selected = np.nonzero(selector.coef_)[0]

    # Fall back to the full feature set when every coefficient was zeroed out
    if not selected.size:
        selected = np.arange(X_train.shape[1])

    model = LogisticRegression()
    model.fit(X_train[:, selected], y_train)
    return model.predict(X_test[:, selected])

In [49]:
# Predict every label independently and assemble the prediction matrix in one
# step. Rebuilding y_test from scratch keeps the cell idempotent (re-running it
# cannot add duplicate columns) and avoids re-copying the whole matrix on every
# np.append call. The result is cast to float, the dtype y_test had before.
y_test = np.column_stack([
    classify_one_label_lasso(X_train, y_train[:, i], X_test)
    for i in range(y_train.shape[1])
]).astype(float)

In [51]:
# Write the predicted label matrix to 'y_test.csv' (overwrites an existing file)
write_labels_to_file(labels=y_test, filename='y_test.csv')

## Поправим немного: уменьшаем alpha, пока Лассо не оставит хотя бы 1000 признаков

In [62]:
def classify_one_label_lasso_upgraded(X_train, y_train, X_test,
                                      alphas=(0.01, 0.001, 0.0001),
                                      min_features=1000):
    """Predict one label with logistic regression on Lasso-selected features.

    Lasso models are fitted with the penalties in ``alphas``, in order, until
    one of them keeps at least ``min_features`` features with a non-zero
    coefficient. If none does, every feature is used. The number of features
    finally used is printed.

    Parameters
    ----------
    X_train : feature matrix used for fitting.
    y_train : target values for the rows of ``X_train``.
    X_test : feature matrix to predict.
    alphas : iterable of float, optional
        Lasso penalties to try, in order. The default reproduces the
        original 0.01 -> 0.001 -> 0.0001 sequence.
    min_features : int, optional
        Minimum number of selected features required to accept a Lasso fit.

    Returns
    -------
    The output of ``LogisticRegression.predict`` for the selected columns
    of ``X_test``.
    """
    for alpha in alphas:
        algorithm = Lasso(alpha=alpha)
        algorithm.fit(X_train, y_train)
        valid_indices = np.nonzero(algorithm.coef_)[0]
        if valid_indices.size >= min_features:
            break
    else:
        # No penalty kept enough features (or ``alphas`` is empty): use them all
        valid_indices = np.arange(X_train.shape[1])

    print(valid_indices.size, "\n")
    clf = LogisticRegression()
    clf.fit(X_train[:, valid_indices], y_train)

    return clf.predict(X_test[:, valid_indices])

In [63]:
# Predict every label independently and assemble the prediction matrix in one
# step. y_test is rebuilt from scratch here: appending to the existing array
# would keep the columns produced by the previous experiment, giving twice as
# many columns as there are labels and label ids that do not exist in the
# written file. The result is cast to float, the dtype y_test had before.
y_test = np.column_stack([
    classify_one_label_lasso_upgraded(X_train, y_train[:, i], X_test)
    for i in range(y_train.shape[1])
]).astype(float)

4085 

4643 

2359 

2277 

6899 

30000 

1491 

1182 

1202 

1005 

1590 

2400 

4569 

1579 

9183 

1762 

1168 

6668 

6151 

2868 

1237 

6935 

1472 

30000 

3908 

7876 

2437 

6697 

1676 

30000 

2100 

1402 

30000 

6602 

1013 

4145 

2100 

1102 

30000 

3276 

1828 

1237 

1084 

5346 

3366 

30000 

1387 

5937 

30000 

4723 

1482 

1028 

1424 

30000 

4239 

1471 

30000 

1384 

3493 

1852 

30000 

1679 

30000 

2239 

3405 

30000 

30000 

1431 

30000 

1960 

3550 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

1452 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 

30000 



In [64]:
# Write the predicted label matrix to 'y_test.csv' (overwrites an existing file)
write_labels_to_file(labels=y_test, filename='y_test.csv')