In [1]:
%pylab inline
import numpy as np
import warnings

from scipy import sparse

from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

Populating the interactive namespace from numpy and matplotlib


In [2]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso

In [38]:
from sklearn.ensemble import RandomForestClassifier

In [3]:
import matplotlib.pyplot as plt
# Default (width, height) of every matplotlib figure drawn in this notebook.
plt.rcParams["figure.figsize"] = (15, 8)

In [4]:
def read_data_from_file(filename, shape):
    """Load a sparse matrix from a CSV file of ``row,col,value`` triplets.

    The first line of the file is treated as a header and skipped. Row and
    column indices are 1-based in the file and converted to 0-based here.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    shape : tuple of int
        ``(n_rows, n_cols)`` of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of the requested shape holding the parsed float values.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly three comma-separated
        fields that parse as ``int,int,float``.
    """
    values = []
    rows = []
    cols = []

    # The context manager closes the file even when a line fails to parse.
    with open(filename) as infile:
        next(infile, None)  # skip the header line (no-op for an empty file)
        for line in infile:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            row, col, value = line.split(',')
            rows.append(int(row) - 1)
            cols.append(int(col) - 1)
            values.append(float(value))

    return sparse.csr_matrix((values, (rows, cols)), shape=shape)

In [5]:
# Train and test feature files are loaded with the same 15000 x 30000 shape.
DATA_SHAPE = (15000, 30000)
X_train = read_data_from_file('X_train.csv', DATA_SHAPE).astype(float)
X_test = read_data_from_file('X_test.csv', DATA_SHAPE).astype(float)
print(X_train.shape, X_test.shape)

(15000, 30000) (15000, 30000)


In [6]:
from sklearn.preprocessing import scale

# Scale train and test together so both are divided by the same per-feature
# statistics. with_mean=False skips centering, which keeps the data sparse.
n_train = X_train.shape[0]  # split point taken from the data, not hard-coded
X_all = sparse.vstack([X_train, X_test])
X_all = scale(X_all, with_mean=False)
X_train = X_all[:n_train, :]
X_test = X_all[n_train:, :]

# The stacked copy is no longer needed; free the memory.
del X_all

In [7]:
def read_labels_from_file(filename, shape):
    """Load a binary label-indicator matrix from a CSV file.

    After a header line, every line has the form ``row,labels`` where
    ``labels`` is a whitespace-separated list of label ids (possibly empty).
    Row and label ids are 1-based in the file and converted to 0-based here.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.
    shape : tuple of int
        ``(n_rows, n_labels)`` of the resulting matrix.

    Returns
    -------
    numpy.ndarray
        Integer array of the requested shape with 1 at every listed
        ``(row, label)`` position and 0 elsewhere.
    """
    labels = np.zeros(shape, dtype=int)

    # The context manager closes the file even when a line fails to parse.
    with open(filename) as infile:
        next(infile, None)  # skip the header line (no-op for an empty file)
        for line in infile:
            line = line.strip()
            if not line:
                # Tolerate blank lines, e.g. a trailing newline at end of file.
                continue
            row, indeces = line.split(',')
            row = int(row) - 1
            indeces = [int(x) - 1 for x in indeces.split()]
            labels[row, indeces] = 1

    return labels

In [8]:
# Training labels as a 15000 x 98 indicator matrix (rows of y_train.csv x label ids).
y_train = read_labels_from_file('y_train.csv', (15000, 98))

In [9]:
def write_labels_to_file(labels, filename):
    """Write a label-indicator matrix to a CSV file.

    The file starts with the header ``Id,Labels``. Each following line is
    ``<row id>,<label ids>``, where the label ids are the 1-based positions of
    the non-zero entries of that row, separated by single spaces. A row
    without non-zero entries produces ``<row id>,``.

    Parameters
    ----------
    labels : 2-D array-like
        One row per sample, one column per label.
    filename : str
        Path of the file to create; an existing file is overwritten.
    """
    # The context manager flushes and closes the file; the original never closed it.
    with open(filename, 'w') as outfile:
        print("Id,Labels", file=outfile)
        for i, line in enumerate(labels):
            # Explicit np.nonzero: do not rely on the names %pylab injects.
            elements = [str(x) for x in np.nonzero(line)[0] + 1]
            print("%d,%s" % (i + 1, ' '.join(elements)), file=outfile)

## Классификация одного лейбла

In [34]:
def classify_one_label(X_train, y_train, X_test, classifier=None):
    """Fit a classifier for a single label and predict that label for ``X_test``.

    Parameters
    ----------
    X_train : array-like or sparse matrix
        Training features.
    y_train : array-like
        Training targets for one label.
    X_test : array-like or sparse matrix
        Features to predict for.
    classifier : object with ``fit(X, y)`` and ``predict(X)``, optional
        Estimator to use. When omitted, a new ``LogisticRegression()`` is
        created for this call.

    Returns
    -------
    Whatever ``classifier.predict(X_test)`` returns.
    """
    if classifier is None:
        # Build the default per call. An instance placed in the signature is
        # created once at definition time and shared by every call.
        classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)

In [39]:
# Baseline: mean 5-fold cross-validated F1 of logistic regression on the first label.
first_label_scores = cross_val_score(
    LogisticRegression(), X_train, y_train[:, 0], cv=5, scoring='f1')
np.mean(first_label_scores)

0.54055609791761949

In [2]:
from scipy.stats import pearsonr

## Мы сейчас будем пытаться извлечь самые важные фичи. 

In [12]:
import pandas as pd

In [11]:
# Scratch check: how a space-separated line is split into tokens.
line = "234 34 53"
line.strip().split(" ")

['234', '34', '53']

In [14]:
# Whitespace-separated table without a header row. The cells below treat each
# column as one label and each row position as one feature index.
filename = "correlations.txt"
# sep=r"\s+" is the supported spelling of the deprecated delim_whitespace=True.
df = pd.read_csv(filename, sep=r"\s+", header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,88,89,90,91,92,93,94,95,96,97
0,-0.002590,-0.002910,-0.001821,-0.001728,-0.000784,-0.000320,-0.001309,-0.001284,-0.001341,-0.000347,...,-0.000094,-0.000115,-0.000115,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000133
1,-0.002590,-0.002910,-0.001821,-0.001728,-0.000784,-0.000320,-0.001309,-0.001284,-0.001341,-0.000347,...,-0.000094,-0.000115,-0.000115,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000133
2,-0.002590,-0.002910,-0.001821,-0.001728,-0.000784,-0.000320,-0.001309,-0.001284,-0.001341,-0.000347,...,-0.000094,-0.000115,-0.000115,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000133
3,-0.013726,-0.012326,-0.009654,-0.003684,-0.004155,-0.001696,-0.006939,-0.002492,-0.007108,-0.001838,...,-0.000500,-0.000612,-0.000612,-0.000353,-0.000353,-0.000353,-0.000353,-0.000353,-0.000353,-0.000707
4,-0.003658,-0.004111,-0.002572,-0.002440,-0.001107,-0.000452,-0.001849,-0.001814,-0.001894,-0.000490,...,-0.000133,-0.000163,-0.000163,-0.000094,-0.000094,-0.000094,-0.000094,-0.000094,-0.000094,-0.000188
5,-0.002590,-0.002910,-0.001821,-0.001728,-0.000784,-0.000320,-0.001309,-0.001284,-0.001341,-0.000347,...,-0.000094,-0.000115,-0.000115,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000133
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,0.025744,-0.002910,-0.001821,-0.001728,-0.000784,-0.000320,-0.001309,-0.001284,-0.001341,-0.000347,...,-0.000094,-0.000115,-0.000115,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000133
9,-0.002590,-0.002910,-0.001821,-0.001728,-0.000784,-0.000320,0.050922,-0.001284,-0.001341,-0.000347,...,-0.000094,-0.000115,-0.000115,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000067,-0.000133


In [20]:
# For every label (column of df) keep the row positions whose value is strictly
# positive. NaN entries compare as False and are therefore left out.
correlations = df.values
positive_correlation_indeces = [
    np.arange(correlations.shape[0])[correlations[:, label].ravel() > 0]
    for label in range(df.shape[1])
]

In [24]:
# Make sure every entry is a NumPy array. Iterating over the list itself, not a
# hard-coded range(98), keeps this correct for any number of labels.
positive_correlation_indeces = [
    np.array(indeces) for indeces in positive_correlation_indeces
]

In [33]:
# Number of selected feature indices per label. Derived from the list itself
# instead of a hard-coded range(98).
sizes = [indeces.size for indeces in positive_correlation_indeces]

In [40]:
# Empty (number of test rows, 0) array: a prediction matrix with no label columns yet.
y_test = np.empty(shape=(X_test.shape[0], 0))

In [41]:
# Train one classifier per label on that label's selected feature columns and
# predict the label for every test row.
# The original call ended in a dangling `classifier=` (a SyntaxError). The
# argument is omitted so classify_one_label uses its default estimator; pass
# classifier=<estimator> to try another model.
label_predictions = []
for i in range(y_train.shape[1]):
    val_ind = positive_correlation_indeces[i]
    label_predictions.append(classify_one_label(
        X_train[:, val_ind], y_train[:, i], X_test[:, val_ind]))
    if i % 5 == 0 or i < 5:
        print(i)  # progress indicator

# Assemble all columns in one step instead of re-allocating with np.append for
# every label. Rebuilding y_test from scratch also makes the cell safe to re-run.
# The float dtype matches what appending to a float array produced before.
y_test = np.column_stack(label_predictions).astype(float)

0
1
2
3
4
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95


In [42]:
# Save the per-label predictions to y_test.csv.
write_labels_to_file(y_test, 'y_test.csv')

In [None]:
# NOTE(review): this cell repeats the per-label prediction loop of an earlier
# cell and was never executed; consider removing it.
# The dangling `classifier=` (a SyntaxError) is dropped so classify_one_label
# uses its default estimator.
label_predictions = []
for i in range(y_train.shape[1]):
    val_ind = positive_correlation_indeces[i]
    label_predictions.append(classify_one_label(
        X_train[:, val_ind], y_train[:, i], X_test[:, val_ind]))
    if i % 5 == 0 or i < 5:
        print(i)  # progress indicator

# Build y_test from scratch so that running the cell never stacks extra columns
# onto an existing prediction matrix. Float dtype as with the np.append version.
y_test = np.column_stack(label_predictions).astype(float)

In [53]:
# Cross-validated F1 of an SVC over a grid of C values, for the first three labels.
# NOTE: 10e-1 is 1.0 (not 0.1); the commented-out grid spans 1e-3 ... 1.0.
#grid = np.array((10e-4, 3 * 10e-4, 10e-3, 3 * 10e-3, 10e-2, 3 * 10e-2, 10e-1))
grid = np.array([10e-1])

score_rows = []
for i in range(3):
    clf = GridSearchCV(estimator=SVC(), param_grid={'C': grid}, cv=2, scoring='f1', n_jobs=-1)

    val_ind = positive_correlation_indeces[i]
    clf.fit(X_train[:, val_ind], y_train[:, i])
    score_rows.append(
        np.array(clf.cv_results_['mean_test_score']).reshape(1, grid.size))
    print(i)

# One row per label, one column per grid value.
cv_results = np.vstack([np.empty(shape=(0, grid.size), dtype=float)] + score_rows)

0
1
2


In [54]:
# Mean cross-validated F1: one row per label, one column per value of C.
cv_results

array([[ 0.02432846],
       [ 0.02210406],
       [ 0.00278514]])

In [None]:
# Preview the stored entries of the first row only. Looping over all of X_train
# would print every non-zero entry of the whole matrix and flood the notebook.
for item in X_train[:1]:
    print(item)

  (0, 623)	13.9560284721
  (0, 892)	7.03057206654
  (0, 1379)	1.49291176873
  (0, 1579)	5.33637208104
  (0, 4071)	5.31328779347
  (0, 6021)	29.4560279503
  (0, 6407)	1.19422014944
  (0, 9067)	9.92435424325
  (0, 9151)	14.6621296719
  (0, 9846)	6.31428658967
  (0, 10237)	14.5747975753
  (0, 11238)	4.33831851486
  (0, 11315)	6.97764338894
  (0, 12847)	1.76338832988
  (0, 14910)	7.51629630292
  (0, 15474)	32.2232295082
  (0, 16467)	2.64944389637
  (0, 16546)	8.68963331063
  (0, 17498)	2.92794573101
  (0, 17554)	2.48162238435
  (0, 17832)	2.63493671694
  (0, 17868)	24.549562822
  (0, 18744)	10.4650450566
  (0, 18933)	11.838901504
  (0, 19083)	18.360344931
  (0, 19183)	60.5161529383
  (0, 19864)	5.40920215668
  (0, 20484)	5.27174910891
  (0, 21526)	10.0872269939
  (0, 23040)	1.50154749103
  (0, 23573)	2.61206995395
  (0, 26310)	6.64712970463
  (0, 27067)	3.10957513641
  (0, 27195)	1.76077005107
  (0, 27593)	2.48166106752
  (0, 27923)	4.9190687081
  (0, 28455)	1.53042658461
  (0, 29153)	103.

In [None]:
def write_X_train_to_file(labels, filename):
    """Write the non-zero column ids of every row of a matrix to a text file.

    Line ``k`` (1-based) has the form ``k,<ids>``, where ``<ids>`` are the
    1-based column indices of the non-zero entries of row ``k``, separated by
    single spaces. This is the line format of ``write_labels_to_file`` without
    the header line.

    NOTE(review): the original body referenced the undefined names ``i`` and
    ``elements``, ignored its first argument and iterated over the global
    ``X_train``. The format written here is a best guess modelled on
    ``write_labels_to_file`` -- confirm it is what the consumer expects.

    Parameters
    ----------
    labels : 2-D numpy array or scipy sparse matrix
        Matrix to dump (the parameter name is kept for compatibility).
    filename : str
        Path of the file to create; an existing file is overwritten.
    """
    with open(filename, 'w') as outfile:
        for i, item in enumerate(labels):
            # nonzero() returns (cols,) for a dense 1-D row and (rows, cols) for
            # a sparse 1 x n row; the last entry holds the columns either way.
            elements = [str(x) for x in item.nonzero()[-1] + 1]
            print("%d,%s" % (i + 1, ' '.join(elements)), file=outfile)

In [24]:
# Scratch check: iterating over the first 10 rows yields one item per row.
for item in X_train[:10]:
    print('lol')

lol
lol
lol
lol
lol
lol
lol
lol
lol
lol


In [31]:
# Re-export X_train.csv as plain text: the first line holds the number of
# entries, then one "row col value" triple per line.
with open('X_train.csv') as infile:
    next(infile, None)  # skip the CSV header
    entries = [line.strip().split(',') for line in infile if line.strip()]

# Context managers make sure the output is flushed and closed. The entry count is
# derived from the data instead of the hard-coded 476913, so it always matches
# the number of triples actually written.
with open('X_trainC++.txt', 'w') as outfile:
    print(len(entries), file=outfile)
    for array in entries:
        print(array[0], array[1], array[2], file=outfile)

In [30]:
# Display the matrix summary (shape, dtype, stored elements, sparse format).
X_train

<15000x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 476913 stored elements in Compressed Sparse Column format>

In [8]:
# Re-export the labels as plain text: first the per-row sum of y_train (the
# number of labels of each training row, one per line), then one "row labels"
# line for every data line of y_train.csv.
# Context managers make sure both files are closed and the output is flushed.
with open('y_trainC++.txt', 'w') as outfile:
    for item in y_train.sum(axis=1):
        print(item, file=outfile)

    with open('y_train.csv') as infile:
        next(infile, None)  # skip the CSV header
        for line in infile:
            array = line.strip().split(',')
            print(array[0], array[1], file=outfile)

In [55]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.multiclass import OneVsRestClassifier, OutputCodeClassifier
from sklearn.svm import LinearSVC
from numpy import vectorize
from math import atan
# Element-wise arctangent. np.arctan is the native ufunc: unlike
# vectorize(atan) it does not loop in Python and it accepts empty arrays.
vatan = np.arctan

In [61]:
def predict_threshold(predict, thresholds):
    """Binarise scores against thresholds.

    Returns an integer array holding 1 wherever ``predict`` is strictly
    greater than ``thresholds`` (compared element-wise by ``np.greater``)
    and 0 everywhere else.
    """
    above = np.greater(predict, thresholds)
    return above.astype(int)

In [75]:
class LinearClassifier(BaseEstimator, ClassifierMixin):
    """One-vs-rest linear SVM with an adjustable decision threshold.

    One ``LinearSVC`` per label is trained through ``OneVsRestClassifier``.
    A label is predicted when its decision value is strictly greater than
    ``threshold``.

    Parameters
    ----------
    C : float, default=1.0
        Passed to ``LinearSVC`` as ``C``.
    threshold : float, default=0.0
        Decision values strictly greater than this are predicted as 1.
    n_jobs : int, default=-1
        Passed to ``OneVsRestClassifier`` as ``n_jobs``.
    class_weight : optional
        Passed to ``LinearSVC`` as ``class_weight``.
    """

    def __init__(self, C=1.0, threshold=0.0, n_jobs=-1, class_weight=None):
        self.C = C
        self.threshold = threshold
        self.n_jobs = n_jobs
        self.class_weight = class_weight

    def fit(self, X, y):
        """Train the one-vs-rest model on ``X`` and ``y``; returns ``self``."""
        self.clf = OneVsRestClassifier(
            LinearSVC(C=self.C, class_weight=self.class_weight),
            n_jobs=self.n_jobs)
        self.clf.fit(X, y)
        # Returning self follows the estimator convention and allows chaining.
        return self

    def decision_function(self, X):
        """Return the decision values of the fitted one-vs-rest model for ``X``."""
        return self.clf.decision_function(X)

    def predict(self, X):
        """Return 1 where the decision value exceeds ``threshold``, else 0."""
        # The scalar threshold broadcasts over all label columns, so the label
        # count is no longer hard-coded to 98.
        return predict_threshold(self.decision_function(X), self.threshold)

    def predict_proba(self, X):
        """Return the arctangent of the decision values.

        NOTE(review): the result lies in (-pi/2, pi/2), so it is a monotone
        score rather than a calibrated probability -- confirm this is intended.
        """
        return vatan(self.decision_function(X))

In [76]:
# C is 1e-4 (the same value as the earlier spelling 10e-5). The threshold -0.5
# predicts a label whenever its decision value is greater than -0.5.
classifier_svm = LinearClassifier(C=1e-4, threshold=-0.5, n_jobs=-1)

In [77]:
# Train on the complete training set.
classifier_svm.fit(X_train, y_train)

In [78]:
# Predict the label indicators for the test set.
y_test = classifier_svm.predict(X_test)

In [79]:
# Save the predictions to y_test.csv (an existing file of that name is overwritten).
write_labels_to_file(y_test, 'y_test.csv')

In [70]:
# Hold-out split for quick validation. A fixed random_state makes the split, and
# every score computed from it, reproducible across runs.
X_train_cv, X_test_cv, y_train_cv, y_test_cv = train_test_split(
    X_train, y_train, random_state=42)

In [72]:
# Hold-out check: fit on the training part, then compute the sample-averaged F1
# on the held-out part.
classifier_svm.fit(X_train_cv, y_train_cv)
y_pred_cv = classifier_svm.predict(X_test_cv)
f1_score(y_test_cv, y_pred_cv, average='samples')

0.22098566290403168

In [73]:
def choose_C(grid):
    """Score every C in ``grid`` with a two-way hold-out swap.

    For each C a ``LinearClassifier`` (threshold -0.5) is fitted on the
    ``*_train_cv`` split and scored on ``*_test_cv``, then fitted on
    ``*_test_cv`` and scored on ``*_train_cv``. The score for that C is the
    mean of the two sample-averaged F1 values.

    Reads the module-level ``X_train_cv``, ``y_train_cv``, ``X_test_cv`` and
    ``y_test_cv``.

    Returns
    -------
    numpy.ndarray
        One score per entry of ``grid``, in the same order.
    """
    results = []
    for C in grid:
        # (fit features, fit labels, evaluation features, evaluation labels)
        halves = (
            (X_train_cv, y_train_cv, X_test_cv, y_test_cv),
            (X_test_cv, y_test_cv, X_train_cv, y_train_cv),
        )
        classifier_svm = LinearClassifier(C, threshold=-0.5, n_jobs=-1)
        fold_scores = []
        for X_fit, y_fit, X_eval, y_eval in halves:
            classifier_svm.fit(X_fit, y_fit)
            fold_scores.append(
                f1_score(y_eval, classifier_svm.predict(X_eval), average='samples'))
        results.append((fold_scores[0] + fold_scores[1]) / 2)
    return np.array(results)

In [74]:
# Candidate C values 1e-4, 1e-3, 1e-2 (the same values as 10e-5, 10e-4, 10e-3).
choose_C(grid=(1e-4, 1e-3, 1e-2))

array([ 0.16491039,  0.16451207,  0.1582988 ])