In [17]:
#imports
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.cross_validation import train_test_split, cross_val_predict, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler
from copy import copy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE
from math import sqrt
import xgboost as xgb
import random

###Несколько полезных функций

In [81]:
#from .dat to pandas.DataFrame
def get_data(filename):
    f = open(filename, 'r')
    array = []
    names_of_columns_was_read = False
    for line in f:
        values = line.split(',')
        values[-1] = values[-1][:-1]
        if names_of_columns_was_read is False:
            for i in xrange(1, len(values)):
                values[i] = values[i][1:]
            names_of_columns_was_read = True
        else:
            for i in xrange(len(values)):
                if values[i] == '?':
                    values[i] = np.nan
                elif '.' in values[i]:
                    values[i] = float(values[i])
                elif values[i].isdigit():
                    values[i] = int(values[i])
        array.append(values)

    data = pd.DataFrame(array[1:], columns=array[0])
    return data

In [5]:
#функция, делающая различные замены пропусков
def imputMissingValues(X, Y):
    X_columns = X.columns
    #замена 1: случайное значение

    X_random = X.copy()
    nan_values = X.isnull()
    for column_name in X_columns:
        indices = list(nan_values[nan_values[column_name] == True].index.values)
        for i in indices:
            X_random.set_value(i, column_name, list(X[column_name].dropna().sample(random_state=42))[0])

    #замена 2: среднее

    X_mean = X.fillna(X.mean())

    #замена 3: медиана

    X_median = X.fillna(X.median())

    #замена 4: мода

    X_mode = X.fillna(X.mode().iloc[0])

    #разбиение выборки по классам
    values_of_Y = Y.value_counts()
    classes = values_of_Y.index.values.tolist()
    classes_dataframes = {}
    for class_value in classes:
        classes_dataframes[class_value] = data[data[columns[-1]] == class_value][X_columns]

    #замена 5: среднее по классу

    X_class_mean = X.copy()
    for column_name in X_columns:
        values_to_set = {}
        for class_value in classes:
            class_mean = classes_dataframes[class_value][column_name].dropna().mean()
            if class_mean != np.nan: #случай когда все сэмплы со значением класса пустые
                values_to_set[class_value] = class_mean

        indices = list(nan_values[nan_values[column_name] == True].index.values)
        for i in indices:
            class_value = Y.loc[i]
            if class_value in values_to_set.keys():
                X_class_mean.set_value(i, column_name, values_to_set[class_value])

    #замена 3: медиана по классу

    X_class_median = X.copy()
    for column_name in X_columns:
        values_to_set = {}
        for class_value in classes:
            class_median = classes_dataframes[class_value][column_name].dropna().median()
            if class_median != np.nan:
                values_to_set[class_value] = classes_dataframes[class_value][column_name].dropna().median()

        indices = list(nan_values[nan_values[column_name] == True].index.values)
        for i in indices:
            class_value = Y.loc[i]
            if class_value in values_to_set.keys():
                X_class_median.set_value(i, column_name, values_to_set[class_value])

    #замена 4: мода по классу
    #FIXME значение в случае отсутствия моды по классу (сейчас такие случаи просто игнорируются) 

    X_class_mode = X.copy()
    for column_name in X_columns:
        values_to_set = {}
        for class_value in classes:
            #values_to_set[class_value] = classes_dataframes[class_value][column_name].dropna().mode().ix[0, :]
            class_mode = classes_dataframes[class_value][column_name].dropna().mode()
            if class_mode.empty == False:
                values_to_set[class_value] = class_mode.iloc[0]

        indices = list(nan_values[nan_values[column_name] == True].index.values)
        for i in indices:
            class_value = Y.loc[i]
            if class_value in values_to_set.keys():
                X_class_mode.set_value(i, column_name, values_to_set[class_value])
            
    return (X_random, X_mean, X_median, X_mode, X_class_mean, X_class_median, X_class_mode)

In [6]:
#функция для отделения классифицируемых признаков от регрессируемых :) (эвристическая - если меньше 
#max_number_of_values_per_class возможных значений признака, то он признается классифицируемым)
def devide_features_to_classifiable_and_regressiable(df, max_number_of_values_per_class):
    columns = list(df.columns.values)
    devided_features = {'class':[], 'regr':[]}
    for column in columns:
        if len(df[column].value_counts().index) < max_number_of_values_per_class:
            devided_features['class'].append(column)
        else:
            devided_features['regr'].append(column)
    return devided_features

In [7]:
#разбиение выборки
def get_X_and_y_by_column_name(X, current_X_columns, column_name):
    current_X = X[current_X_columns].dropna()
    current_X_indices = list(current_X.index.values)
    
    current_y = X[column_name]
    current_y_nan_values = current_y.isnull()
    
    y_train = pd.Series()
    X_train = pd.DataFrame(columns=current_X_columns)
    X_test = pd.DataFrame(columns=current_X_columns)
    for index in current_X_indices:
        if current_y_nan_values.loc[index] == True:
            X_test.loc[index] = list(current_X.loc[index])
        else:
            y_train.set_value(index, current_y.loc[index])
            X_train.loc[index] = list(current_X.loc[index])
            
    return (current_X, X_train, y_train, X_test)

In [8]:
from sklearn.preprocessing import Imputer

#разбиение выборки с заполнением пустых значений на обучающей части выборки средним
def get_X_and_y_by_column_name_with_imputs(X, current_X_columns, column_name):
    current_X = pd.DataFrame(Imputer(strategy='median').fit_transform(X[current_X_columns]), columns=current_X_columns)
    
    current_y_nan_values = X[column_name].isnull()
    
    y_train = X[column_name][current_y_nan_values == False]
    X_train = current_X[current_X_columns][current_y_nan_values == False]
    X_test = current_X[current_X_columns][current_y_nan_values == True]
            
    return (current_X, X_train, y_train, X_test)

In [9]:
#подогнать y под корректные значения для PyBrain классификатора
def fit_y(y):
    unique_values = list(y.unique())
    
    values_map = {}
    for index in xrange(len(unique_values)):
        values_map[unique_values[index]] = index
        
    fitted_y = pd.Series()
    for index in y.index:
        fitted_y.set_value(index, values_map[y.loc[index]])
        
    return (fitted_y, values_map)

In [10]:
#дешифратор для предыдущей функции
def decrypt_y(y_fitted, values_map):
    decrypt_map = {}
    for key, value in values_map.iteritems():
        decrypt_map[value] = key
    
    y = pd.Series()
    for index in y_fitted.index:
        y.set_value(index, decrypt_map[y_fitted.loc[index]])
        
    return y

In [11]:
#функция, вычисляющая метрику
def closest_fit_metric(a, b, **kwargs):
    max_min_differences = []
    for key, value in kwargs.items():
        if key == 'max_min_differences':
            max_min_differences = value
        else:
            raise ValueError('Unexpeceted parameter ' + key)
    
    if max_min_differences == 0.0:
        raise ValueError('max_min_differences should be intialized to use this close-fit metric')
            
    d = len(a)
    dist = 0.0
    for i in xrange(d):
        if a[i] != b[i]:
            data_type = type(a[i])
            if data_type == np.int64 or data_type == np.float64:
                dist += abs(a[i] - b[i])/max_min_differences[i]
            else:
                dist += 1.0
    return dist

In [12]:
import operator

def sort_and_print_results(results):
    results_sorted_by_maxAcc = sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)
    for item in results_sorted_by_maxAcc:
        print item[1], item[0]

In [13]:
#создадим массив из разниц максимального и минимального значения для признака

def get_max_min_differences_array_from_data(X):
    max_values = [X.iloc[0][j] for j in xrange(len(X.columns))]
    min_values = [value for value in max_values]
    
    for sample_index in xrange(1, len(X.index)):
        for column_index in xrange(len(X.columns)):
            cell_value = X.iloc[sample_index][column_index]
            
            if type(cell_value) == np.int64 or type(cell_value) == np.float64:
                if cell_value > max_values[column_index]:
                    max_values[column_index] = cell_value

                elif cell_value < min_values[column_index]:
                    min_values[column_index] = cell_value
                            
    max_min_differences = []
    for index in xrange(len(max_values)):
        if type(max_values[index]) == np.int64 or type(max_values[index]) == np.float64:
            max_min_differences.append(max_values[index] - min_values[index])
        else:
            max_min_differences.append(0.0)
    
    return max_min_differences

In [14]:
def neighbors(X, y, results, method_name, min_neigh_num, max_neigh_num):
    max_acc = 0
    best_neigh = 0
    epsilon = 0.005
    
    for neighbors_num in xrange(min_neigh_num, max_neigh_num):
        knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = neighbors_num)

        X_scaled = StandardScaler().fit_transform(X)
        acc_score = cross_val_score(knn, X_scaled, y, cv=5)
        mean_acc_score = acc_score.mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh = neighbors_num
            
    results[method_name] = (max_acc, best_neigh)

In [15]:
from rep.estimators.pybrain import PyBrainClassifier

def pybrain_classification(X, y, results, method_name):
    y_fitted, values_map = fit_y(y)
    
    pb = PyBrainClassifier(layers=[10], epochs=10, verbose=False)
    
    acc_score = cross_val_score(pb, X, y_fitted, cv=5)
    results[method_name] = acc_score.mean()

In [16]:
from sklearn import svm

def SVMClassification(X, y, results, method_name):
    scaled_X = StandardScaler().fit_transform(X)
    
    svc_acc = cross_val_score(svm.SVC(), scaled_X, y, cv=5).mean()
    nusvc_acc = cross_val_score(svm.NuSVC(), scaled_X, y, cv=5).mean()
    linearsvc_acc = cross_val_score(svm.LinearSVC(), scaled_X, y, cv=5).mean()
    
    acc_map = {svc_acc: 'SVC', nusvc_acc: 'NuSVC', linearsvc_acc: 'LinearSVC'}
    max_acc = max(svc_acc, nusvc_acc, linearsvc_acc)
    
    results[method_name] = (max_acc, acc_map[max_acc])

In [17]:
def SVMMultiClassification(X, y, results, method_name):
    scaled_X = StandardScaler().fit_transform(X)
    
    svc_acc = cross_val_score(svm.SVC(decision_function_shape='ovo'), scaled_X, y, cv=5).mean()
    nusvc_acc = cross_val_score(svm.NuSVC(decision_function_shape='ovo'), scaled_X, y, cv=5).mean()
    linearsvc_acc = cross_val_score(svm.LinearSVC(), scaled_X, y, cv=5).mean()
    
    acc_map = {svc_acc: 'SVC', nusvc_acc: 'NuSVC', linearsvc_acc: 'LinearSVC'}
    max_acc = max(svc_acc, nusvc_acc, linearsvc_acc)
    
    results[method_name] = (max_acc, acc_map[max_acc])

In [18]:
from sklearn.linear_model import LogisticRegression as lr

def logisticRegClassification(X, y, results, method_name):
    scaled_X = StandardScaler().fit_transform(X)
    
    newton_acc = cross_val_score(lr(solver='newton-cg'), scaled_X, y, cv=5).mean()
    lbfgs_acc = cross_val_score(lr(solver='lbfgs'), scaled_X, y, cv=5).mean()
    liblinear_acc = cross_val_score(lr(solver='liblinear'), scaled_X, y, cv=5).mean()
    sag_acc = cross_val_score(lr(solver='sag'), scaled_X, y, cv=5).mean()
    
    acc_map = {newton_acc: 'newton-cg', lbfgs_acc: 'lbfgs', liblinear_acc: 'liblinear', sag_acc: 'sag'}
    max_acc = max(newton_acc, lbfgs_acc, liblinear_acc, sag_acc)
    
    results[method_name] = (max_acc, acc_map[max_acc])

In [19]:
def random_forest_classification(X, y, results, method_name, min_trees_num, max_trees_num, step):
    
    max_acc_score = 0.0
    best_trees_num = min_trees_num
    epsilon = 0.005
    
    trees_numbers = [(min_trees_num + 3*i) for i in xrange((max_trees_num - min_trees_num)/3)]
    
    for trees_num in trees_numbers:
        
        rf = RandomForestClassifier(n_estimators=trees_num)
        
        X_scaled = StandardScaler().fit_transform(X)
        
        acc_score = cross_val_score(rf, X_scaled, y, cv=5)
        acc_score = acc_score.mean()
        
        if acc_score > max_acc_score + epsilon:
            best_trees_num = trees_num
            max_acc_score = acc_score
            
    results[method_name] = (max_acc_score, best_trees_num)

In [20]:
from sklearn.naive_bayes import GaussianNB

def NBClassification(X, y, results, method_name):
    scaled_X = StandardScaler().fit_transform(X)

    results[method_name] = cross_val_score(GaussianNB(), scaled_X, y, cv=5).mean()

In [21]:
def getNumAndCatfeatures(X):
    num_columns = []
    cat_columns = []
    
    for column in X.columns:
        if type(X[column].dropna().iloc[0]) == str:
            cat_columns.append(column)
        else:
            num_columns.append(column)
            
    return (num_columns, cat_columns)

In [22]:
#удалить пропуски и подсоединить таргет
def dropnaAndAddTarget(X, y, y_column):

    data = X.copy()
    data[y_column] = y.copy()
    
    data = data.dropna()
    
    newX = data[X.columns]
    newY = data[y_column]
    
    return (newX, newY)

In [23]:
#собрать y по X
def fitYtoX(X, Y):
    y = pd.Series()
    for index in list(X.index.values):
        y.set_value(index, Y.loc[index])
    
    return y

In [24]:
def addImputedValuesToDF(df, values_to_set, indexes_of_imputed_values, column_name):
    counter = 0
    for index in indexes_of_imputed_values:
        df.set_value(index, column_name, values_to_set[counter])
        counter += 1

###Датасет mammographic

In [29]:
data = get_data('mammographic.dat')
data[:5]

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5,67,3,5,3.0,1
1,4,43,1,1,,1
2,5,58,4,5,3.0,1
3,4,28,1,1,3.0,0
4,5,74,1,5,,1


In [30]:
#выделям из данных Х и Y
columns = list(data.columns.values)
X_columns = columns[:-1]
y_column = columns[-1]
X = data[X_columns]
Y = data[y_column]

In [31]:
getMissingDataRate(data)

0.02809573361082206

In [81]:
#удалим объекты с пропущенными значениями

data_deleted = data.dropna()
X_deleted = data_deleted[columns[:-1]]
y_deleted = data_deleted[columns[-1]]

In [82]:
devided_features = devide_features_to_classifiable_and_regressiable(X, 10)
devided_features

{'class': ['BI-RADS', 'Shape', 'Margin', 'Density'], 'regr': ['Age']}

###Замены

In [83]:
X_random, X_mean, X_median, X_mode, X_class_mean, X_class_median, X_class_mode = imputMissingValues(X, Y)

###k-means

In [107]:
par = {}
esr = KMeans(**par)
#esr.set_params(**par)
esr.get_params()

{'copy_x': True,
 'init': 'k-means++',
 'max_iter': 300,
 'n_clusters': 8,
 'n_init': 10,
 'n_jobs': 1,
 'precompute_distances': 'auto',
 'random_state': None,
 'tol': 0.0001,
 'verbose': 0}

In [84]:
from sklearn.cluster import KMeans

In [91]:
def kmeans_imputer(n_clusters, df):
    estr = KMeans(n_clusters=n_clusters, max_iter=300, n_init=100, init='k-means++', n_jobs=2, random_state=282)
    clusters_pred = estr.fit_predict(Imputer().fit_transform(df))
    cluster_centers = estr.cluster_centers_
    
    data_means2 = df.copy()
    for index in range(data_means2.shape[0]):

        row = data_means2.iloc[index].isnull()

        if row.any():
            for column_index in range(row.size):
                if row.iloc[column_index] == True:

                    data_means2.set_value(index, 
                                          data_means2.columns[column_index], 
                                          cluster_centers[clusters_pred[index], column_index])
                    
    return data_means2

In [111]:
data_means3[:10]

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
0,5,67,3.0,5.0,3.0,1
1,4,43,1.0,1.0,2.875856,1
2,5,58,4.0,5.0,3.0,1
3,4,28,1.0,1.0,3.0,0
4,5,74,1.0,5.0,2.920818,1
5,4,65,1.0,3.404422,3.0,0
6,4,70,3.172676,3.404422,3.0,0
7,5,42,1.0,1.945397,3.0,0
8,5,57,1.0,5.0,3.0,1
9,5,60,2.809205,5.0,1.0,1


In [92]:
data_means2 = kmeans_imputer(2, data)
data_means3 = kmeans_imputer(3, data)
data_means4 = kmeans_imputer(4, data)
data_means5 = kmeans_imputer(5, data)
data_means6 = kmeans_imputer(6, data)

In [93]:
results = {}
neighbors(X_deleted, y_deleted, results, 'deleted', 10, 70)
neighbors(data_means2[data_means2.columns[:-1]], 
          data_means2[data_means2.columns[-1]], results, 'means2', 10, 70)
neighbors(data_means3[data_means3.columns[:-1]], 
          data_means3[data_means3.columns[-1]], results, 'means3', 10, 70)
neighbors(data_means4[data_means4.columns[:-1]], 
          data_means4[data_means4.columns[-1]], results, 'means4', 10, 70)
neighbors(data_means5[data_means5.columns[:-1]], 
          data_means5[data_means5.columns[-1]], results, 'means5', 10, 70)
neighbors(data_means6[data_means6.columns[:-1]], 
          data_means6[data_means6.columns[-1]], results, 'means6', 10, 70)
sort_and_print_results(results)

(0.83560556994818658, 53) means5
(0.834558506044905, 52) means6
(0.83248167411070528, 18) deleted
(0.83248056994818653, 48) means2
(0.83144969775474953, 47) means4
(0.82935556994818638, 49) means3


In [109]:
results = {}
neighbors(X_class_mean, Y, results, 'class_mean', 10, 70)
neighbors(X_class_median, Y, results, 'class_median', 10, 70)
sort_and_print_results(results)

(0.84286485319516413, 25) class_median
(0.83766191709844562, 46) class_mean


In [44]:
data['Density'].value_counts()

3    798
2     59
1     16
4     12
Name: Density, dtype: int64

In [40]:
estr.transform(data.dropna()[:10])

array([[ 37.90241392,   1.95897431,  11.98785047,  11.83738734,
         23.49281076],
       [ 29.02241971,   8.48390893,  20.95267719,   3.6316995 ,
         14.69320994],
       [  1.64820393,  38.38073812,  51.03654541,  27.5122312 ,
         15.81857689],
       [ 27.96723108,   9.65370924,  22.07673047,   3.3201359 ,
         13.69382536],
       [ 46.79320695,  10.02879408,   3.81115277,  20.75163341,
         32.39043714],
       [ 12.77496467,  24.42509788,  37.06137199,  13.61933128,
          2.52792766],
       [  6.8702054 ,  30.35791162,  43.00515379,  19.50482375,
          7.88016116],
       [ 30.71466642,   6.86726434,  19.18849538,   5.20636839,
         16.36835444],
       [ 24.71722669,  12.66570918,  25.18005341,   3.06504183,
         10.46366305],
       [ 22.88362726,  14.34916199,  26.97456072,   3.81638403,
          8.58519864]])

###Knn

In [30]:
#метод ближайших соседей
#для начала пойдем самым простым путем - для обучения воспользуемся только полностью заполненными объектами

X_knn = X.copy()

for column_name in devided_features['class']:
    current_X_columns = copy(devided_features['class'])
    current_X_columns.remove(column_name)
    
    #обучение
    current_y = X_deleted[column_name]
    current_X = X_deleted[current_X_columns]
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005

    for neighbors_num in xrange(3, 100):
        knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = neighbors_num)
        #knn = KNeighborsClassifier(n_neighbors=neighbors_num)
        X_scaled = StandardScaler().fit_transform(current_X)
        mean_acc_score = cross_val_score(knn, X_scaled, current_y, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num

    print max_acc, best_neigh_num
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
            
    #применение knn
    knn_for_missing_values = KNeighborsClassifier(metric = 'manhattan', n_neighbors=best_neigh_num)
    scaler = StandardScaler().fit(current_X)
    knn_for_missing_values.fit(scaler.transform(X_train), y_train)
    y_test = knn_for_missing_values.predict(scaler.transform(X_test))
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_knn.set_value(index, column_name, y_test[counter])
        counter += 1

0.748063046545 33
0.654197453942 14
0.607098348041 70
0.909677932509 10


In [31]:
#метод ближайших соседей

X_knn1 = X.copy()

for column_name in devided_features['class']:
    current_X_columns = copy(list(X_knn1.columns))
    current_X_columns.remove(column_name)
    
    #обучение
    #current_y = X_deleted[column_name]
    #current_X = X_deleted[current_X_columns]
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name_with_imputs(X_knn1, current_X_columns, column_name)
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005

    for neighbors_num in xrange(3, 100, 3):
        knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = neighbors_num)
        #knn = KNeighborsClassifier(n_neighbors=neighbors_num)
        X_scaled = StandardScaler().fit_transform(X_train)
        mean_acc_score = cross_val_score(knn, X_scaled, y_train, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num

    print column_name, max_acc, best_neigh_num
            
    #применение knn
    knn_for_missing_values = KNeighborsClassifier(metric = 'manhattan', n_neighbors=best_neigh_num)
    scaler = StandardScaler().fit(current_X)
    knn_for_missing_values.fit(scaler.transform(X_train), y_train)
    y_test = knn_for_missing_values.predict(scaler.transform(X_test))
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_knn1.set_value(index, column_name, y_test[counter])
        counter += 1

BI-RADS 0.733029349577 51
Shape 0.619367737584 18
Margin 0.628626117585 93
Density 0.901733294968 9


In [32]:
#метод ближайших соседей регрессия

from sklearn.neighbors import KNeighborsRegressor

for column_name in devided_features['regr']:
    current_X_columns = copy(list(X_knn1.columns))
    current_X_columns.remove(column_name)
    
    #обучение
    #current_y = X_deleted[column_name]
    #current_X = X_deleted[current_X_columns]
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name_with_imputs(X_knn1, 
                                                                                 current_X_columns, 
                                                                                 column_name)
    
    X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.33, random_state=22)
    
    min_mse = 0.0
    best_neigh_num = 0
    epsilon = 0.005

    for neighbors_num in xrange(3, 100, 3):
        knn = KNeighborsRegressor(n_neighbors = neighbors_num)
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_tr)
        knn.fit(X_scaled, y_tr)
        y_pred = knn.predict(scaler.transform(X_te))
        
        mse = sqrt(MSE(y_te, y_pred))
        
        if neighbors_num == 3:
            min_mse = mse
        elif mse < min_mse - epsilon:
            min_mse = mse
            best_neigh_num = neighbors_num

    print column_name, min_mse, best_neigh_num
            
    #применение knn
    knn_for_missing_values = KNeighborsRegressor(n_neighbors=best_neigh_num)
    scaler = StandardScaler().fit(current_X)
    knn_for_missing_values.fit(scaler.transform(X_train), y_train)
    y_test = knn_for_missing_values.predict(scaler.transform(X_test))
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_knn1.set_value(index, column_name, y_test[counter])
        counter += 1

Age 13.5277002918 90


In [33]:
#для X_knn соберем y_knn
X_knn = X_knn.dropna()
y_knn = pd.Series()
for index in list(X_knn.index.values):
    y_knn.set_value(index, Y.loc[index])

###Closest_fit

In [31]:
%%time
X_closest_fit = X.copy()

max_min_differences = get_max_min_differences_array_from_data(X_deleted)

for column_name in devided_features['class']:
    current_X_columns = copy(devided_features['class'])
    current_X_columns.remove(column_name)
    
    #обучение
    current_y = X_deleted[column_name]
    current_X = X_deleted[current_X_columns]
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005
    max_min_differences = get_max_min_differences_array_from_data(X_deleted)

    for neighbors_num in xrange(3, 50):
        knn = KNeighborsClassifier(n_neighbors=neighbors_num,
                                   metric=closest_fit_metric,
                                   metric_params={'max_min_differences':max_min_differences})
        X_scaled = StandardScaler().fit_transform(current_X)
        mean_acc_score = cross_val_score(knn, X_scaled, current_y, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num

    print max_acc, best_neigh_num
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
            
    max_min_differences = get_max_min_differences_array_from_data(current_X)
            
    #применение knn
    knn_for_missing_values = KNeighborsClassifier(n_neighbors=best_neigh_num,
                                                  metric=closest_fit_metric,
                                                  metric_params={'max_min_differences':max_min_differences})
    scaler = StandardScaler().fit(current_X)
    knn_for_missing_values.fit(scaler.transform(X_train), y_train)
    y_test = knn_for_missing_values.predict(scaler.transform(X_test))
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_closest_fit.set_value(index, column_name, y_test[counter])
        counter += 1

0.733615859342 8
0.64458784512 13
0.589089324433 39
0.909677932509 10
CPU times: user 9min 50s, sys: 192 ms, total: 9min 50s
Wall time: 9min 49s


In [33]:
#для X_closest_fit соберем y_closest_fit
X_closest_fit = X_closest_fit.dropna()
y_closest_fit = fitYtoX(X_closest_fit, Y)

###PyBrain

In [36]:
%%time

X_pybrain = X.copy()

for column_name in devided_features['class']:
    current_X_columns = copy(list(X_pybrain.columns))
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_pybrain, current_X_columns, column_name)
    fitted_y, values_map = fit_y(y_train)
    
    pb_for_missing_values = PyBrainClassifier(layers=[10],
                                              epochs=7,
                                              verbose=False)
    pb_for_missing_values.fit(X_train, fitted_y)
    y_test_fitted = pb_for_missing_values.predict(X_test)
    
    y_test = decrypt_y(pd.Series(y_test_fitted), values_map)
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_pybrain.set_value(index, column_name, y_test[counter])
        counter += 1

CPU times: user 37.6 s, sys: 12 ms, total: 37.6 s
Wall time: 37.6 s


In [58]:
%%time
#svm

X_svm = X.copy()

for column_name in devided_features['class']:
    current_X_columns = list(X_svm.columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_svm, current_X_columns, column_name)
    
    clf = svm.SVC()
    
    scaler = StandardScaler().fit(X_train)
    
    clf.fit(scaler.transform(X_train), y_train)
    y_test = clf.predict(scaler.transform(X_test))
    
    addImputedValuesToDF(X_svm, y_test, list(X_test.index.values), column_name)

CPU times: user 13.9 s, sys: 12 ms, total: 13.9 s
Wall time: 13.9 s


In [59]:
%%time
#svm
for column_name in devided_features['regr']:
    current_X_columns = list(X_svm.columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_svm, current_X_columns, column_name)
    
    clf = svm.SVR()
    
    scaler = StandardScaler().fit(X_train)
    
    clf.fit(scaler.transform(X_train), y_train)
    y_test = clf.predict(scaler.transform(X_test))
    
    addImputedValuesToDF(X_svm, y_test, list(X_test.index.values), column_name)

CPU times: user 3.77 s, sys: 8 ms, total: 3.78 s
Wall time: 3.78 s


In [60]:
#для X_svm соберем y_svm
X_svm = X_svm.dropna()
y_svm = fitYtoX(X_svm, Y)

In [41]:
%%time
#linear_regression

X_lr = X.copy()

for column_name in devided_features['class']:
    current_X_columns = list(X_lr.columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_lr, current_X_columns, column_name)
    
    results = {}
    logisticRegClassification(X_train, y_train, results, 'lr')
    
    clf = lr(solver=results['lr'][1])
    
    scaler = StandardScaler().fit(X_train)
    
    clf.fit(scaler.transform(X_train), y_train)
    y_test = clf.predict(scaler.transform(X_test))
    
    addImputedValuesToDF(X_lr, y_test, list(X_test.index.values), column_name)

CPU times: user 15.5 s, sys: 0 ns, total: 15.5 s
Wall time: 15.5 s


In [42]:
for column_name in devided_features['regr']:
    current_X_columns = list(X_lr.columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_lr, current_X_columns, column_name)
    
    scaler = StandardScaler().fit(X_train)
                      
    rgr = lr(solver='sag').fit(scaler.transform(X_train), y_train)
    y_pred = rgr.predict(scaler.transform(X_test))
    
    addImputedValuesToDF(X_lr, y_pred, list(X_test.index.values), column_name)



In [43]:
X[X['Age'].isnull() == True]

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density
443,4,,4,5,3
453,5,,4,4,3
683,5,,3,3,3
884,5,,4,4,3
923,5,,4,3,3


In [44]:
X_lr[X['Age'].isnull() == True]

Unnamed: 0,BI-RADS,Age,Shape,Margin,Density
443,4,67,4,5,3
453,5,67,4,4,3
683,5,66,3,3,3
884,5,67,4,4,3
923,5,66,4,3,3


In [35]:
#для X_lr соберем y_lr
X_lr = X_lr.dropna()
y_lr = fitYtoX(X_lr, Y)

In [42]:
%%time
#naive_bayes

X_nb = X.copy()

for column_name in devided_features['class']:
    current_X_columns = list(X_nb.columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_nb, current_X_columns, column_name)
    
    clf = GaussianNB()
    
    scaler = StandardScaler().fit(X_train)
    
    clf.fit(scaler.transform(X_train), y_train)
    y_test = clf.predict(scaler.transform(X_test))
    
    addImputedValuesToDF(X_nb, y_test, list(X_test.index.values), column_name)

CPU times: user 13.4 s, sys: 4 ms, total: 13.4 s
Wall time: 13.4 s


In [43]:
#для X_nb соберем y_nb
X_nb = X_nb.dropna()
y_nb = fitYtoX(X_nb, Y)

In [45]:
%%time
from rep.estimators.pybrain import PyBrainRegressor

for column_name in devided_features['regr']:
    current_X_columns = copy(list(X_pybrain.columns))
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_pybrain, current_X_columns, column_name)
    
    rgr = PyBrainRegressor(layers=[10], epochs=10, verbose=True, max_epochs=30)
    rgr.fit(X_train, y_train)
    y_test = rgr.predict(X_test)
    
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_pybrain.set_value(index, column_name, y_test[counter])
        counter += 1

Total error:  110.320670325
Total error:  88.5754404951
Total error:  88.7237514159
Total error:  88.8077870056
Total error:  87.2379428317
Total error:  88.0693606476
Total error:  85.9416885983
Total error:  87.0316186863
Total error:  87.4965161327
Total error:  86.0934201281
CPU times: user 11.1 s, sys: 16 ms, total: 11.1 s
Wall time: 11.1 s


In [46]:
#для Y_pybrain соберем y_pybrain
X_pybrain = X_pybrain.dropna()
y_pybrain = fitYtoX(X_pybrain, Y)

In [39]:
['ls', 'lad', 'huber', 'quantile']

['ls', 'lad', 'huber', 'quantile']

In [29]:
def fromProbaToLabel(y):
    for index in range(len(y)):
        if y[index] < 0.5:
            y[index] = 0
        else:
            y[index] = 1

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_mean, Y, test_size=0.33, random_state=2289)

In [31]:
xgb_X_train = X_train.as_matrix()
xgb_y_train = np.asarray(y_train)

dtrain = xgb.DMatrix(xgb_X_train, label=xgb_y_train)

xgb_X_test = X_test.as_matrix()

dtest = xgb.DMatrix(xgb_X_test)

In [35]:
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic'}
param['nthread'] = 4
param['eval_metric'] = 'error'

num_round = 10
bst = xgb.train(param.items(), dtrain, num_round)

In [36]:
y_pred = bst.predict(dtest)

In [38]:
fromProbaToLabel(y_pred)
accuracy_score(y_test, y_pred)

0.80503144654088055

In [108]:
df = xgb.cv(param.items(), dtrain, num_round, nfold=5, metrics={'error'})
df

Unnamed: 0,test-error-mean,test-error-std,train-error-mean,train-error-std
0,0.384375,0.10203,0.367578,0.128185
1,0.201563,0.01875,0.15,0.016453
2,0.19375,0.027688,0.136328,0.010438
3,0.182813,0.030698,0.128516,0.007851
4,0.179688,0.033146,0.126172,0.005468
5,0.18125,0.031792,0.122266,0.005327
6,0.178125,0.037435,0.118359,0.004385
7,0.175,0.037824,0.117188,0.006652
8,0.175,0.037824,0.116406,0.007967
9,0.176563,0.034799,0.114453,0.006371


In [118]:
def get_dmatrix(X, y):
    #xgb_X = X.as_matrix()
    xgb_y = np.asarray(y)

    return xgb.DMatrix(X, label=xgb_y)

In [None]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

for column_name in devided_features['class']:
    current_X_columns = list(X.columns)
    current_X_columns.remove(column_name)

    current_X, X_train, y_train, _ = get_X_and_y_by_column_name_with_imputs(X,
                                                                            current_X_columns,
                                                                            column_name)
    
    scaler = StandardScaler().fit(current_X)
    label_encoder = LabelEncoder().fit(X[column_name].dropna())
    dtrain = self.get_dmatrix(scaler.transform(X_train), label_encoder.transform(y_train))

    param = {'silent': 1, 'nthread': 4}
    metric = ''
    #how much classes do we classify?
    num_class = y_train.value_counts().shape[0]
    if num_class == 2:
        param['objective'] = 'binary:logistic'
        metric = 'error'
    else:
        metric = 'merror'
        param['objective'] = 'multi:softmax'
        param['num_class'] = num_class

    #tune the best parameters
    best_param = {'1-error': 0}
    epsilon = 0.001
    #these magic numbers used below probably should be changed
    for eta in [0.3 + i*0.1 for i in range(8)]:
        for max_depth in range(2, 11):
            for num_round in range(10, 20):

                param['bst:max_depth'] = max_depth
                param['bst:eta'] = eta
                errors_df = xgb.cv(param, dtrain, num_round, nfold=5, metrics={metric})

                test_mean_error = errors_df.iloc[-1][0]
                if test_mean_error > best_param['1-error'] + epsilon:
                    best_param['1-error'] = test_mean_error
                    best_param['max_depth'] = max_depth
                    best_param['eta'] = eta
                    best_param['num_round'] = num_round

    if self.verbose == 1:
        print(best_param)

    param['bst:max_depth'] = best_param['max_depth']
    param['bst:eta'] = best_param['eta']
    self._classifiers[column_name] = xgb.train(param, dtrain, best_param['num_round'])

In [159]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

for column_name in devided_features['class']:
    current_X_columns = list(X.columns)
    current_X_columns.remove(column_name)

    current_X, X_train, y_train, _ = get_X_and_y_by_column_name_with_imputs(X,
                                                                            current_X_columns,
                                                                            column_name)

    X_tr, X_te, y_tr, y_te = train_test_split(X_train, y_train, test_size=0.33, random_state=22)
    scaler = StandardScaler().fit(current_X)
    scaler_
    dtrain = get_dmatrix(scaler.transform(X_tr), MinMaxScaler().fit_transform(y_tr))

    param = {'silent': 1, 'nthread': 4}
    param['objective'] = 'reg:logistic'    
    param['bst:max_depth'] = max_depth
    param['bst:eta'] = eta

    bst = xgb.train(param, dtrain, num_round)
    y_pred = bst.predict(xgb.DMatrix(scaler.transform(X_te)))
    
    print y_pred
    
    print(sqrt(MSE(y_test, y_pred)))
    
    #errors_df = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'rmse'})
    #print(errors_df)

    """
    #tune the best parameters
    best_param = {'1-error': 0}
    epsilon = 0.001
    #these magic numbers used below probably should be changed
    for eta in [0.3 + i*0.1 for i in range(8)]:
        for max_depth in range(2, 11):
            for num_round in range(10, 20):

                param['bst:max_depth'] = max_depth
                param['bst:eta'] = eta
                
                bst = xgb.train(param, dtrain, num_round)
                
                errors_df = xgb.cv(param, dtrain, num_round, nfold=5, metrics={'rmse'})

                test_mean_error = errors_df.iloc[-1][0]
                if test_mean_error > best_param['1-error'] + epsilon:
                    best_param['1-error'] = test_mean_error
                    best_param['max_depth'] = max_depth
                    best_param['eta'] = eta
                    best_param['num_round'] = num_round

    print(best_param)

    """
    #param['bst:max_depth'] = best_param['max_depth']
    #param['bst:eta'] = best_param['eta']
    #self._clr = xgb.train(param, dtrain, best_param['num_round'])

   test-rmse-mean  test-rmse-std  train-rmse-mean  train-rmse-std
0        0.174738       0.005455         0.170775        0.001456
1        0.170062       0.004276         0.162780        0.001199
2        0.167574       0.004064         0.158405        0.001146
3        0.166828       0.003821         0.155800        0.001132
4        0.166427       0.003254         0.153928        0.001331
5        0.166957       0.003133         0.152782        0.001320
6        0.167592       0.003025         0.151966        0.001307
7        0.167807       0.003064         0.151350        0.001278
8        0.168344       0.003188         0.150736        0.001412
9        0.168852       0.003127         0.150439        0.001437




In [None]:
13.6074833102

###Применяем knn к обработанным данным

In [38]:
%%time
results = {}

neighbors(X_deleted, y_deleted, results, 'deleted', 10, 70)
neighbors(X_random, Y, results, 'random', 10, 70)
neighbors(X_mean, Y, results, 'mean', 10, 70)
neighbors(X_median, Y, results, 'median', 10, 70)
neighbors(X_mode, Y, results, 'mode', 10, 70)
neighbors(X_class_mode, Y, results, 'class_mode', 10, 70)
neighbors(X_class_mean, Y, results, 'class_mean', 10, 70)
neighbors(X_knn, y_knn, results, 'knn_for_miss_val', 10, 70)
neighbors(X_knn1, Y, results, 'knn_with_imputs', 10, 70)


import operator
results_sorted_by_maxAcc = sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)
for item in results_sorted_by_maxAcc:
    print item[1], item[0]

(0.84182318652849752, 25) class_mode
(0.83766191709844562, 46) class_mean
(0.83350064766839382, 20) knn_with_imputs
(0.83248167411070528, 18) deleted
(0.83246977547495682, 20) mode
(0.83040263385146817, 25) mean
(0.83039723661485321, 33) median
(0.82935556994818638, 52) random
(0.82608303956130036, 13) knn_for_miss_val
CPU times: user 27.4 s, sys: 24 ms, total: 27.4 s
Wall time: 27.4 s


In [47]:
%%time
results = {}

neighbors(X_deleted, y_deleted, results, 'deleted', 10, 70)
neighbors(X_random, Y, results, 'random', 10, 70)
neighbors(X_mean, Y, results, 'mean', 10, 70)
neighbors(X_median, Y, results, 'median', 10, 70)
neighbors(X_mode, Y, results, 'mode', 10, 70)
neighbors(X_class_mode, Y, results, 'class_mode', 10, 70)
neighbors(X_class_mean, Y, results, 'class_mean', 10, 70)
neighbors(X_class_median, Y, results, 'class_median', 10, 70)
neighbors(X_svm, y_svm, results, 'svm', 10, 70)
neighbors(X_lr, y_lr, results, 'lr', 10, 70)
neighbors(X_nb, y_nb, results, 'nb', 10, 70)
neighbors(X_knn, y_knn, results, 'knn_for_miss_val', 10, 70)
neighbors(X_closest_fit, y_closest_fit, results, 'closest_fit', 10, 70)
neighbors(X_pybrain, y_pybrain, results, 'pybrain', 10, 70)

import operator
results_sorted_by_maxAcc = sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)
for item in results_sorted_by_maxAcc:
    print item[1], item[0]

(0.84286485319516413, 25) class_median
(0.84182318652849752, 25) class_mode
(0.83766191709844562, 46) class_mean
(0.83248167411070528, 18) deleted
(0.83246977547495682, 20) mode
(0.83040263385146817, 25) mean
(0.83039723661485321, 33) median
(0.82935556994818638, 52) random
(0.82825695260477872, 54) closest_fit
(0.82608891500195847, 10) nb
(0.82608303956130036, 13) knn_for_miss_val
(0.82490163442725117, 19) pybrain
(0.82286892081422225, 13) svm
(0.8228338577006179, 18) lr
CPU times: user 42.3 s, sys: 7.98 ms, total: 42.3 s
Wall time: 42.3 s


In [36]:
results = {}
neighbors(X_lr, y_lr, results, 'lr', 10, 70)
print results['lr']

(0.82595377851544838, 21)


###Применим PyBrain к обработанным данным

In [48]:
%%time
results = {}

pybrain_classification(X_deleted, y_deleted, results, 'deleted')
pybrain_classification(X_random, Y, results, 'random')
pybrain_classification(X_mean, Y, results, 'mean')
pybrain_classification(X_median, Y, results, 'median')
pybrain_classification(X_mode, Y, results, 'mode')
pybrain_classification(X_class_mode, Y, results, 'class_mode')
pybrain_classification(X_class_mean, Y, results, 'class_mean')
pybrain_classification(X_class_median, Y, results, 'class_median')
pybrain_classification(X_knn, y_knn, results, 'knn_for_miss_val')
pybrain_classification(X_closest_fit, y_closest_fit, results, 'closest_fit')
pybrain_classification(X_pybrain, y_pybrain, results, 'pybrain')
pybrain_classification(X_svm, y_svm, results, 'svm')
pybrain_classification(X_lr, y_lr, results, 'lr')
pybrain_classification(X_nb, y_nb, results, 'nb')

import operator
results_sorted_by_maxAcc = sorted(results.iteritems(), key=operator.itemgetter(1), reverse=True)
for item in results_sorted_by_maxAcc:
    print item[1], item[0]

0.835743022125 nb
0.835600172712 class_mean
0.831428108808 random
0.831417314335 class_median
0.831406519862 class_mode
0.831260872741 pybrain
0.830386442142 mode
0.830057344238 deleted
0.826214378238 mean
0.824984458512 knn_for_miss_val
0.822804733204 closest_fit
0.822798794587 svm
0.822085492228 median
0.819543989993 lr
CPU times: user 7min 35s, sys: 72 ms, total: 7min 35s
Wall time: 7min 35s


In [37]:
results = {}
pybrain_classification(X_lr, y_lr, results, 'lr')
print results['lr']

0.820502247979


###Random Forest

In [50]:
%%time
results = {}

random_forest_classification(X_deleted, y_deleted, results, 'deleted', 50, 80, 3)
random_forest_classification(X_random, Y, results, 'random', 50, 80, 3)
random_forest_classification(X_mean, Y, results, 'mean', 50, 80, 3)
random_forest_classification(X_median, Y, results, 'median', 50, 80, 3)
random_forest_classification(X_mode, Y, results, 'mode', 50, 80, 3)
random_forest_classification(X_class_mode, Y, results, 'class_mode', 50, 80, 3)
random_forest_classification(X_class_mean, Y, results, 'class_mean', 50, 80, 3)
random_forest_classification(X_class_median, Y, results, 'class_median', 50, 80, 3)
random_forest_classification(X_knn, y_knn, results, 'knn_for_miss_val', 50, 80, 3)
random_forest_classification(X_closest_fit, y_closest_fit, results, 'closest_fit', 50, 80, 3)
random_forest_classification(X_svm, y_svm, results, 'svm', 50, 80, 3)
random_forest_classification(X_lr, y_lr, results, 'lr', 50, 80, 3)
random_forest_classification(X_nb, y_nb, results, 'nb', 50, 80, 3)
random_forest_classification(X_pybrain, y_pybrain, results, 'pybrain', 50, 80, 3)
    
sort_and_print_results(results)

(0.81788104490500868, 53) class_mean
(0.81687715889464596, 53) class_median
(0.81167422279792745, 50) class_mode
(0.80554218313685344, 50) closest_fit
(0.80541342832469776, 56) median
(0.8023281275665568, 56) knn_for_miss_val
(0.8023281275665568, 50) nb
(0.80126442010032473, 77) lr
(0.80125755613126093, 50) mean
(0.80023747841105364, 53) mode
(0.8001775267553668, 50) svm
(0.79919041450777206, 50) random
(0.79694977364806585, 53) pybrain
(0.78421500919300491, 56) deleted
CPU times: user 2min 20s, sys: 276 ms, total: 2min 20s
Wall time: 2min 19s


In [39]:
results = {}
random_forest_classification(X_lr, y_lr, results, 'lr', 50, 80, 3)
print results['lr']

(0.80016995432555205, 77)


In [19]:
from numpy import argsort

most_important_features = argsort(rf.feature_importances_)[::-1]

index = 0
while index < len(list(X_deleted.columns)) and clf.feature_importances_[most_important_features[index]] > 0.01:
    print index + 1, X_deleted.columns.values[most_important_features[index]], clf.feature_importances_[most_important_features[index]]
    index += 1

1 BI-RADS 0.69723965102
2 Age 0.183219378639
3 Margin 0.0444547075758
4 Shape 0.0550552004878
5 Density 0.0200310622776


In [61]:
%%time
results = {}

SVMClassification(X_deleted, y_deleted, results, 'deleted')
SVMClassification(X_random, Y, results, 'random')
SVMClassification(X_mean, Y, results, 'mean')
SVMClassification(X_median, Y, results, 'median')
SVMClassification(X_mode, Y, results, 'mode')
SVMClassification(X_class_mode, Y, results, 'class_mode')
SVMClassification(X_class_mean, Y, results, 'class_mean')
SVMClassification(X_class_median, Y, results, 'class_median')
SVMClassification(X_knn, y_knn, results, 'knn_for_miss_val')
SVMClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
SVMClassification(X_pybrain, y_pybrain, results, 'pybrain')
SVMClassification(X_svm, y_svm, results, 'svm')
SVMClassification(X_lr, y_lr, results, 'lr')
SVMClassification(X_nb, y_nb, results, 'nb')
    
sort_and_print_results(results)

(0.83871437823834183, 'LinearSVC') class_mode
(0.83767810880829019, 'LinearSVC') class_median
(0.83455310880829026, 'SVC') class_mean
(0.83358060725522143, 'SVC') knn_for_miss_val
(0.83034905171651319, 'SVC') nb
(0.83034317627585508, 'SVC') closest_fit
(0.83020272989912469, 'SVC') svm
(0.83001370752470949, 'SVC') deleted
(0.82936096718480135, 'SVC') median
(0.82936096718480135, 'SVC') mode
(0.8293555699481866, 'SVC') mean
(0.8282043263459814, 'SVC') lr
(0.82804644216408929, 'SVC') pybrain
(0.82623596718480141, 'LinearSVC') random
CPU times: user 8.94 s, sys: 12 ms, total: 8.95 s
Wall time: 8.95 s


In [40]:
results = {}
SVMClassification(X_lr, y_lr, results, 'lr')
print results['lr']

(0.82805219226471605, 'SVC')


In [62]:
%%time
results = {}

logisticRegClassification(X_deleted, y_deleted, results, 'deleted')
logisticRegClassification(X_random, Y, results, 'random')
logisticRegClassification(X_mean, Y, results, 'mean')
logisticRegClassification(X_median, Y, results, 'median')
logisticRegClassification(X_mode, Y, results, 'mode')
logisticRegClassification(X_class_mode, Y, results, 'class_mode')
logisticRegClassification(X_class_mean, Y, results, 'class_mean')
logisticRegClassification(X_class_median, Y, results, 'class_median')
logisticRegClassification(X_knn, y_knn, results, 'knn_for_miss_val')
logisticRegClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
logisticRegClassification(X_pybrain, y_pybrain, results, 'pybrain')
logisticRegClassification(X_lr, y_lr, results, 'lr')
logisticRegClassification(X_nb, y_nb, results, 'nb')
logisticRegClassification(X_svm, y_svm, results, 'svm')
    
sort_and_print_results(results)

(0.83766731433506048, 'sag') class_mode
(0.83766731433506048, 'sag') class_median
(0.83246977547495682, 'sag') class_mean
(0.82726144214162356, 'sag') mean
(0.82606566594645137, 'sag') knn_for_miss_val
(0.82518890328151995, 'sag') median
(0.8249845848653703, 'sag') closest_fit
(0.82414723661485323, 'sag') random
(0.8228282349670849, 'liblinear') lr
(0.82282242270320816, 'sag') nb
(0.82163532861825084, 'sag') pybrain
(0.82057156000230003, 'sag') svm
(0.81999136442141629, 'sag') mode
(0.81681128433166528, 'sag') deleted
CPU times: user 2.31 s, sys: 5 µs, total: 2.31 s
Wall time: 2.31 s


In [41]:
results = {}
logisticRegClassification(X_lr, y_lr, results, 'lr')
print results['lr']

(0.82056005980104652, 'sag')


In [63]:
%%time
results = {}

NBClassification(X_deleted, y_deleted, results, 'deleted')
NBClassification(X_random, Y, results, 'random')
NBClassification(X_mean, Y, results, 'mean')
NBClassification(X_median, Y, results, 'median')
NBClassification(X_mode, Y, results, 'mode')
NBClassification(X_class_mode, Y, results, 'class_mode')
NBClassification(X_class_mean, Y, results, 'class_mean')
NBClassification(X_class_median, Y, results, 'class_median')
NBClassification(X_knn, y_knn, results, 'knn_for_miss_val')
NBClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
NBClassification(X_pybrain, y_pybrain, results, 'pybrain')
NBClassification(X_svm, y_svm, results, 'svm')
NBClassification(X_nb, y_nb, results, 'nb')
NBClassification(X_lr, y_lr, results, 'lr')
    
sort_and_print_results(results)

0.832464378238 class_mode
0.831422711572 class_median
0.826225172712 class_mean
0.816866364421 mean
0.815819300518 random
0.814783031088 median
0.814214396725 knn_for_miss_val
0.814148697602 svm
0.812715889465 mode
0.812058046827 closest_fit
0.81091714105 pybrain
0.810823347828 deleted
0.809895884664 lr
0.809895884664 nb
CPU times: user 241 ms, sys: 4 ms, total: 245 ms
Wall time: 245 ms


In [42]:
results = {}
NBClassification(X_lr, y_lr, results, 'lr')
print results['lr']

0.811992409867


###Датасет маркетинг

In [45]:
data = get_data('marketing.dat')
data[data.columns[:10]][:10]

Unnamed: 0,Sex,MaritalStatus,Age,Education,Occupation,YearsInSf,DualIncome,HouseholdMembers,Under18,HouseholdStatus
0,2,1,5,4,5,5,3,3,0,1
1,1,1,5,5,5,5,3,5,2,1
2,2,1,3,5,1,5,2,3,1,2
3,2,5,1,2,6,5,1,4,2,3
4,2,5,1,2,6,3,1,4,2,3
5,1,1,6,4,8,5,3,2,0,1
6,1,5,2,3,9,4,1,3,1,2
7,1,3,3,4,3,5,1,1,0,2
8,1,1,6,3,8,5,3,3,0,2
9,1,1,7,4,8,4,3,2,0,2


In [46]:
#выделям из данных Х и Y
columns = list(data.columns.values)
X_columns = columns[:-1]
y_column = columns[-1]
X = data[X_columns]
Y = data[y_column]

In [47]:
data.shape

(8993, 14)

In [48]:
#удалим объекты с пропущенными значениями

data_deleted = data.dropna()
X_deleted = data_deleted[columns[:-1]]
y_deleted = data_deleted[columns[-1]]
X_deleted[X_deleted.columns[:10]][:5]

Unnamed: 0,Sex,MaritalStatus,Age,Education,Occupation,YearsInSf,DualIncome,HouseholdMembers,Under18,HouseholdStatus
1,1,1,5,5,5,5,3,5,2,1
2,2,1,3,5,1,5,2,3,1,2
3,2,5,1,2,6,5,1,4,2,3
4,2,5,1,2,6,3,1,4,2,3
5,1,1,6,4,8,5,3,2,0,1


In [50]:
X_random, X_mean, X_median, X_mode, X_class_mean, X_class_median, X_class_mode = imputMissingValues(X, Y)

In [55]:
devided_features = devide_features_to_classifiable_and_regressiable(X, 10)
devided_features

{'class': ['Sex',
  'MaritalStatus',
  'Age',
  'Education',
  'Occupation',
  'YearsInSf',
  'DualIncome',
  'HouseholdMembers',
  'HouseholdStatus',
  'TypeOfHome',
  'EthnicClass',
  'Language'],
 'regr': ['Under18']}

In [29]:
%%time
#метод ближайших соседей
#для начала пойдем самым простым путем - для обучения воспользуемся только полностью заполненными объектами

X_knn = X.copy()

for column_name in devided_featres['class']:
    current_X_columns = copy(X_columns)
    current_X_columns.remove(column_name)
    
    #обучение
    current_y = X_deleted[column_name]
    current_X = X_deleted[current_X_columns]
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005

    for neighbors_num in xrange(3, 20):
        knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = neighbors_num)
        #knn = KNeighborsClassifier(n_neighbors=neighbors_num)
        X_scaled = StandardScaler().fit_transform(current_X)
        mean_acc_score = cross_val_score(knn, X_scaled, current_y, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num

    print max_acc, best_neigh_num
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
            
    #применение knn
    if len(X_test.index) > 0:
        knn_for_missing_values = KNeighborsClassifier(metric = 'manhattan', n_neighbors=best_neigh_num)
        #knn_for_missing_values = KNeighborsClassifier(n_neighbors=best_neigh_num)
        scaler = StandardScaler().fit(current_X)
        knn_for_missing_values.fit(scaler.transform(X_train), y_train)
        y_test = knn_for_missing_values.predict(scaler.transform(X_test))
        X_test_indices = list(X_test.index.values)
        counter = 0
        for index in X_test_indices:
            X_knn.set_value(index, column_name, y_test[counter])
            counter += 1

0.593368092266 7
0.812099277237 12
0.520349121969 18
0.416078074667 17
0.524451596758 19
0.640485031123 17
0.868236122523 18
0.574761565253 9
0.80540714677 8
0.736900200179 19
0.709714643812 18
0.914922003089 8
CPU times: user 10min 58s, sys: 236 ms, total: 10min 58s
Wall time: 10min 58s


In [30]:
#для X_knn соберем y_knn
X_knn = X_knn.dropna()
y_knn = pd.Series()
for index in list(X_knn.index.values):
    y_knn.set_value(index, Y.loc[index])

In [61]:
%%time
#svm

X_svm = X.copy()

for column_name in devided_features['class']:
    print column_name
    current_X_columns = list(X_svm.columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_svm, current_X_columns, column_name)
    
    clf = svm.SVC()
    
    print X_test[:10]
    
    scaler = StandardScaler().fit(X_train)
    
    clf.fit(scaler.transform(X_train), y_train)
    y_test = clf.predict(scaler.transform(X_test))
    
    addImputedValuesToDF(X_svm, y_test, list(X_test.index.values), column_name)

CPU times: user 856 µs, sys: 0 ns, total: 856 µs
Wall time: 818 µs


In [None]:
%%time
#svm
for column_name in devided_features['regr']:
    current_X_columns = list(X_svm.columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_svm, current_X_columns, column_name)
    
    clf = svm.SVR()
    
    scaler = StandardScaler().fit(X_train)
    
    clf.fit(scaler.transform(X_train), y_train)
    y_test = clf.predict(scaler.transform(X_test))
    
    addImputedValuesToDF(X_svm, y_test, list(X_test.index.values), column_name)

In [46]:
%%time
#closest_fit
X_closest_fit = X.copy()

max_min_differences = get_max_min_differences_array_from_data(X_deleted)

for column_name in devided_featres['class']:
    current_X_columns = copy(devided_featres['class'])
    current_X_columns.remove(column_name)
    
    #обучение
    current_y = X_deleted[column_name]
    current_X = X_deleted[current_X_columns]
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005
    max_min_differences = get_max_min_differences_array_from_data(X_deleted)

    for neighbors_num in xrange(9, 10):
        knn = KNeighborsClassifier(n_neighbors=neighbors_num,
                                   metric=closest_fit_metric,
                                   metric_params={'max_min_differences':max_min_differences})
        X_scaled = StandardScaler().fit_transform(current_X)
        mean_acc_score = cross_val_score(knn, X_scaled, current_y, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num

    print max_acc, best_neigh_num
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
            
    max_min_differences = get_max_min_differences_array_from_data(current_X)
            
    #применение knn
    if len(X_test.index) > 0:
        knn_for_missing_values = KNeighborsClassifier(n_neighbors=best_neigh_num,
                                                      metric=closest_fit_metric,
                                                      metric_params={'max_min_differences':max_min_differences})
        scaler = StandardScaler().fit(current_X)
        knn_for_missing_values.fit(scaler.transform(X_train), y_train)
        y_test = knn_for_missing_values.predict(scaler.transform(X_test))
        X_test_indices = list(X_test.index.values)
        counter = 0
        for index in X_test_indices:
            X_closest_fit.set_value(index, column_name, y_test[counter])
            counter += 1

0.583624858825 9
0.803221980195 9
0.478034272526 9
0.389611846279 9
0.508308306696 9
0.616198852899 9
0.86081920705 9
0.438622308272 9
0.788100379288 9
0.697196662332 9
0.704047196427 9
0.91215540397 9
CPU times: user 1h 46min 37s, sys: 2.59 s, total: 1h 46min 40s
Wall time: 1h 46min 33s


In [47]:
#для X_closest_fit соберем y_closest_fit
X_closest_fit = X_closest_fit.dropna()
y_closest_fit = pd.Series()
for index in list(X_closest_fit.index.values):
    y_closest_fit.set_value(index, Y.loc[index])

In [85]:
%%time
from rep.estimators.pybrain import PyBrainClassifier

X_pybrain = X.copy()

for column_name in devided_featres['class']:
    current_X_columns = copy(X_columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
    fitted_y, values_map = fit_y(y_train)
    
    print 'i'
    
    pb_for_missing_values = PyBrainClassifier(layers=[10],
                                              epochs=10,
                                              verbose=False)
    pb_for_missing_values.fit(X_train, fitted_y)
    y_test_fitted = pb_for_missing_values.predict(X_test)
    
    y_test = decrypt_y(pd.Series(y_test_fitted), values_map)
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_pybrain.set_value(index, column_name, y_test[counter])
        counter += 1

CPU times: user 15 µs, sys: 0 ns, total: 15 µs
Wall time: 21 µs


In [31]:
%%time
results = {}

neighbors(X_deleted, y_deleted, results, 'deleted', 10, 20)
neighbors(X_random, Y, results, 'random', 10, 20)
neighbors(X_mean, Y, results, 'mean', 10, 20)
neighbors(X_median, Y, results, 'median', 10, 20)
neighbors(X_mode, Y, results, 'mode', 10, 20)
neighbors(X_class_mode, Y, results, 'class_mode', 10, 20)
neighbors(X_class_mean, Y, results, 'class_mean', 10, 20)
neighbors(X_class_median, Y, results, 'class_median', 10, 20)
neighbors(X_knn, y_knn, results, 'knn_for_miss_val', 10, 20)
#neighbors(X_closest_fit, y_closest_fit, results, 'closest_fit')
#neighbors(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

(0.32315386165730742, 19) random
(0.32292632192448278, 17) class_mean
(0.32081489463761903, 15) class_median
(0.32011299048393993, 15) deleted
(0.31992538929589659, 14) class_mode
(0.31970168194882664, 15) mean
(0.31958976392611133, 16) mode
(0.318037265780517, 15) knn_for_miss_val
(0.31702672634669782, 16) median
CPU times: user 3min 44s, sys: 52.1 ms, total: 3min 44s
Wall time: 3min 44s


In [57]:
%%time
results = {}

pybrain_classification(X_deleted, y_deleted, results, 'deleted')
pybrain_classification(X_random, Y, results, 'random')
pybrain_classification(X_mean, Y, results, 'mean')
pybrain_classification(X_median, Y, results, 'median')
pybrain_classification(X_mode, Y, results, 'mode')
pybrain_classification(X_class_mode, Y, results, 'class_mode')
pybrain_classification(X_class_mean, Y, results, 'class_mean')
pybrain_classification(X_class_median, Y, results, 'class_median')
pybrain_classification(X_knn, y_knn, results, 'knn_for_miss_val')
#pybrain_classification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#pybrain_classification(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

0.335051473322 class_mode
0.332712386697 median
0.329375213613 class_median
0.328568720585 knn_for_miss_val
0.327817017688 deleted
0.32759595382 mode
0.32737051779 random
0.32292669009 mean
0.320923468048 class_mean
CPU times: user 1h 3min 44s, sys: 22 s, total: 1h 4min 6s
Wall time: 1h 4min 4s


In [93]:
%%time
results = {}

random_forest_classification(X_deleted, y_deleted, results, 'deleted', 50, 80, 3)
random_forest_classification(X_random, Y, results, 'random', 50, 80, 3)
random_forest_classification(X_mean, Y, results, 'mean', 50, 80, 3)
random_forest_classification(X_median, Y, results, 'median', 50, 80, 3)
random_forest_classification(X_mode, Y, results, 'mode', 50, 80, 3)
random_forest_classification(X_class_mode, Y, results, 'class_mode', 50, 80, 3)
random_forest_classification(X_class_mean, Y, results, 'class_mean', 50, 80, 3)
random_forest_classification(X_class_median, Y, results, 'class_median', 50, 80, 3)
random_forest_classification(X_knn, y_knn, results, 'knn_for_miss_val', 50, 80, 3)
#random_forest_classification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#random_forest_classification(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

(0.35393988659190201, 50) class_mean
(0.3161483502650132, 50) class_median
(0.31458475137807412, 71) mean
(0.31291839587113202, 50) class_mode
(0.31247431367499329, 53) median
(0.30979500828563067, 56) deleted
(0.30892161497327969, 50) random
(0.30769412481966396, 50) mode
(0.30367541867115727, 50) knn_for_miss_val
CPU times: user 6min 25s, sys: 2.68 s, total: 6min 28s
Wall time: 6min 28s


In [28]:
%%time
results = {}

SVMMultiClassification(X_deleted, y_deleted, results, 'deleted')

CPU times: user 1min 28s, sys: 76 ms, total: 1min 28s
Wall time: 1min 28s


In [33]:
%%time
results = {}

SVMMultiClassification(X_deleted, y_deleted, results, 'deleted')
SVMMultiClassification(X_random, Y, results, 'random')
SVMMultiClassification(X_mean, Y, results, 'mean')
SVMMultiClassification(X_median, Y, results, 'median')
SVMMultiClassification(X_mode, Y, results, 'mode')
SVMMultiClassification(X_class_mode, Y, results, 'class_mode')
SVMMultiClassification(X_class_mean, Y, results, 'class_mean')
SVMMultiClassification(X_class_median, Y, results, 'class_median')
SVMMultiClassification(X_knn, y_knn, results, 'knn_for_miss_val')
#SVMMultiClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#SVMMultiClassification(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

(0.3487205631652518, 'SVC') class_median
(0.34683297219219511, 'SVC') class_mode
(0.34282777008849835, 'SVC') median
(0.34082727537185054, 'SVC') mode
(0.33982572745227702, 'SVC') class_mean
(0.33974233309434421, 'SVC') deleted
(0.33965133151464255, 'SVC') knn_for_miss_val
(0.33926763811037264, 'SVC') mean
(0.33571158130496731, 'SVC') random
CPU times: user 19min 29s, sys: 100 ms, total: 19min 29s
Wall time: 19min 29s


In [39]:
%%time
results = {}

logisticRegClassification(X_deleted, y_deleted, results, 'deleted')
logisticRegClassification(X_random, Y, results, 'random')
logisticRegClassification(X_mean, Y, results, 'mean')
logisticRegClassification(X_median, Y, results, 'median')
logisticRegClassification(X_mode, Y, results, 'mode')
logisticRegClassification(X_class_mode, Y, results, 'class_mode')
logisticRegClassification(X_class_mean, Y, results, 'class_mean')
logisticRegClassification(X_class_median, Y, results, 'class_median')
logisticRegClassification(X_knn, y_knn, results, 'knn_for_miss_val')
#logisticRegClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#logisticRegClassification(X_pybrain, y_pybrain, results, 'pybrain')
    
sort_and_print_results(results)

(0.32325748303574403, 'sag') class_median
(0.32225853183349817, 'lbfgs') class_mode
(0.32114408181917264, 'lbfgs') class_mean
(0.31951772327233285, 'liblinear') deleted
(0.31780913866913701, 'liblinear') mean
(0.31769511913595094, 'newton-cg') median
(0.31733584325690184, 'lbfgs') knn_for_miss_val
(0.31691950546357772, 'sag') mode
(0.31547833657800639, 'sag') random
CPU times: user 1min 12s, sys: 7.98 ms, total: 1min 12s
Wall time: 1min 12s


In [93]:
%%time
results = {}

NBClassification(X_deleted, y_deleted, results, 'deleted')
NBClassification(X_random, Y, results, 'random')
NBClassification(X_mean, Y, results, 'mean')
NBClassification(X_median, Y, results, 'median')
NBClassification(X_mode, Y, results, 'mode')
NBClassification(X_class_mode, Y, results, 'class_mode')
NBClassification(X_class_mean, Y, results, 'class_mean')
NBClassification(X_class_median, Y, results, 'class_median')
#NBClassification(X_knn, y_knn, results, 'knn_for_miss_val')
#NBClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#NBClassification(X_pybrain, y_pybrain, results, 'pybrain')
    
sort_and_print_results(results)

0.308809936995 class_mode
0.308254073728 class_median
0.307753440903 deleted
0.307030302967 class_mean
0.305027827711 median
0.304805357132 random
0.304360045601 mean
0.304359302934 mode
CPU times: user 694 ms, sys: 0 ns, total: 694 ms
Wall time: 694 ms


In [3]:
def getNumAndCatFeatures(X):
    num_columns = []
    cat_columns = []
    
    for column in X.columns:
        if type(X[column].dropna().iloc[0]) == str:
            cat_columns.append(column)
        else:
            num_columns.append(column)
            
    return (num_columns, cat_columns)

def getMissingDataRate(df):
    
    N = df.shape[0] * df.shape[1]
    naValues = 0
    
    for label in df.columns:
        nanValInCol = df[label].isnull()
        if True in nanValInCol.value_counts().index:
            naValues += nanValInCol.value_counts()[True]
        
    return  float(naValues) / N

def setNanValuesToDataframe(df):
    df_copy = df.copy()
    rand_indices = random.sample(range(df_copy.shape[0]), df_copy.shape[0] / 2)
    for i in rand_indices:
        rand_cols = random.sample(df_copy.columns.values, max(1, df_copy.shape[1] / 3))
        for col in rand_cols:
            df_copy.set_value(i,
                              col,
                              None)
    return df_copy

###обработка датасета с кэгла

In [5]:
data = pd.read_csv('data_from_kaggle/kobe/data.csv')

In [3]:
data = data.drop(['shot_id'], axis=1)

In [8]:
getMissingDataRate(data.drop('shot_made_flag', 1))

0.0

###то же для титаника

In [43]:
data = pd.read_csv('data_from_kaggle/titanic/train.csv')

In [20]:
#to use
data = pd.read_csv('/home/tyamana/data_from_kaggle/titanic/train.csv')
data = data.drop('PassengerId', 1)
print(data.shape)
X = data[data.columns.values[1:]]
num_columns, _ = getNumAndCatfeatures(X)
X = data[num_columns]
Y = data[data.columns.values[0]]

0.03310886644219978

###приют для животных

In [35]:
data = pd.read_csv('data_from_kaggle/animal_shelters/train.csv')

In [39]:
data[:1] #пока не наш вариант

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White


###Don't get kicked! set

In [27]:
data = pd.read_csv('data_from_kaggle/dont_get_kicked/training.csv')

In [33]:
#touse
data = pd.read_csv('/home/tyamana/data_from_kaggle/dont_get_kicked/training.csv')
data = data.drop('RefId', axis=1)
data = data.sample(frac=0.09, random_state=410)
print(data.shape)
X = data[data.columns.values[1:]]
num_columns, _ = getNumAndCatfeatures(X)
X = data[num_columns]
Y = data[data.columns.values[0]]
setNanValuesToDataframe(X)

0.10723240115718419

###CASP(REGRESSION)

In [7]:
data = pd.read_csv('data_from_kaggle/chess/primary_training_part1.csv')

In [8]:
data.shape

(624148, 7)

In [69]:
getMissingDataRate(data)

0.0

In [65]:
data = data.drop('PTID', 1)

###bikesharing

In [26]:
data = pd.read_csv('/home/tyamana/data_from_kaggle/bikesharing/train.csv')
num_columns, _ = getNumAndCatFeatures(data)
data = data[num_columns]
data = data.sample(frac=0.1, random_state=510)
print(data.shape)
X = data[data.columns.values[:-1]]
Y = data[data.columns.values[-1]]
#X = setNanValuesToDataframe(X)

print(getMissingDataRate(X))

(1089, 11)
0.0


###scillcraft 

In [2]:
data = pd.read_csv('missing_values/datasets/scillcraft/train.csv', na_values='?')

In [3]:
data[:2]

Unnamed: 0,GameID,LeagueIndex,Age,HoursPerWeek,TotalHours,APM,SelectByHotkeys,AssignToHotkeys,UniqueHotkeys,MinimapAttacks,MinimapRightClicks,NumberOfPACs,GapBetweenPACs,ActionLatency,ActionsInPAC,TotalMapExplored,WorkersMade,UniqueUnitsMade,ComplexUnitsMade,ComplexAbilitiesUsed
0,52,5,27,10,3000,143.718,0.003515,0.00022,7,0.00011,0.000392,0.004849,32.6677,40.8673,4.7508,28,0.001397,6,0,0.0
1,55,5,23,10,5000,129.2322,0.003304,0.000259,4,0.000294,0.000432,0.004307,32.9194,42.3454,4.8434,22,0.001194,5,0,0.000208


In [27]:
data.shape

(3395, 20)

In [26]:
getMissingDataRate(data)

0.002474226804123711

###dowjones

In [9]:
data = pd.read_csv('missing_values/datasets/dowjones/dow_jones_index.data')

In [12]:
num, _ = getNumAndCatFeatures(data)

In [14]:
data = data[num]

In [15]:
Y = data['percent_change_next_weeks_price']
cols = list(data.columns.values)
cols.remove('percent_change_next_weeks_price')
X = data[cols]


(750, 8)

In [16]:
X.shape

(750, 7)

In [21]:
from sklearn.preprocessing import Imputer
X = Imputer().fit_transform(X)

In [23]:
rgr = KNeighborsRegressor(metric='manhattan', n_neighbors=10)

r2 = cross_val_score(rgr, X, Y, cv=5, scoring='r2')
r2.mean()

-0.12809962174541764

In [8]:
data[data.columns.values[10:]][:10]

Unnamed: 0,previous_weeks_volume,next_weeks_open,next_weeks_close,percent_change_next_weeks_price,days_to_next_dividend,percent_return_next_dividend
0,,$16.71,$15.97,-4.42849,26,0.182704
1,239655616.0,$16.19,$15.79,-2.47066,19,0.187852
2,242963398.0,$15.87,$16.13,1.63831,12,0.189994
3,138428495.0,$16.18,$17.14,5.93325,5,0.185989
4,151379173.0,$17.33,$17.37,0.230814,97,0.175029
5,154387761.0,$17.39,$17.28,-0.632547,90,0.172712
6,114691279.0,$16.98,$16.68,-1.76678,83,0.173611
7,80023895.0,$16.81,$16.58,-1.36823,76,0.179856
8,132981863.0,$16.58,$16.03,-3.31725,69,0.180941
9,109493077.0,$15.95,$16.11,1.00313,62,0.187149


In [37]:
getMissingDataRate(data)

0.005

###onlinenews

In [85]:
data = pd.read_csv('missing_values/datasets/onlinenews/train.csv')

In [86]:
num_col, _ = getNumAndCatFeatures(data)

In [87]:
data[num_col].shape

(39644, 60)

In [88]:
data = data[num_col]

In [89]:
data = data.sample(frac=0.1, random_state=282)

In [90]:
X = data[data.columns.values[:-1]]
Y = data[data.columns.values[-1]]

In [91]:
X_scaled = StandardScaler().fit_transform(X)

In [93]:
from sklearn.neighbors import KNeighborsRegressor as KNR

rgr = KNR(n_neighbors = 30, metric='manhattan')

r2 = cross_val_score(rgr, X_scaled, Y, cv=5, scoring='r2')
r2

array([-0.19714356, -0.14031447, -0.28846463, -0.24438042,  0.00309011])

In [58]:
y_pred[:5]

array([ 2832.3,  1496.1,  3252.8,  4344.4,  2059.3])

In [60]:
X_test[:5]

Unnamed: 0,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
15427,434,14,559,0.522388,1,0.680851,21,6,15,0,...,0.391771,0.1,1.0,-0.377381,-0.8,-0.05,0.560606,0.068182,0.060606,0.068182
5442,632,10,311,0.537954,1,0.645833,6,3,1,0,...,0.295195,0.05,0.7,-0.251389,-0.4,-0.155556,0.0,0.0,0.5,0.0
36866,55,10,413,0.612745,1,0.784387,10,1,1,2,...,0.212963,0.033333,0.5,-0.460185,-1.0,-0.1,0.0,0.0,0.5,0.0
25067,241,12,384,0.594086,1,0.722892,15,4,9,1,...,0.415909,0.136364,1.0,-0.491667,-1.0,-0.2,0.357143,0.0,0.142857,0.0
14361,455,11,111,0.693694,1,0.757143,4,1,0,1,...,0.221591,0.136364,0.25,-0.1,-0.1,-0.1,0.0,0.0,0.5,0.0


In [59]:
y_test[:5]

15427      843
5442      3300
36866      759
25067    21700
14361      762
Name:  shares, dtype: int64

###diabets

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('missing_values/datasets/diabets/pima-indians-diabetes.data', 
                    names=(['f{0}'.format(i) for i in range(8)] + ['target']), 
                    na_values=0)

In [4]:
data.shape

(768, 9)

In [16]:
#touse
data = pd.read_csv('/home/tyamana/missing_values/datasets/diabets/pima-indians-diabetes.data',
                    names=(['f{0}'.format(i) for i in range(8)] + ['target']),
                    na_values=0)
print(data.shape)
X = data[data.columns.values[:-1]]
Y = data[data.columns.values[-1]]
Y = Y.fillna(0)

###spam

In [4]:
data = pd.read_csv('missing_values/datasets/spambase/spambase.data', names=['f{0}'.format(i) for i in range(58)])
data = data.sample(frac=0.05, random_state=282)
data.shape

(230, 58)

In [9]:
#to use
data = pd.read_csv('/home/tyamana/missing_values/datasets/spambase/spambase.data', names=['f{0}'.format(i) for i in range(58)])
print(data.shape)
X = data[data.columns.values[:-1]]
Y = data[data.columns.values[-1]]
setNanValuesToDataframe(X)

0.0

###defaultCreditCard

In [14]:
def objectToGoodType(df):
    new_df = pd.DataFrame()
    for column in df.columns:
        new_df[column] = pd.Series([df[column].loc[i] for i in df.index], index=df.index)

    return new_df

In [11]:
raw_data = pd.read_excel('missing_values/datasets/defaultCreditCard/train.xls')
data = copy(raw_data[1:])
data.columns = raw_data.loc['ID']
data = data.sample(frac=0.012, random_state=410)

In [15]:
X = data[data.columns.values[:-1]]
X = objectToGoodType(X)
Y = data[data.columns.values[-1]]
Y = pd.Series([Y.loc[i] for i in Y.index], index=Y.index)

In [None]:
%%time

kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    clf = SVC(kernel=kernel)
    mean_acc = cross_val_score(clf, X, Y, cv=5).mean()
    print(mean_acc, kernel)

In [None]:
#touse
raw_data = pd.read_excel('/home/tyamana/missing_values/datasets/defaultCreditCard/train.xls')
data = copy(raw_data[1:])
data.columns = raw_data.loc['ID']
data = data.sample(frac=0.12, random_state=410)
print(data.shape)
X = data[data.columns.values[:-1]]
X = objectToGoodType(X)
Y = data[data.columns.values[-1]]
Y = pd.Series([Y.loc[i] for i in Y.index], index=Y.index)
setNanValuesToDataframe(X)

###eeg

In [6]:
data = pd.read_csv('/home/tyamana/missing_values/datasets/eeg/train.txt', 
                   names=(['f{0}'.format(i) for i in range(14)] + ['target']))

In [7]:
data[:3]

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,target
0,4329.23,4009.23,4289.23,4148.21,4350.26,4586.15,4096.92,4641.03,4222.05,4238.46,4211.28,4280.51,4635.9,4393.85,0
1,4324.62,4004.62,4293.85,4148.72,4342.05,4586.67,4097.44,4638.97,4210.77,4226.67,4207.69,4279.49,4632.82,4384.1,0
2,4327.69,4006.67,4295.38,4156.41,4336.92,4583.59,4096.92,4630.26,4207.69,4222.05,4206.67,4282.05,4628.72,4389.23,0


In [1]:
import pandas as pd

In [2]:
#to use
data = pd.read_csv('/home/tyamana/missing_values/datasets/eeg/train.txt',
                   names=(['f{0}'.format(i) for i in range(14)] + ['target']))
data = data.sample(frac=0.33, random_state=410)
print(data.shape)
X = data[data.columns.values[:-1]]
Y = data[data.columns.values[-1]]
setNanValuesToDataframe(X)

(4943, 15)


NameError: name 'setNanValuesToDataframe' is not defined

###seismic

In [None]:
data = pd.read_csv('/home/tyamana/missing_values/datasets/eeg/train.txt', 
                   names=(['f{0}'.format(i) for i in range(14)] + ['target']))

In [29]:
#to use in pycharm
data = pd.read_csv('/home/tyamana/missing_values/datasets/sismic/train.txt',
                   names=(['f{0}'.format(i) for i in range(18)] + ['target']))
X = data[data.columns.values[:-1]]
num_columns, _ = getNumAndCatfeatures(X)
X = data[num_columns]
Y = data[data.columns.values[-1]]
setNanValuesToDataframe(X)

###banknot

In [26]:
data = pd.read_csv('/home/tyamana/missing_values/datasets/banknot/train.txt',
                   names=(['f{0}'.format(i) for i in range(4)] + ['target']))

In [28]:
#to use
data = pd.read_csv('/home/tyamana/missing_values/datasets/banknot/train.txt',
                   names=(['f{0}'.format(i) for i in range(4)] + ['target']))
X = data[data.columns.values[:-1]]
Y = data[data.columns.values[-1]]
setNanValuesToDataframe(X)

###wilt

In [32]:
data = pd.read_csv('/home/tyamana/missing_values/datasets/wilt/wilt/training.csv')


Unnamed: 0,class,GLCM_pan,Mean_Green,Mean_Red,Mean_NIR,SD_pan
0,w,120.362774,205.5,119.395349,416.581395,20.676318
1,w,124.739583,202.8,115.333333,354.333333,16.707151
2,w,134.691964,199.285714,116.857143,477.857143,22.496712
3,w,127.946309,178.368421,92.368421,278.473684,14.977453
4,w,135.431548,197.0,112.690476,532.952381,17.604193


In [34]:
data['class'].value_counts()

n    4265
w      74
Name: class, dtype: int64

###segment

In [41]:
from sklearn.preprocessing import LabelEncoder
Y = data['target']
Y = LabelEncoder().fit_transform(Y)
pd.Series(Y).value_counts()

6    30
5    30
4    30
3    30
2    30
1    30
0    30
dtype: int64

In [42]:
data = pd.read_csv('/home/tyamana/missing_values/datasets/segment/train.txt',
                   names=(['target'] + ['f{0}'.format(i) for i in range(19)]))

###clave

In [None]:
data = pd.read_csv('/home/tyamana/missing_values/datasets/clave/train.txt',
                   names=(['target'] + ['f{0}'.format(i) for i in range(19)]))

###occ

In [43]:
data = pd.read_csv('/home/tyamana/missing_values/datasets/occ/train.txt')
data[:3]

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1


In [44]:
num, _ = getNumAndCatFeatures(data)
num

['Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio', 'Occupancy']

In [45]:
#touse
data = pd.read_csv('/home/tyamana/missing_values/datasets/occ/train.txt')
print(data.shape)
X = data[data.columns.values[:-1]]
num_columns, _ = getNumAndCatfeatures(X)
X = data[num_columns]
Y = data[data.columns.values[-1]]
setNanValuesToDataframe(X)

0.0

###wave

In [47]:
data = pd.read_csv('/home/tyamana/missing_values/datasets/waves/waveform.data',
                   names=(['f{0}'.format(i) for i in range(21)] + ['target']))
data[:5]

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f12,f13,f14,f15,f16,f17,f18,f19,f20,target
0,-1.23,-1.56,-1.75,-0.28,0.6,2.22,0.85,0.21,-0.2,0.89,...,2.89,7.75,4.59,3.15,5.12,3.32,1.2,0.24,-0.56,2
1,-0.69,2.43,0.61,2.08,2.3,3.25,5.52,4.55,2.97,2.22,...,1.24,1.89,1.88,-1.34,0.83,1.41,1.78,0.6,2.42,1
2,-0.12,-0.94,1.29,2.59,2.42,3.55,4.94,3.25,1.9,2.07,...,2.5,0.12,1.41,2.78,0.64,0.62,-0.01,-0.79,-0.12,0
3,0.86,0.29,2.19,-0.02,1.13,2.51,2.37,5.45,5.45,4.84,...,2.58,1.4,1.24,1.41,1.07,-1.43,2.84,-1.18,1.12,1
4,1.16,0.37,0.4,-0.59,2.66,1.0,2.69,4.06,5.34,3.53,...,4.3,1.84,1.73,0.21,-0.18,0.13,-0.21,-0.8,-0.68,1


In [None]:
#touse
data = pd.read_csv('/home/tyamana/missing_values/datasets/waves/waveform.data',
                   names=(['f{0}'.format(i) for i in range(21)] + ['target']))
print(data.shape)
X = data[data.columns.values[:-1]]
Y = data[data.columns.values[-1]]
setNanValuesToDataframe(X)

###Датасет с кэгла

In [21]:
data = pd.DataFrame().from_csv('train.csv')

In [22]:
data.shape

(114321, 132)

In [23]:
data = data.sample(frac=0.01, random_state=410)
data.shape

(1143, 132)

In [24]:
#выделям из данных Х и Y
columns = list(data.columns.values)
X_columns = columns[1:]
y_column = columns[0]
X = data[X_columns]
Y = data[y_column]

num_columns, cat_columns = getNumAndCatfeatures(X)

In [25]:
X = data[num_columns]

In [26]:
getMissingDataRate(X)

0.3808195850518685

In [None]:
from sklearn.linear_model import LogisticRegression as lr

def logisticRegClassification(X, y, results, method_name):
    scaled_X = StandardScaler().fit_transform(X)
    
    newton_acc = cross_val_score(lr(solver='newton-cg'), scaled_X, y, cv=5).mean()
    lbfgs_acc = cross_val_score(lr(solver='lbfgs'), scaled_X, y, cv=5).mean()
    liblinear_acc = cross_val_score(lr(solver='liblinear'), scaled_X, y, cv=5).mean()
    sag_acc = cross_val_score(lr(solver='sag'), scaled_X, y, cv=5).mean()
    
    acc_map = {newton_acc: 'newton-cg', lbfgs_acc: 'lbfgs', liblinear_acc: 'liblinear', sag_acc: 'sag'}
    max_acc = max(newton_acc, lbfgs_acc, liblinear_acc, sag_acc)
    
    results[method_name] = (max_acc, acc_map[max_acc])

In [25]:
#удалим объекты с пропущенными значениями

data_deleted = data[num_columns + [y_column]].dropna()
X_deleted = data_deleted[data_deleted.columns[:-1]]
y_deleted = data_deleted[data_deleted.columns[-1]]

In [26]:
%%time
X_random, X_mean, X_median, X_mode, X_class_mean, X_class_median, X_class_mode = imputMissingValues(X, Y)

CPU times: user 36.5 s, sys: 201 ms, total: 36.7 s
Wall time: 37.2 s


In [27]:
devided_features = devide_features_to_classifiable_and_regressiable(X, 10)
#devided_features

In [28]:
%%time
#метод ближайших соседей для заполнения пропусков

X_knn = X.copy()

for column_name in devided_features['class']:
    current_X_columns = copy(list(X_knn.columns))
    current_X_columns.remove(column_name)
    
    #обучение
    current_y = X_deleted[column_name]
    current_X = X_deleted[current_X_columns]
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005
    
    for neighbors_num in xrange(3, 20):
        knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = neighbors_num)
        #knn = KNeighborsClassifier(n_neighbors=neighbors_num)
        X_scaled = StandardScaler().fit_transform(current_X)
        mean_acc_score = cross_val_score(knn, X_scaled, current_y, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num
    
    print max_acc, best_neigh_num
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
            
    #применение knn
    if len(X_test.index) > 0:
        knn_for_missing_values = KNeighborsClassifier(metric = 'manhattan', n_neighbors=best_neigh_num)
        #knn_for_missing_values = KNeighborsClassifier(n_neighbors=best_neigh_num)
        scaler = StandardScaler().fit(current_X)
        knn_for_missing_values.fit(scaler.transform(X_train), y_train)
        y_test = knn_for_missing_values.predict(scaler.transform(X_test))
        X_test_indices = list(X_test.index.values)
        counter = 0
        for index in X_test_indices:
            X_knn.set_value(index, column_name, y_test[counter])
            counter += 1



0.946913093103 3
0.672933653846 6
0.657553225166 6
0.795131533066 7
CPU times: user 21.7 s, sys: 19.7 ms, total: 21.8 s
Wall time: 21.8 s




In [29]:
#для X_knn соберем y_knn
X_knn = X_knn.dropna()
y_knn = pd.Series()
for index in list(X_knn.index.values):
    y_knn.set_value(index, Y.loc[index])

In [30]:
%%time
#closest_fit
X_closest_fit = X.copy()

max_min_differences = get_max_min_differences_array_from_data(X_deleted)

for column_name in devided_features['class']:
    current_X_columns = copy(devided_features['class'])
    current_X_columns.remove(column_name)
    
    #обучение
    current_y = X_deleted[column_name]
    current_X = X_deleted[current_X_columns]
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005
    max_min_differences = get_max_min_differences_array_from_data(X_deleted)

    for neighbors_num in xrange(9, 10):
        knn = KNeighborsClassifier(n_neighbors=neighbors_num,
                                   metric=closest_fit_metric,
                                   metric_params={'max_min_differences':max_min_differences})
        X_scaled = StandardScaler().fit_transform(current_X)
        mean_acc_score = cross_val_score(knn, X_scaled, current_y, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num

    print max_acc, best_neigh_num
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
            
    max_min_differences = get_max_min_differences_array_from_data(current_X)
            
    #применение knn
    if len(X_test.index) > 0:
        knn_for_missing_values = KNeighborsClassifier(n_neighbors=best_neigh_num,
                                                      metric=closest_fit_metric,
                                                      metric_params={'max_min_differences':max_min_differences})
        scaler = StandardScaler().fit(current_X)
        knn_for_missing_values.fit(scaler.transform(X_train), y_train)
        y_test = knn_for_missing_values.predict(scaler.transform(X_test))
        X_test_indices = list(X_test.index.values)
        counter = 0
        for index in X_test_indices:
            X_closest_fit.set_value(index, column_name, y_test[counter])
            counter += 1

0.948487896253 9
0.929785576923 9
0.915816650811 9
0.95153845018 9
CPU times: user 2min 10s, sys: 413 ms, total: 2min 10s
Wall time: 2min 9s


In [31]:
#для X_closest_fit соберем y_closest_fit
X_closest_fit = X_closest_fit.dropna()
y_closest_fit = pd.Series()
for index in list(X_closest_fit.index.values):
    y_closest_fit.set_value(index, Y.loc[index])

In [None]:
#isfinite error
%%time
from rep.estimators.pybrain import PyBrainClassifier

X_pybrain = X.copy()

for column_name in devided_features['class']:
    current_X_columns = copy(list(X_pybrain.columns))
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
    fitted_y, values_map = fit_y(y_train)
    
    print 'i'
    
    pb_for_missing_values = PyBrainClassifier(layers=[10],
                                              epochs=10,
                                              verbose=False)
    pb_for_missing_values.fit(X_train, fitted_y)
    y_test_fitted = pb_for_missing_values.predict(X_test)
    
    y_test = decrypt_y(pd.Series(y_test_fitted), values_map)
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_pybrain.set_value(index, column_name, y_test[counter])
        counter += 1

In [34]:
X_class_mode, Y_class_mode = dropnaAndAddTarget(X_class_mode, Y, y_column)
X_class_mean, Y_class_mean = dropnaAndAddTarget(X_class_mean, Y, y_column)
X_class_median, Y_class_median = dropnaAndAddTarget(X_class_median, Y, y_column)

In [35]:
%%time
results = {}

neighbors(X_deleted, y_deleted, results, 'deleted', 10, 20)
neighbors(X_random, Y, results, 'random', 10, 20)
neighbors(X_mean, Y, results, 'mean', 10, 20)
neighbors(X_median, Y, results, 'median', 10, 20)
#neighbors(X_mode, Y, results, 'mode', 10, 20)
neighbors(X_class_mode, Y_class_mode, results, 'class_mode', 10, 20)
neighbors(X_class_mean, Y_class_mean, results, 'class_mean', 10, 20)
neighbors(X_class_median, Y_class_median, results, 'class_median', 10, 20)
neighbors(X_knn, y_knn, results, 'knn_for_miss_val', 10, 20)
neighbors(X_closest_fit, y_closest_fit, results, 'closest_fit', 10, 20)
#neighbors(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

(0.74541867769861336, 19) median
(0.73842411706121192, 11) random
(0.73409021546725262, 17) closest_fit
(0.73409021546725262, 17) class_mean
(0.73409021546725262, 17) deleted
(0.73409021546725262, 17) knn_for_miss_val
(0.73409021546725262, 17) class_mode
(0.73409021546725262, 17) class_median
(0.73404198268597254, 11) mean
CPU times: user 19.2 s, sys: 7.57 ms, total: 19.2 s
Wall time: 19.2 s


In [140]:
%%time
results = {}

pybrain_classification(X_deleted, y_deleted, results, 'deleted')
pybrain_classification(X_random, Y, results, 'random')
pybrain_classification(X_mean, Y, results, 'mean')
pybrain_classification(X_median, Y, results, 'median')
#pybrain_classification(X_mode, Y, results, 'mode')
pybrain_classification(X_class_mode, Y_class_mode, results, 'class_mode')
pybrain_classification(X_class_mean, Y_class_mean, results, 'class_mean')
pybrain_classification(X_class_median, Y_class_median, results, 'class_median')
pybrain_classification(X_knn, y_knn, results, 'knn_for_miss_val')
pybrain_classification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#pybrain_classification(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

0.74893894124 mean
0.74891212748 random
0.741017390638 median
0.733943722151 closest_fit
0.727742171763 class_median
0.719990233779 class_mode
0.719965818226 class_mean
0.715314655436 deleted
0.705743758774 knn_for_miss_val
CPU times: user 4min 45s, sys: 287 ms, total: 4min 45s
Wall time: 4min 45s


In [37]:
%%time
results = {}

random_forest_classification(X_deleted, y_deleted, results, 'deleted', 50, 80, 3)
random_forest_classification(X_random, Y, results, 'random', 50, 80, 3)
random_forest_classification(X_mean, Y, results, 'mean', 50, 80, 3)
random_forest_classification(X_median, Y, results, 'median', 50, 80, 3)
#random_forest_classification(X_mode, Y, results, 'mode', 50, 80)
random_forest_classification(X_class_mode, Y_class_mode, results, 'class_mode', 50, 80, 3)
random_forest_classification(X_class_mean, Y_class_mean, results, 'class_mean', 50, 80, 3)
random_forest_classification(X_class_median, Y_class_median, results, 'class_median', 50, 80, 3)
random_forest_classification(X_knn, y_knn, results, 'knn_for_miss_val', 50, 80, 3)
random_forest_classification(X_closest_fit, y_closest_fit, results, 'closest_fit', 50, 80, 3)
#random_forest_classification(X_pybrain, y_pybrain, results, 'pybrain', 50, 80)
    
sort_and_print_results(results)

(0.74797045718122446, 71) closest_fit
(0.74654214734786062, 50) class_median
(0.74482085088201189, 59) knn_for_miss_val
(0.7433192943905268, 53) class_mode
(0.74174449124092046, 50) class_mean
(0.74172007568821341, 53) deleted
(0.74017084195204175, 53) mean
(0.73842028652417069, 68) median
(0.73754309354171466, 71) random
CPU times: user 3min 25s, sys: 283 ms, total: 3min 26s
Wall time: 3min 26s


In [None]:
%%time
results = {}

SVMClassification(X_deleted, y_deleted, results, 'deleted')
SVMClassification(X_random, Y, results, 'random')
SVMClassification(X_mean, Y, results, 'mean')
SVMClassification(X_median, Y, results, 'median')
#SVMClassification(X_mode, Y, results, 'mode')
SVMClassification(X_class_mode, Y_class_mode, results, 'class_mode')
SVMClassification(X_class_mean, Y_class_mean, results, 'class_mean')
SVMClassification(X_class_median, Y_class_median, results, 'class_median')
SVMClassification(X_knn, y_knn, results, 'knn_for_miss_val')
SVMClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#SVMClassification(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

In [38]:
%%time
results = {}

logisticRegClassification(X_deleted, y_deleted, results, 'deleted')
logisticRegClassification(X_random, Y, results, 'random')
logisticRegClassification(X_mean, Y, results, 'mean')
logisticRegClassification(X_median, Y, results, 'median')
#logisticRegClassification(X_mode, Y, results, 'mode')
logisticRegClassification(X_class_mode, Y_class_mode, results, 'class_mode')
logisticRegClassification(X_class_mean, Y_class_mean, results, 'class_mean')
logisticRegClassification(X_class_median, Y_class_median, results, 'class_median')
logisticRegClassification(X_knn, y_knn, results, 'knn_for_miss_val')
logisticRegClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#logisticRegClassification(X_pybrain, y_pybrain, resul ts, 'pybrain')
    
sort_and_print_results(results)

(0.73140657320156288, 'newton-cg') random
(0.72702826936336473, 'liblinear') median
(0.72701677775224083, 'sag') mean
(0.70261856802783362, 'sag') closest_fit
(0.70261856802783362, 'sag') class_mean
(0.70261856802783362, 'sag') deleted
(0.70261856802783362, 'sag') knn_for_miss_val
(0.70261856802783362, 'sag') class_mode
(0.70261856802783362, 'sag') class_median
CPU times: user 14.7 s, sys: 27.9 ms, total: 14.7 s
Wall time: 14.8 s




In [39]:
%%time
results = {}

NBClassification(X_deleted, y_deleted, results, 'deleted')
NBClassification(X_random, Y, results, 'random')
NBClassification(X_mean, Y, results, 'mean')
NBClassification(X_median, Y, results, 'median')
#NBClassification(X_mode, Y, results, 'mode')
NBClassification(X_class_mode, Y_class_mode, results, 'class_mode')
NBClassification(X_class_mean, Y_class_mean, results, 'class_mean')
NBClassification(X_class_median, Y_class_median, results, 'class_median')
NBClassification(X_knn, y_knn, results, 'knn_for_miss_val')
NBClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#NBClassification(X_pybrain, y_pybrain, results, 'pybrain')

#results['raw data'] = cross_val_score(GaussianNB(), X, Y, cv=5).mean()
    
sort_and_print_results(results)

0.63166704972 median
0.630793687275 mean
0.615931203555 random
0.57632912165 closest_fit
0.57632912165 class_mean
0.57632912165 deleted
0.57632912165 knn_for_miss_val
0.57632912165 class_mode
0.57632912165 class_median
CPU times: user 301 ms, sys: 0 ns, total: 301 ms
Wall time: 300 ms


###Датасет бэндс

In [94]:
data = get_data('bands.dat')
data[data.columns[-10:]][:10]

Unnamed: 0,Solvent_pct,Esa_voltage,ESA_amperage,Wax,Hardener,Roller_durometer,Density,Anode_ratio,Chrome_content,Band_type
0,36.4,0.0,0,2.5,1.0,34,40,105.0,100,band
1,38.5,0.0,0,2.5,0.7,34,40,105.0,100,noband
2,39.8,0.0,0,2.8,0.9,40,40,103.87,100,noband
3,38.8,0.0,0,2.5,1.3,40,40,108.06,100,noband
4,42.5,5.0,0,2.3,0.6,35,40,106.67,100,noband
5,37.6,5.0,0,2.5,0.8,40,40,103.87,100,noband
6,37.5,6.0,0,2.5,0.6,30,40,106.67,100,noband
7,37.5,6.0,0,2.5,1.1,30,40,106.67,100,noband
8,39.8,1.5,0,3.0,1.0,40,40,103.22,100,band
9,31.8,0.0,0,3.0,1.0,38,40,106.66,100,noband


In [95]:
data.columns

Index([u'Proof_cut', u'Viscosity', u'Caliper', u'Ink_temperature', u'Humifity',
       u'Roughness', u'Blade_pressure', u'Varnish_pct', u'Press_speed',
       u'Ink_pct', u'Solvent_pct', u'Esa_voltage', u'ESA_amperage', u'Wax',
       u'Hardener', u'Roller_durometer', u'Density', u'Anode_ratio',
       u'Chrome_content', u'Band_type'],
      dtype='object')

In [96]:
#заменим 'band' на 1
y_changed = pd.Series()
for index in data.index.values:
    if data['Band_type'].loc[index] == 'band':
        y_changed.set_value(index, np.int64(1))
    else:
        y_changed.set_value(index, np.int64(0))
        
data['Band_type'] = y_changed

In [97]:
#выделям из данных Х и Y
columns = list(data.columns.values)
X_columns = columns[:-1]
y_column = columns[-1]
X = data[X_columns]
Y = data[y_column]

In [98]:
#удалим объекты с пропущенными значениями

data_deleted = data.dropna()
X_deleted = data_deleted[columns[:-1]]
y_deleted = data_deleted[columns[-1]]
X_deleted[X_deleted.columns[-10:]][:5]

Unnamed: 0,Ink_pct,Solvent_pct,Esa_voltage,ESA_amperage,Wax,Hardener,Roller_durometer,Density,Anode_ratio,Chrome_content
0,50.5,36.4,0,0,2.5,1.0,34,40,105.0,100
1,54.9,38.5,0,0,2.5,0.7,34,40,105.0,100
3,55.6,38.8,0,0,2.5,1.3,40,40,108.06,100
4,57.5,42.5,5,0,2.3,0.6,35,40,106.67,100
5,53.8,37.6,5,0,2.5,0.8,40,40,103.87,100


In [99]:
X_random, X_mean, X_median, X_mode, X_class_mean, X_class_median, X_class_mode = imputMissingValues(X, Y, X_columns)

In [100]:
devided_featres = devide_features_to_classifiable_and_regressiable(X, 10)
devided_featres

{'class': ['ESA_amperage', 'Density', 'Chrome_content'],
 'regr': ['Proof_cut',
  'Viscosity',
  'Caliper',
  'Ink_temperature',
  'Humifity',
  'Roughness',
  'Blade_pressure',
  'Varnish_pct',
  'Press_speed',
  'Ink_pct',
  'Solvent_pct',
  'Esa_voltage',
  'Wax',
  'Hardener',
  'Roller_durometer',
  'Anode_ratio']}

In [101]:
%%time
#метод ближайших соседей
#для начала пойдем самым простым путем - для обучения воспользуемся только полностью заполненными объектами

X_knn = X.copy()

for column_name in devided_featres['class']:
    current_X_columns = copy(X_columns)
    current_X_columns.remove(column_name)
    
    #обучение
    current_y = X_deleted[column_name]
    current_X = X_deleted[current_X_columns]
    
    values_map = {}
    if column_name == 'ESA_amperage':
        current_y, values_map = fit_y(current_y)
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005

    for neighbors_num in xrange(3, 20):
        knn = KNeighborsClassifier(metric = 'manhattan', n_neighbors = neighbors_num)
        #knn = KNeighborsClassifier(n_neighbors=neighbors_num)
        X_scaled = StandardScaler().fit_transform(current_X)
        mean_acc_score = cross_val_score(knn, X_scaled, current_y, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num

    print max_acc, best_neigh_num
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
    
    values_map = {}
    if column_name == 'ESA_amperage':
        y_train, values_map = fit_y(y_train)
        
    #применение knn
    if len(X_test.index) > 0:
        knn_for_missing_values = KNeighborsClassifier(metric = 'manhattan', n_neighbors=best_neigh_num)
        #knn_for_missing_values = KNeighborsClassifier(n_neighbors=best_neigh_num)
        
        scaler = StandardScaler().fit(current_X)
        
        knn_for_missing_values.fit(scaler.transform(X_train), y_train)
        
        y_test = knn_for_missing_values.predict(scaler.transform(X_test))
        if column_name == 'ESA_amperage':
            y_test = decrypt_y(y_test, values_map)
        
        X_test_indices = list(X_test.index.values)
        counter = 0
        for index in X_test_indices:
            X_knn.set_value(index, column_name, y_test[counter])
            counter += 1

0.994666666667 3
0.807692970112 5
0.964382533218 3
CPU times: user 5.57 s, sys: 4 ms, total: 5.58 s
Wall time: 5.58 s


In [102]:
#для X_knn соберем y_knn
X_knn = X_knn.dropna()
y_knn = pd.Series()
for index in list(X_knn.index.values):
    y_knn.set_value(index, Y.loc[index])

###Closest_fit

In [103]:
%%time
#closest_fit
X_closest_fit = X.copy()

max_min_differences = get_max_min_differences_array_from_data(X_deleted)

for column_name in devided_featres['class']:
    current_X_columns = copy(devided_featres['class'])
    current_X_columns.remove(column_name)
    
    #обучение
    current_y = X_deleted[column_name]
    current_X = X_deleted[current_X_columns]
    
    values_map = {}
    if column_name == 'ESA_amperage':
        current_y, values_map = fit_y(current_y)
    
    max_acc = 0
    best_neigh_num = 0
    epsilon = 0.005
    max_min_differences = get_max_min_differences_array_from_data(X_deleted)

    for neighbors_num in xrange(9, 10):
        knn = KNeighborsClassifier(n_neighbors=neighbors_num,
                                   metric=closest_fit_metric,
                                   metric_params={'max_min_differences':max_min_differences})
        X_scaled = StandardScaler().fit_transform(current_X)
        mean_acc_score = cross_val_score(knn, X_scaled, current_y, cv=5).mean()

        if mean_acc_score > max_acc + epsilon:
            max_acc = mean_acc_score
            best_neigh_num = neighbors_num

    print max_acc, best_neigh_num
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X, current_X_columns, column_name)
    values_map = {}
    if column_name == 'ESA_amperage':
        y_train, values_map = fit_y(y_train)
            
    max_min_differences = get_max_min_differences_array_from_data(current_X)
            
    #применение knn
    if len(X_test.index) > 0:
        knn_for_missing_values = KNeighborsClassifier(n_neighbors=best_neigh_num,
                                                      metric=closest_fit_metric,
                                                      metric_params={'max_min_differences':max_min_differences})
        
        scaler = StandardScaler().fit(current_X)
        
        knn_for_missing_values.fit(scaler.transform(X_train), y_train)
        
        y_test = knn_for_missing_values.predict(scaler.transform(X_test))
        if column_name == 'ESA_amperage':
            y_test = decrypt_y(pd.Series(y_test), values_map)
        
        X_test_indices = list(X_test.index.values)
        counter = 0
        for index in X_test_indices:
            X_closest_fit.set_value(index, column_name, y_test[counter])
            counter += 1

0.994666666667 9
0.8007048897 9
0.96167880209 9
CPU times: user 11.5 s, sys: 36.1 ms, total: 11.5 s
Wall time: 11.4 s


In [104]:
#для X_closest_fit соберем y_closest_fit
X_closest_fit = X_closest_fit.dropna()
y_closest_fit = pd.Series()
for index in list(X_closest_fit.index.values):
    y_closest_fit.set_value(index, Y.loc[index])

###PyBrain

In [110]:
data.columns

Index([u'Proof_cut', u'Viscosity', u'Caliper', u'Ink_temperature', u'Humifity',
       u'Roughness', u'Blade_pressure', u'Varnish_pct', u'Press_speed',
       u'Ink_pct', u'Solvent_pct', u'Esa_voltage', u'ESA_amperage', u'Wax',
       u'Hardener', u'Roller_durometer', u'Density', u'Anode_ratio',
       u'Chrome_content', u'Band_type'],
      dtype='object')

In [115]:
X_not_object = X.copy()
for column in X_not_object.columns:
    X_not_object[column] = pd.Series(X_not_object[column].values.astype(type(X_not_object[column].iloc[0])))

In [126]:
###ошибка isfinite
from rep.estimators.pybrain import PyBrainClassifier

X_pybrain = X_not_object.copy()

for column_name in devided_featres['class']:
    current_X_columns = copy(X_columns)
    current_X_columns.remove(column_name)
    
    current_X, X_train, y_train, X_test = get_X_and_y_by_column_name(X_not_object, current_X_columns, column_name)
    fitted_y, values_map = fit_y(y_train)
    
    pb_for_missing_values = PyBrainClassifier(layers=[10],
                                              epochs=7,
                                              verbose=False)
    pb_for_missing_values.fit(X_train, fitted_y)
    y_test_fitted = pb_for_missing_values.predict(X_test)
    
    y_test = decrypt_y(pd.Series(y_test_fitted), values_map)
    X_test_indices = list(X_test.index.values)
    counter = 0
    for index in X_test_indices:
        X_pybrain.set_value(index, column_name, y_test[counter])
        counter += 1

In [142]:
%%time
results = {}

neighbors(X_deleted, y_deleted, results, 'deleted', 10, 100)
neighbors(X_random, Y, results, 'random', 10, 100)
neighbors(X_mean, Y, results, 'mean', 10, 100)
neighbors(X_median, Y, results, 'median', 10, 100)
neighbors(X_mode, Y, results, 'mode', 10, 100)
neighbors(X_class_mode, Y, results, 'class_mode', 10, 100)
neighbors(X_class_mean, Y, results, 'class_mean', 10, 100)
neighbors(X_class_median, Y, results, 'class_median', 10, 100)
neighbors(X_knn, y_knn, results, 'knn_for_miss_val', 10, 100)
neighbors(X_closest_fit, y_closest_fit, results, 'closest_fit', 10, 100)
#neighbors(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

(0.65479452054794529, 38) closest_fit
(0.65479452054794529, 38) deleted
(0.65479452054794529, 38) knn_for_miss_val
(0.62966646660378978, 52) class_mode
(0.58905941867444045, 50) class_median
(0.57570093457943927, 68) class_mean
(0.57196261682242988, 34) mode
(0.56608076824144726, 76) random
(0.56421160936294257, 14) mean
(0.55548315184772346, 48) median
CPU times: user 35.3 s, sys: 7.86 ms, total: 35.3 s
Wall time: 35.4 s


In [129]:
%%time
results = {}

pybrain_classification(X_deleted, y_deleted, results, 'deleted')
pybrain_classification(X_random, Y, results, 'random')
pybrain_classification(X_mean, Y, results, 'mean')
pybrain_classification(X_median, Y, results, 'median')
pybrain_classification(X_mode, Y, results, 'mode')
pybrain_classification(X_class_mode, Y, results, 'class_mode')
pybrain_classification(X_class_mean, Y, results, 'class_mean')
pybrain_classification(X_class_median, Y, results, 'class_median')
pybrain_classification(X_knn, y_knn, results, 'knn_for_miss_val')
pybrain_classification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#pybrain_classification(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

0.660273972603 knn_for_miss_val
0.624657534247 deleted
0.594520547945 closest_fit
0.545691503044 random
0.536482894624 class_mean
0.534270770814 median
0.531012603961 class_mode
0.530909714482 class_median
0.51079482123 mode
0.460293235017 mean
CPU times: user 2min 51s, sys: 27.7 ms, total: 2min 51s
Wall time: 2min 51s


In [130]:
%%time
results = {}

random_forest_classification(X_deleted, y_deleted, results, 'deleted', 50, 80)
random_forest_classification(X_random, Y, results, 'random', 50, 80)
random_forest_classification(X_mean, Y, results, 'mean', 50, 80)
random_forest_classification(X_median, Y, results, 'median', 50, 80)
random_forest_classification(X_mode, Y, results, 'mode', 50, 80)
random_forest_classification(X_class_mode, Y, results, 'class_mode', 50, 80)
random_forest_classification(X_class_mean, Y, results, 'class_mean', 50, 80)
random_forest_classification(X_class_median, Y, results, 'class_median', 50, 80)
random_forest_classification(X_knn, y_knn, results, 'knn_for_miss_val', 50, 80)
random_forest_classification(X_closest_fit, y_closest_fit, results, 'closest_fit', 50, 80)
#random_forest_classification(X_pybrain, y_pybrain, results, 'pybrain', 50, 80)
    
sort_and_print_results(results)

(0.64784360799108287, 59) class_mode
(0.63835616438356158, 74) deleted
(0.63835616438356158, 65) knn_for_miss_val
(0.6217782731715682, 64) class_median
(0.61987481779987985, 70) class_mean
(0.61643835616438358, 50) closest_fit
(0.61250107176541191, 70) median
(0.61059761639372367, 54) mode
(0.60876275400840263, 52) mean
(0.53260739089428111, 57) random
CPU times: user 5min 8s, sys: 846 ms, total: 5min 9s
Wall time: 5min 8s


In [60]:
%%time
results = {}

SVMClassification(X_deleted, y_deleted, results, 'deleted')
SVMClassification(X_random, Y, results, 'random')
SVMClassification(X_mean, Y, results, 'mean')
SVMClassification(X_median, Y, results, 'median')
SVMClassification(X_mode, Y, results, 'mode')
SVMClassification(X_class_mode, Y, results, 'class_mode')
SVMClassification(X_class_mean, Y, results, 'class_mean')
SVMClassification(X_class_median, Y, results, 'class_median')
SVMClassification(X_knn, y_knn, results, 'knn_for_miss_val')
SVMClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#SVMClassification(X_pybrain, y_pybrain, results, 'pybrain')

sort_and_print_results(results)

(0.63287671232876719, 'SVC') closest_fit
(0.63287671232876719, 'SVC') deleted
(0.63287671232876719, 'SVC') knn_for_miss_val
(0.61250107176541202, 'SVC') class_mode
(0.57916488039097991, 'SVC') mode
(0.57525508016805271, 'SVC') random
(0.57372888622138385, 'SVC') class_median
(0.53838634999571289, 'SVC') median
(0.53288176283974964, 'SVC') class_mean
(0.51419017405470291, 'SVC') mean
CPU times: user 4.84 s, sys: 4.01 ms, total: 4.84 s
Wall time: 4.84 s


In [61]:
%%time
results = {}

logisticRegClassification(X_deleted, y_deleted, results, 'deleted')
logisticRegClassification(X_random, Y, results, 'random')
logisticRegClassification(X_mean, Y, results, 'mean')
logisticRegClassification(X_median, Y, results, 'median')
logisticRegClassification(X_mode, Y, results, 'mode')
logisticRegClassification(X_class_mode, Y, results, 'class_mode')
logisticRegClassification(X_class_mean, Y, results, 'class_mean')
logisticRegClassification(X_class_median, Y, results, 'class_median')
logisticRegClassification(X_knn, y_knn, results, 'knn_for_miss_val')
logisticRegClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#logisticRegClassification(X_pybrain, y_pybrain, resul ts, 'pybrain')
    
sort_and_print_results(results)

(0.56712328767123288, 'sag') closest_fit
(0.56712328767123288, 'sag') deleted
(0.56712328767123288, 'sag') knn_for_miss_val
(0.56632084369373226, 'sag') class_mode
(0.54759495841550199, 'lbfgs') class_median
(0.54389093715167625, 'sag') random
(0.52163251307553793, 'sag') class_mean
(0.51816856726399729, 'sag') mode
(0.5179284918117123, 'sag') median
(0.50117465489153734, 'sag') mean
CPU times: user 2.3 s, sys: 0 ns, total: 2.3 s
Wall time: 2.32 s




In [108]:
%%time
results = {}

NBClassification(X_deleted, y_deleted, results, 'deleted')
NBClassification(X_random, Y, results, 'random')
NBClassification(X_mean, Y, results, 'mean')
NBClassification(X_median, Y, results, 'median')
NBClassification(X_mode, Y, results, 'mode')
NBClassification(X_class_mode, Y, results, 'class_mode')
NBClassification(X_class_mean, Y, results, 'class_mean')
NBClassification(X_class_median, Y, results, 'class_median')
NBClassification(X_knn, y_knn, results, 'knn_for_miss_val')
NBClassification(X_closest_fit, y_closest_fit, results, 'closest_fit')
#NBClassification(X_pybrain, y_pybrain, results, 'pybrain')

#results['raw data'] = cross_val_score(GaussianNB(), X, Y, cv=5).mean()
    
sort_and_print_results(results)

0.507296578925 class_mode
0.497950784532 mode
0.492411900883 class_median
0.486804424248 median
0.481265540598 random
0.481196947612 class_mean
0.477527222841 mean
0.38904109589 closest_fit
0.38904109589 deleted
0.38904109589 knn_for_miss_val
CPU times: user 136 ms, sys: 4 ms, total: 140 ms
Wall time: 141 ms
