In [42]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import re
import math
import matplotlib.pyplot as plt
import numpy as np

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import accuracy_score

def find_range_cols(data):
    range_col = [] # columns that fit numerical values into ranges
    for col in data.select_dtypes(exclude=['int64', 'float', 'float64']).columns:
        if any(item in data[col][0] for item in ['>', '<']): 
            range_col.append(col)
                
    return range_col
def findRange(thresholds, v):
    for i, th in enumerate(thresholds):
        if(v <= th):
            if i==0:
                    return "x<{}".format(th)
            elif i == len(thresholds)-1:
                    return "x>{}".format(thresholds[i-1])
            else:
                    return "{}<x<{}".format(thresholds[i-1], thresholds[i])
                
def convert_cate(arr):
    n = 4 #parts to be divided
    maxValue = max(arr)
    minValue = min(arr)
    thresholds = [ math.floor(i*(maxValue-minValue)/n)+minValue for i in range(n+1)]

    #print([findRange(thresholds, i) for i in arr])
    
    return pd.Series([findRange(thresholds, i) for i in arr])


def num2cate(dataIn):
    df = dataIn[:]
#     new_data = pd.DataFrame()
    for k in df.columns:
        if(k in df.select_dtypes(include=['int64','float64'])):
            values = pd.to_numeric(df[k])
            df[k] = convert_cate(values.tolist())
        
    return df
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - :term:`CV splitter`,
          - An iterable yielding (train, test) splits as arrays of indices.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : int or None, optional (default=None)
        Number of jobs to run in parallel.
        ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
        ``-1`` means using all processors. See :term:`Glossary <n_jobs>`
        for more details.

    train_sizes : array-like, shape (n_ticks,), dtype float or int
        Relative or absolute numbers of training examples that will be used to
        generate the learning curve. If the dtype is float, it is regarded as a
        fraction of the maximum size of the training set (that is determined
        by the selected validation method), i.e. it has to be within (0, 1].
        Otherwise it is interpreted as absolute sizes of the training sets.
        Note that for classification the number of samples usually have to
        be big enough to contain at least one sample from each class.
        (default: np.linspace(0.1, 1.0, 5))
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()

class DataEncoder(object):
    def __init__(self, class_column='class', cat_columns=None):
        self.class_column = class_column
        self.cat_columns = cat_columns

        # these will be trained with fit_encoders()
        self.column_encoders = {} # label encoder
        self.cat_encoder = None # one-hot encoder
        self.label_encoder = None # label encoder

    def fit(self, data):
        """
        Fit one-hot encoders for categorical features and an integer encoder for
        the label. These can be used later to transform raw data into a form
        that ATM can work with.

        data: pd.DataFrame of unprocessed data
        """
        if self.class_column not in data.columns:
            raise KeyError('Class column "%s" not found in dataset!' %
                           self.class_column)
            
        range_col = find_range_cols(data)
                
        self.range_col = range_col
            

        # encode categorical columns, leave ordinal values alone
        if self.cat_columns is None:
            cats = data.drop([self.class_column]+range_col, axis=1).select_dtypes(exclude=['int64'])
            self.cat_columns = cats.columns
        else:
            cats = data[self.cat_columns].drop(range_col, axis=1).select_dtypes(exclude=['int64'])
            
        self.cat_cols = cats.columns
        
        for cat_name in cats.columns:   
        # save the indices of categorical columns for one-hot encoding

            # encode each feature as an integer in range(unique_vals)
            le = LabelEncoder()
            cats[cat_name] = le.fit_transform(cats[cat_name])
            self.column_encoders[cat_name] = le

        # One-hot encode the whole feature matrix.
        # Set sparse to False so that we can test for NaNs in the output
        self.cat_encoder = OneHotEncoder(n_values='auto',sparse=False)
        # if Category column exists          
        if cats.shape[1] != 0:
            self.cat_encoder.fit(cats)

        # Train an encoder for the label as well
        labels = np.array(data[[self.class_column]])
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(labels)
        

    def transform(self, data):
        """
        Convert a DataFrame of labeled data to a feature matrix in the form
        that ATM can use.
        """
        y = self.transform_y(data)
        X = self.transform_x(data)

        return X, y
    
    def transform_x(self, data, onehot=False):
        """
        only transform x, for the generated data
        """
        cats = data[self.cat_columns]

        # encode each categorical feature as an integer
        for column, encoder in list(self.column_encoders.items()):
            cats[column] = encoder.transform(cats[column])

        # one-hot encode the categorical features
        if cats.shape[1] != 0 and onehot:
            X = self.cat_encoder.transform(cats)
        else:
            X = cats
        
        if self.class_column in data:
            nums = data.drop([self.class_column], axis=1).select_dtypes(include=['int64']).values
        else:
            nums = data.select_dtypes(include=['int64']).values
            
       
        # transform range cols into integrate. e.g., <4 -> 1; 4<x<7 -> 2
        ranges = []
        for col in self.range_col:
            values = data[col]
            ranges.append( self.range2int(values) )
#         print(X.shape, nums.shape, ranges.shape)
        if(ranges==[]):
            X = np.concatenate((X, nums), axis=1)
        else:
            ranges = np.transpose( np.array(ranges) )
            X = np.concatenate((X, nums, ranges), axis=1)
        return X
    
    def transform_y(self, data):
        if self.class_column in data:
            # pull labels into a separate series and transform them to integers
            labels = np.array(data[[self.class_column]])
            y = self.label_encoder.transform(labels)
            # drop the label column and transform the remaining features
        else:
            y = None
            
        return y
    
    def range2int(self, values):
        ranges = []
        for v in values:
            if v not in ranges:
                ranges.append(v)
        
        def sort_key(x):
            num_strings = re.findall('\d+', x)
            # 'undefined' is in the front
            if len(num_strings)==0:
                return -1
            # x> 1, x<7
            elif len(num_strings)==1:
                return int(num_strings[0])*2
            # 1<x<7
            else:
                nums = map(int, num_strings) # string to number
                return sum(nums)
                
        ranges.sort(key=sort_key)
        return list(map(lambda x: ranges.index(x), values))
        
        
    
    def fit_transform(self, data):
        """ Process data into a form that ATM can use. """
        self.fit(data)
        return self.transform(data)
    
def fit_model(model, data):
        '''
        Args:
            data(panda DataFrame): training dataset

        Return:
            model: an already trained sklearn model
            score(list<number>): cross validate score
        '''
        
        encoder = DataEncoder()
        encoder.fit(data)
        x, y = encoder.transform(data)

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
        
        
#         plot_learning_curve(model,'',x_train, y_train,ylim=(0.7, 1.01), cv=5, n_jobs=4)
        
#         score = plot_time_curve(model,x_train, x_test, y_train, y_test,3)
        
        model.fit(x_train, y_train)
        score = accuracy_score(y_test, model.predict(x_test))
        # score = cross_val_score(model, x_train, y_train, scoring='accuracy', cv=4) 
        return model, encoder, score

    

def plot_time_curve(model, x_train, x_test, y_train, y_test,epochNum):
    
    plotDataTrain = []
    plotDataTest = []
    for i in range(epochNum):
        model.fit(x_train,y_train)
        plotDataTrain.append(model.score(x_train,y_train))
        plotDataTest.append(model.score(x_test,y_test))
        print('epoch: ',i)
        
    x_axis = np.linspace(0,epochNum,epochNum)
    print(x_axis,plotDataTrain)
    plt.plot(x_axis,plotDataTrain,c='red')
    plt.plot(x_axis,plotDataTest,c='blue')
    plt.show()
    
    return plotDataTest[len(plotDataTest)-1]
    
    
xgb = XGBClassifier(
                max_depth=10, 
                learning_rate=0.1, 
                n_estimators=100,seed=10,
            )

knn = KNeighborsClassifier(
                algorithm = "ball_tree",
                leaf_size = 40,
                metric = "manhattan",
                n_neighbors = 10
            )
lr = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
                penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
                verbose=0, warm_start=False
            )

df = pd.read_csv('adult.csv')
df2 = pd.read_csv('bank.csv')

# df2 = df2.replace(' ?').dropna(axis=0)

# for i,var in df2.iterrows():
#     df2.at[i,'class'] = '1' if df2.at[i,'class']==' >50K' else '0'

# df3 = num2cate(df2)
# df2.to_csv('adult_new2.csv',index=False)

_,_,score = fit_model(xgb,df2)
score

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


0.8504254366323332