In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
from sklearn import grid_search
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn import svm
from sklearn import cross_validation
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler



In [2]:
# Load the semicolon-delimited bank data, discard every row that holds an
# 'unknown' (or already missing) value, and randomise the row order.
df = shuffle(
    pd.read_csv('./datasets/bank/bank-full.csv', delimiter=";")
    .replace(to_replace='unknown', value=np.nan)
    .dropna()
)

In [3]:
def dummyEncode(df):
    """Label-encode every column of dtype ``category`` or ``object``.

    Each matching column is replaced, in place, by the integer codes that
    ``LabelEncoder.fit_transform`` produces for it.  Encoding is best-effort:
    a column that cannot be encoded is left untouched and a message naming
    the column and the cause is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    columnsToEncode = df.select_dtypes(include=['category', 'object']).columns
    for feature in columnsToEncode:
        # A fresh encoder per column: nothing fitted on one column is carried
        # over to the next.
        le = LabelEncoder()
        try:
            df[feature] = le.fit_transform(df[feature])
        except Exception as exc:
            # `Exception` (not a bare except) keeps the best-effort behaviour
            # while letting KeyboardInterrupt / SystemExit propagate.
            # `format` also copes with column labels that are not strings.
            print('Error encoding {}: {}'.format(feature, exc))
    return df

In [4]:
# Remove the seven columns that are not used as model inputs.
dropped_columns = ['age', 'duration', 'campaign', 'pdays', 'previous', 'balance', 'day']
df = df.drop(dropped_columns, axis=1)

In [5]:
# Label-encode the remaining text columns, then turn the frame into a plain
# NumPy array for scikit-learn.  `.values` is used instead of
# `DataFrame.as_matrix()`, which was deprecated and later removed from pandas;
# both return the same array.
df = dummyEncode(df)
df = df.values
df.shape

(7842L, 10L)

In [6]:
# Columns 0-8 are the features; column 9 is the target.
n_features = 9
X, Y = df[:, :n_features], df[:, n_features]
for part in (X, Y):
    print(part.shape)

(7842L, 9L)
(7842L,)


In [7]:
# One accumulator per classifier; testAllClassifiers appends one test-set
# accuracy to each of them on every call.
logit_clf_total, knn_clf_total, gb_clf_total, rf_clf_total, svm_clf_total = (
    [], [], [], [], []
)

In [8]:
def _buildKList(n_train, train_size, n_classes, cv=3):
    """Return the ``n_neighbors`` candidates for the KNN grid search."""
    # Same spacing as before: 1, then 25 multiples of the rounded step.
    step = int(round(train_size * n_train / 26))

    # During cv-fold cross-validation the model is fitted on n_train minus one
    # test fold.  A stratified test fold holds at most ceil(n_train / cv) plus
    # one extra row per class, so this is a safe lower bound on the rows
    # available to the neighbour search.
    largest_test_fold = -(-n_train // cv)  # ceil without floats
    max_k = max(1, n_train - largest_test_fold - n_classes)

    k_list = [1]
    for i in range(1, 26):
        k = i * step
        # Skip k == 0 (step rounded to zero), k above what a training fold
        # can supply (KNN rejects it), and repeated values.
        if 1 <= k <= max_k and k not in k_list:
            k_list.append(k)
    return k_list


def testAllClassifiers(X, Y, train_size, random_state=None):
    """Tune and score five classifiers on one random train/test split.

    The data is split with ``train_test_split``, both parts are scaled to
    [-1, 1] with a ``MinMaxScaler`` fitted on the training part only, and
    then logistic regression, k-nearest-neighbours, gradient boosting,
    random forest and a linear SVM are tuned (3-fold CV) and scored on the
    held-out part.

    Side effects: appends one test accuracy to each of the module-level
    lists ``logit_clf_total``, ``knn_clf_total``, ``gb_clf_total``,
    ``rf_clf_total`` and ``svm_clf_total``, and prints the best CV score and
    best parameters of each grid search.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix.
    Y : array-like, shape (n_samples,)
        Class labels.
    train_size : float or int
        Forwarded to ``train_test_split`` as ``train_size``.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` only.  The default ``None``
        keeps the original unseeded behaviour.

    Returns
    -------
    None
    """
    # NOTE: `cross_validation` and `grid_search` are the legacy scikit-learn
    # module names this notebook imports; `sklearn.model_selection`
    # supersedes them in later releases.
    cv_folds = 3
    search_jobs = 8

    X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
        X, Y, train_size=train_size, random_state=random_state)

    # Fit the scaler on the training part only so no test information leaks in.
    scaling = MinMaxScaler(feature_range=(-1, 1)).fit(X_train)
    X_train = scaling.transform(X_train)
    X_test = scaling.transform(X_test)

    k_list = _buildKList(len(X_train), train_size, len(np.unique(Y_train)),
                         cv=cv_folds)

    # Logistic Regression
    logit_clf = LogisticRegressionCV(solver='newton-cg',
                                     Cs=[0.01, 0.1, 1.0, 10.0, 100.0])
    logit_clf.fit(X_train, Y_train)
    logit_clf_total.append(logit_clf.score(X_test, Y_test))

    # KNN
    knn_params = {"n_neighbors": k_list}
    knn_clf = grid_search.GridSearchCV(KNeighborsClassifier(), knn_params,
                                       cv=cv_folds, n_jobs=search_jobs)
    knn_clf.fit(X_train, Y_train)
    print(knn_clf.best_score_)
    print(knn_clf.best_params_)
    knn_clf_total.append(knn_clf.score(X_test, Y_test))

    # Gradient Boosting
    gb_params = {"n_estimators": [256, 512, 1024],
                 "learning_rate": [.01, .1]}
    gb_clf = grid_search.GridSearchCV(GradientBoostingClassifier(), gb_params,
                                      cv=cv_folds, n_jobs=search_jobs)
    gb_clf.fit(X_train, Y_train)
    print(gb_clf.best_score_)
    print(gb_clf.best_params_)
    gb_clf_total.append(gb_clf.score(X_test, Y_test))

    # Random Forests
    # max_features may range from 1 to the number of features (1..9 for the
    # nine-column data this notebook uses).
    rf_params = {"max_features": list(range(1, X_train.shape[1] + 1)),
                 "n_estimators": [256, 512, 1024]}
    rf_clf = grid_search.GridSearchCV(RandomForestClassifier(n_jobs=4), rf_params,
                                      cv=cv_folds, n_jobs=search_jobs)
    rf_clf.fit(X_train, Y_train)
    print(rf_clf.best_score_)
    print(rf_clf.best_params_)
    rf_clf_total.append(rf_clf.score(X_test, Y_test))

    # SVM
    svm_params = {'C': [.001, .01, 1, 10, 100]}
    svm_clf = grid_search.GridSearchCV(svm.SVC(kernel='linear'), svm_params,
                                       cv=cv_folds, n_jobs=search_jobs)
    svm_clf.fit(X_train, Y_train)
    print(svm_clf.best_score_)
    print(svm_clf.best_params_)
    svm_clf_total.append(svm_clf.score(X_test, Y_test))

In [None]:
# Three independent runs at each training fraction (20%, 50% and 80%).
for fraction in (0.2, 0.5, 0.8):
    for i in range(3):
        testAllClassifiers(X, Y, fraction)

In [10]:
# Mean test accuracy of each classifier over every run recorded so far.
logit_mean_acc, knn_mean_acc, rf_mean_acc, gb_mean_acc, svm_mean_acc = [
    sum(scores) / len(scores)
    for scores in (logit_clf_total, knn_clf_total, rf_clf_total,
                   gb_clf_total, svm_clf_total)
]

In [11]:
# Report the means in this order: logistic regression, KNN, random forest,
# gradient boosting, SVM.
for mean_acc in (logit_mean_acc, knn_mean_acc, rf_mean_acc, gb_mean_acc, svm_mean_acc):
    print(mean_acc)

0.81893528849219
0.8205291679948996
0.7881734140898948
0.8208479438954415
0.8233981510997769
