In [2]:
%matplotlib inline

# Salary Prediction Problem

**Constants**

In [3]:
from sklearn import svm
from sklearn import preprocessing
from sklearn.preprocessing import Imputer
from sklearn.feature_extraction import DictVectorizer
import numpy as np
import pandas as pd

# Input file locations. Both are relative paths, presumably resolved against
# the notebook's working directory.
labeled_data_path = "data-box/salary.labeled.csv"  # rows used for training / evaluation
predict_data_path = "data-box/salary.2Predict.csv"  # rows the fitted model is asked to predict

In [4]:
def load_data(data_path):
    """
    Read a header-less salary CSV into a typed pandas DataFrame.

    Column names are assigned by position, a bare ``?`` is parsed as a
    missing value, and whitespace following each delimiter is skipped.

    @param data_path location of the CSV, passed straight to pandas.read_csv
    @return a DataFrame with the 15 named columns
    """
    # Column names in file order.
    column_names = [
        'age', 'workclass', 'fnlwgt', 'education', 'e-num',
        'marital-status', 'ocupation', 'relationship', 'race', 'sex',
        'capital-gain', 'capital-loss', 'hours-p-w', 'native-country',
        'label-str'
    ]
    numeric_columns = (
        'age', 'fnlwgt', 'e-num', 'capital-gain', 'capital-loss', 'hours-p-w'
    )

    # Numeric columns are parsed as float64, every other column as str.
    column_types = {}
    for name in column_names:
        column_types[name] = np.float64 if name in numeric_columns else str

    return pd.read_csv(
        data_path,
        names=column_names,
        dtype=column_types,
        header=None,
        index_col=False,
        na_values=['?'],
        skipinitialspace=True,
        engine='c')


# Work on a 10% random sample of the labeled rows. The fixed random_state makes
# the sample, and everything derived from it, reproducible across kernel
# restarts.
labeled_data = load_data(labeled_data_path).sample(frac=0.1, random_state=42)
predict_data = load_data(predict_data_path)
labeled_data.describe()
# pos: neg = 9305: 29537  (presumably the class counts of the full labeled file - TODO confirm)

Unnamed: 0,age,fnlwgt,e-num,capital-gain,capital-loss,hours-p-w
count,3884.0,3884.0,3884.0,3884.0,3884.0,3884.0
mean,38.460608,187328.473738,9.995881,952.218332,98.217559,40.525489
std,13.57414,104550.008604,2.554678,6974.574207,421.633994,12.551685
min,17.0,13492.0,1.0,0.0,0.0,1.0
25%,28.0,114510.0,9.0,0.0,0.0,40.0
50%,37.0,176255.0,10.0,0.0,0.0,40.0
75%,48.0,238180.75,12.0,0.0,0.0,45.0
max,90.0,914061.0,16.0,99999.0,2559.0,99.0


In [11]:
def extract_label_from_data(data):
    """
    Derive a binary target vector from the ``label-str`` column.

    @param data pandas DataFrame containing a ``label-str`` column
    @return numpy array holding 1 where the label text contains ">50K", else 0
    """
    def is_over_50k(text):
        return 1 if ">50K" in text else 0

    label_text = data["label-str"].astype(str)
    return label_text.apply(is_over_50k).values
# Binary target for the sampled labeled rows: 1 if the label text contains ">50K", else 0.
label = extract_label_from_data(labeled_data)

## Solution A

In [8]:
def encode_onehot(df, cols):
    """
    One-hot encoding is applied to columns specified in a pandas DataFrame.

    Modified from: https://gist.github.com/kljensen/5452382

    Details:

    http://en.wikipedia.org/wiki/One-hot
    http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

    Missing values in the encoded columns are replaced by the string "NaN"
    before encoding, so they end up in an indicator column of their own.
    Columns that are not listed in ``cols`` are passed through untouched,
    including any missing values they contain.

    @param df pandas DataFrame
    @param cols a list of columns to encode
    @return a DataFrame with one-hot encoding
    """
    cols = list(cols)

    # Fill only the columns being encoded. Filling the whole frame would turn
    # NaNs in pass-through numeric columns into the string "NaN".
    df = df.copy()
    if cols:
        df[cols] = df[cols].fillna("NaN")

    vec = DictVectorizer()
    encoded = vec.fit_transform(df[cols].to_dict(orient='records')).toarray()

    # Newer scikit-learn releases expose get_feature_names_out() and no longer
    # provide get_feature_names(); older releases only have the latter.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()

    vec_data = pd.DataFrame(encoded, columns=feature_names, index=df.index)

    # A vectorised column that still carries the bare source column name was
    # not expanded into indicators; drop it, as the source column is removed too.
    unexpanded = [col for col in cols if col in vec_data.columns]
    vec_data = vec_data.drop(unexpanded, axis=1)

    return df.drop(cols, axis=1).join(vec_data)

In [9]:
# Feature data set: split the columns into a categorical and a numeric group.
categorical_cols = ['workclass', 'education', 'marital-status', 'ocupation',
                    'relationship', 'race', 'sex', 'native-country']
numeric_cols = ['age', 'fnlwgt', 'e-num', 'capital-gain', 'capital-loss',
                'hours-p-w']
df_2_encode = labeled_data[categorical_cols]
df_2_miss = labeled_data[numeric_cols]

# One-hot encode the categorical features.
df_2_encode_r = encode_onehot(df_2_encode, cols=categorical_cols)

# Missing values: replace NaN in the numeric features with the column mean.
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
df_2_miss_r = imp.fit(df_2_miss.astype(float).values).transform(df_2_miss)

# Map every numeric feature onto the interval [-1, 1].
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
df_2_miss_range_r = min_max_scaler.fit_transform(df_2_miss_r)

# Scaled numeric features first, one-hot columns after them, side by side.
features = np.concatenate((df_2_miss_range_r, df_2_encode_r), axis=1)

In [12]:
# split input data to training-set and testing-set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state pins the shuffle so the
# split, and therefore the reported metrics, are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=42)

In [14]:
# train: fit a support vector classifier, constructed with no arguments,
# on the training split
clf = svm.SVC()
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
# predict test: classify the test split with the fitted model
y_pred = clf.predict(X_test)

In [136]:
# metric: classification report comparing test-split labels with predictions
from sklearn.metrics import classification_report
# The parenthesised form is valid both as a Python 2 print statement and as a
# Python 3 function call.
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       0.86      0.94      0.90      5965
          1       0.71      0.50      0.59      1804

avg / total       0.83      0.84      0.83      7769



In [77]:
# model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# Split the dataset into a 70% development set and a 30% evaluation set.
# random_state pins the shuffle so the search is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.3, random_state=42)

# Candidate hyper-parameters explored by 5-fold cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['accuracy']

for score in scores:
    # print("") emits a blank line on both Python 2 and Python 3, whereas a
    # bare print() under Python 2 prints "()".
    print("# Tuning hyper-parameters for %s" % score)
    print("")

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print("")
    print(clf.best_params_)
    print("")
    print("Grid scores on development set:")
    print("")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print("")

    print("Detailed classification report:")
    print("")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("")
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print("")


# Tuning hyper-parameters for accuracy
()
Best parameters set found on development set:
()
{'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
()
Grid scores on development set:
()
0.745 (+/-0.001) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.745 (+/-0.001) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.745 (+/-0.001) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.745 (+/-0.001) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.745 (+/-0.001) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.745 (+/-0.001) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.740 (+/-0.008) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.745 (+/-0.001) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0.740 (+/-0.008) for {'kernel': 'linear', 'C': 1}
0.738 (+/-0.012) for {'kernel': 'linear', 'C': 10}
0.738 (+/-0.012) for {'kernel': 'linear', 'C': 100}
0.738 (+/-0.012) for {'kernel': 'linear', 'C': 1000}
()
Detailed classification report:
()
The model is trained on the full development set.
The scores are 

## Solution B with pipeline

In [17]:
# create Pipeline & dump
import os

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import classification_report
from sklearn.base import TransformerMixin    
from sklearn.externals import joblib

# One dict per row with the label column removed; this is the input format
# DictVectorizer consumes.
data = labeled_data.drop("label-str", axis=1).to_dict(orient='records')

# vectorise -> impute missing values with the column mean -> scale to [-1, 1] -> SVC
pipe = Pipeline(steps=[('dictVectorizer', DictVectorizer(sparse=False)), 
                       ('imputer', Imputer(missing_values='NaN', strategy='mean', axis=0)), 
                       ('minMaxScaler', preprocessing.MinMaxScaler(feature_range=(-1, 1))),
                       ('svc', svm.SVC())])


estimator = pipe.fit(data, label)

# joblib.dump does not create missing directories, so make sure the target
# directory exists before writing the model file.
model_path = 'model/svc_pipe.pkl'
model_dir = os.path.dirname(model_path)
if not os.path.isdir(model_dir):
    os.makedirs(model_dir)
joblib.dump(estimator, model_path)

# NOTE: the report below is computed on the same rows the pipeline was fitted
# on, so it shows training-set scores, not an estimate of generalisation.
pred = estimator.predict(data)
# The parenthesised form is valid on both Python 2 and Python 3.
print(classification_report(label, pred))

             precision    recall  f1-score   support

          0       0.86      0.95      0.90      2979
          1       0.75      0.49      0.59       905

avg / total       0.83      0.84      0.83      3884



In [None]:
# model selection
from sklearn.model_selection import GridSearchCV

# Unprefixed grid, kept for reference only: it is not passed to the search
# below, whose parameter names carry the "svc__" pipeline-step prefix.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4], 'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
# Grid actually searched: RBF kernel only, tuned through the pipeline's 'svc' step.
params = dict(svc__kernel=['rbf'], 
              svc__gamma=[1e-3, 1e-4],
              svc__C=[1, 10, 100, 1000])
grid_search = GridSearchCV(pipe, param_grid=params, cv=5, scoring='accuracy')
# Fit against the ground-truth labels, not the pipeline's own predictions;
# otherwise the search would reward agreement with the earlier model.
grid_search.fit(data, label)

In [66]:
# load pipeline & predict
# The path must match the location the fitted pipeline was dumped to.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
estimator_loaded = joblib.load('model/svc_pipe.pkl')
predict_set = predict_data.drop("label-str", axis=1).to_dict(orient='records')
# Predict with the reloaded pipeline so that the persisted artefact is what
# actually gets exercised.
estimator_loaded.predict(predict_set)

array([0, 0, 0, ..., 0, 1, 0])

In [6]:
# Inspect the cross-validation results. The cell that creates and fits
# `grid_search` must have been run in this kernel first; otherwise this
# raises NameError.
grid_search.cv_results_

NameError: name 'grid_search' is not defined