# Classification

## Read file

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import scipy.spatial
import warnings
import sklearn as sk
import operator
import numpy as np
import pickle

import sklearn.preprocessing as prep
import sklearn.pipeline as pl
import sklearn.metrics as mt
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

import sklearn.tree as tree
import sklearn.svm as sv
import sklearn.neighbors as nei
import sklearn.decomposition as dc
import sklearn.neural_network as nn

warnings.simplefilter("ignore")

# Enable inline mode for matplotlib so that Jupyter displays graphs
%matplotlib inline

pd.__version__ #print which version of pandas you're using

'0.24.2'

In [2]:
df = pd.read_csv('numerical.csv')
df.head()

Unnamed: 0,EMPLOYMENT_END_DATE,TOTAL_WORKERS,NEW_EMPLOYMENT,CONTINUED_EMPLOYMENT,CHANGE_PREVIOUS_EMPLOYMENT,NEW_CONCURRENT_EMPLOYMENT,CHANGE_EMPLOYER,AMENDED_PETITION,FULL_TIME_POSITION,PREVAILING_WAGE,...,label,WAGE_LEVEL_1,WAGE_LEVEL_2,WAGE_LEVEL_3,WAGE_LEVEL_4,CBA,DBA,OES,SCA,OTHER_PW_SOURCE
0,-1.569764,-0.176444,1,0,0,0,0,0,1,-0.724217,...,0,1,0,0,0,0,0,1,0,0
1,-1.569764,0.016306,1,0,0,0,0,0,1,-0.134763,...,0,0,1,0,0,0,0,1,0,0
2,-0.348394,-0.176444,1,0,0,0,0,0,1,0.337822,...,0,0,0,1,0,0,0,1,0,0
3,-2.791135,-0.176444,0,0,0,0,1,0,1,1.231251,...,0,0,0,1,0,0,0,1,0,0
4,-2.791135,-0.176444,1,0,0,0,0,0,1,-0.717132,...,0,0,0,1,0,0,0,1,0,0


## Eliminate class imbalance

In [3]:
def balance_class(d, size=50000):
    gr = d.groupby('label')
    not_cert = pd.DataFrame()
    cert = pd.DataFrame()
    for name, data in gr:
        if name == 0:
            not_cert = data.sample(n = size)
        else:
            cert = data.sample(n = size)
    frames = [cert, not_cert]
    new_df = shuffle(pd.concat(frames, axis=0, sort=False, ignore_index=True))
    new_df = new_df.reset_index(drop=True)
    return new_df

bal_df = balance_class(df)
bal_df.head()

Unnamed: 0,EMPLOYMENT_END_DATE,TOTAL_WORKERS,NEW_EMPLOYMENT,CONTINUED_EMPLOYMENT,CHANGE_PREVIOUS_EMPLOYMENT,NEW_CONCURRENT_EMPLOYMENT,CHANGE_EMPLOYER,AMENDED_PETITION,FULL_TIME_POSITION,PREVAILING_WAGE,...,label,WAGE_LEVEL_1,WAGE_LEVEL_2,WAGE_LEVEL_3,WAGE_LEVEL_4,CBA,DBA,OES,SCA,OTHER_PW_SOURCE
0,0.872976,-0.176444,1,0,0,0,0,0,1,-0.684534,...,1,0,1,0,0,0,0,1,0,0
1,0.872976,0.594559,1,0,0,0,0,0,1,0.704813,...,1,0,1,0,0,0,0,1,0,0
2,0.872976,-0.176444,1,0,0,0,0,0,1,-0.340229,...,1,0,1,0,0,0,0,0,0,1
3,0.872976,-0.176444,1,0,0,0,0,0,1,-0.1178,...,0,0,1,0,0,0,0,0,0,1
4,-0.348394,-0.176444,1,0,0,0,0,0,1,-1.184744,...,1,1,0,0,0,0,0,1,0,0


## Create test set

In [4]:
test_set = df.sample(n=100000)
print(df.shape)
print(test_set.shape)
g = test_set.groupby('label')
for n, s in g:
    print(n, len(s))
test_labels = test_set['label']

(1092830, 26)
(100000, 26)
0 11805
1 88195


## Read models

In [5]:
dt = pickle.load(open('decision_tree.sav', 'rb'))
rf = pickle.load(open('random_forest.sav', 'rb'))
knn = pickle.load(open('knn.sav', 'rb'))
lr = pickle.load(open('lr.sav', 'rb'))
mlp = pickle.load(open('neural_net.sav', 'rb'))
ada = pickle.load(open('ada_boost.sav', 'rb'))

## Common functions

In [6]:
def split_label(d):
    labels = d['label']
    features = d.drop('label', axis=1)
    return labels, features

In [7]:
def best_param(feature, label, model, grid):
    # determine the best parameter
    #model_grid = ms.GridSearchCV(cv = 5, param_grid = grid, estimator = model)
    model_grid = ms.RandomizedSearchCV(cv = 2, param_distributions = grid, estimator = model, n_iter=3)
    model_grid = model_grid.fit(feature, label)
    print(model_grid.best_params_)
    
    # cross validation to evaluate best parameters
    acc = ms.cross_val_score(model_grid, feature, label, cv=2, scoring='accuracy')
    print("Accuracy = " + str(acc))

    return model_grid.best_params_

In [8]:
def print_report(data, model):
    labels, features = split_label(data)
    pred = ms.cross_val_predict(model, features, labels, cv = 5)
    print(mt.classification_report(labels, pred))

In [9]:
def store_model(clf, fname):
    final_model = clf
    filename = fname + '.sav'
    pickle.dump(final_model, open(filename, 'wb'))
    print('stored ' + fname)

In [31]:
predict_results = {}

def add_result(result, title):
    predict_results[title] = result

def store_results():
    predict_results['label'] = list(test_labels)
    results = pd.DataFrame(predict_results, index=None)
    results.to_csv('model_results.csv', index=False)
    
def read_results():
    results = pd.read_csv('model_results.csv')
    return results

def edit_results(results, col, name):
    results[name] = col
    return result

def restore_results(results):
    results.to_csv('model_results.csv', index=False)

## Decision Tree

In [10]:
def decision_tree(data):
    label, feature = split_label(data)
    dt = tree.DecisionTreeClassifier()
    grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': range(10, 50, 10),
        'min_samples_leaf': range(1, 10, 4),
        'min_samples_split': range(2, 11, 4),
    }
    param = best_param(feature, label, dt, grid)
    dt_clf = tree.DecisionTreeClassifier(criterion=param['criterion'],
                                        max_depth=param['max_depth'],
                                        min_samples_leaf=param['min_samples_leaf'],
                                        min_samples_split=param['min_samples_split'])
    dt_clf.fit(feature, label)
    return dt_clf

In [12]:
label, feature = split_label(test_set)
dt = decision_tree(bal_df)

In [13]:
dt_predict_result = dt.predict(feature)
print_report(test_set, dt)

              precision    recall  f1-score   support

           0       0.88      0.25      0.38     11805
           1       0.91      1.00      0.95     88195

    accuracy                           0.91    100000
   macro avg       0.89      0.62      0.67    100000
weighted avg       0.90      0.91      0.88    100000



In [14]:
store_model(dt, 'decision_tree')
print('model stored')
add_result(dt_predict_result, 'decision tree')

stored decision_tree
model stored


## Random Forest

In [13]:
import sklearn.ensemble as en

def random_forest(data):
    label, feature = split_label(data)
    rf = en.RandomForestClassifier()
    grid = {
        'n_estimators': range(10, 50, 10),
        'criterion': ['gini', 'entropy'],
        'max_depth': range(10, 50, 10),
        'min_samples_leaf': range(1, 10, 4),
        'min_samples_split': range(2, 11, 4),
    }
    param = best_param(feature, label, rf, grid)
    rf_clf = en.RandomForestClassifier(n_estimators=param['n_estimators'],
                                        criterion=param['criterion'],
                                        max_depth=param['max_depth'],
                                        min_samples_leaf=param['min_samples_leaf'],
                                        min_samples_split=param['min_samples_split'])
    rf_clf.fit(feature, label)
    return rf_clf

In [15]:
#rf = random_forest(bal_df)
label, feature = split_label(test_set)
rf_predict_result = rf.predict(feature)
print_report(test_set, rf)

              precision    recall  f1-score   support

           0       0.92      0.25      0.39     11805
           1       0.91      1.00      0.95     88195

    accuracy                           0.91    100000
   macro avg       0.91      0.62      0.67    100000
weighted avg       0.91      0.91      0.88    100000



In [16]:
store_model(rf, 'random_forest')
print('model stored')
add_result(rf_predict_result, 'random forest')

stored random_forest
model stored


## AdaBoost

In [16]:
import sklearn.ensemble as en

def ada_boost(data):
    label, feature = split_label(data)
    ada = en.AdaBoostClassifier()
    grid = {
        'n_estimators': range(10, 50, 10),
        'learning_rate': [0.5, 1.0, 1.5],
        'algorithm': ['SAMME']
    }
    param = best_param(feature, label, ada, grid)
    ada_clf = en.AdaBoostClassifier(n_estimators=param['n_estimators'],
                                        learning_rate=param['learning_rate'],
                                        algorithm=param['algorithm'])
    ada_clf.fit(feature, label)
    return ada_clf

In [17]:
ada = ada_boost(bal_df)

{'n_estimators': 30, 'learning_rate': 1.5, 'algorithm': 'SAMME'}


In [17]:
label, feature = split_label(test_set)
ada_predict_result = ada.predict(feature)
print_report(test_set, ada)

              precision    recall  f1-score   support

           0       0.98      0.23      0.37     11805
           1       0.91      1.00      0.95     88195

    accuracy                           0.91    100000
   macro avg       0.94      0.61      0.66    100000
weighted avg       0.92      0.91      0.88    100000



In [18]:
store_model(ada, 'ada_boost')
print('model stored')
add_result(ada_predict_result, 'ada boost')

model stored


## Nearest Neighbor

In [21]:
def nearest_neighbor(data):
    label, feature = split_label(data)
    pca = dc.PCA()
    nn = nei.KNeighborsClassifier(metric='euclidean')
    pipe = pl.Pipeline(steps = [('pca', pca), ('nn', nn)])
    grid = {
        'pca__n_components': range(8, 25, 8),
        'nn__n_neighbors': [5, 8, 11],
        'nn__algorithm': ['ball_tree', 'kd_tree', 'brute'],
        'nn__leaf_size': [20, 30, 40, 50]
    }
    param = best_param(feature, label, pipe, grid)
    nn_clf = nei.KNeighborsClassifier(n_neighbors=param['nn__n_neighbors'],
                                        algorithm=param['nn__algorithm'],
                                        leaf_size=param['nn__leaf_size'])
    pca_clf = dc.PCA(n_components=param['pca__n_components'])
    pipe_clf = pl.Pipeline(steps = [('pca', pca_clf), ('nn', nn_clf)])
    pipe_clf.fit(feature, label)
    return pipe_clf

In [22]:
knn = nearest_neighbor(bal_df)

{'pca__n_components': 24, 'nn__n_neighbors': 11, 'nn__leaf_size': 40, 'nn__algorithm': 'ball_tree'}


In [19]:
label, feature = split_label(test_set)
knn_predict_result = knn.predict(feature)
print_report(test_set, knn)

              precision    recall  f1-score   support

           0       0.98      0.23      0.37     11805
           1       0.91      1.00      0.95     88195

    accuracy                           0.91    100000
   macro avg       0.94      0.61      0.66    100000
weighted avg       0.92      0.91      0.88    100000



In [20]:
#store_model(ada, 'knn')
print('model stored')
add_result(knn_predict_result, 'nearest neighbor')

model stored


## SVC (PREDICTION TOO SLOW)

In [35]:
def svc(data):
    label, feature = split_label(data)
    pca = dc.PCA()
    svc = sv.SVC()
    pipe = pl.Pipeline(steps = [('pca', pca), ('svc', svc)])
    grid = {
        'pca__n_components': range(8, 25, 8),
        'svc__kernel': ['linear', 'rbf', 'poly']
    }
    param = best_param(feature, label, pipe, grid)
    sv_clf = sv.SVC(kernel=param['svc__kernel'])
    pca_clf = dc.PCA(n_components=param['pca__n_components'])
    pipe_clf = pl.Pipeline(steps = [('pca', pca_clf), ('svc', sv_clf)])
    pipe_clf.fit(feature, label)
    return pipe_clf

In [36]:
svc = svc(bal_df)

{'svc__kernel': 'rbf', 'pca__n_components': 16}


In [38]:
#label, feature = split_label(test_set)
#sv_predict_result = svc.predict(feature)
#print_report(test_set, svc)

In [39]:
store_model(svc, 'svc')
print('model stored')
#add_result(sv_predict_result, 'svc')

stored svc
model stored


## Neural Network

In [27]:
def neural_network(data):
    label, feature = split_label(data)
    mlp = nn.MLPClassifier()
    grid = {
        'hidden_layer_sizes': [(30,),(40,),(50,),(60,)],
        'activation': ['logistic', 'tanh', 'relu'],
        'solver': ['lbfgs', 'sgd', 'adam'],
        'learning_rate': ['constant', 'invscaling', 'adaptive']
    }
    param = best_param(feature, label, mlp, grid)
    mlp_clf = nn.MLPClassifier(hidden_layer_sizes=param['hidden_layer_sizes'],
                                        activation=param['activation'],
                                        solver=param['solver'],
                                        learning_rate=param['learning_rate'])
    mlp_clf.fit(feature, label)
    return mlp_clf

In [28]:
mlp = neural_network(bal_df)

{'solver': 'lbfgs', 'learning_rate': 'adaptive', 'hidden_layer_sizes': (60,), 'activation': 'logistic'}


In [21]:
label, feature = split_label(test_set)
nn_predict_result = mlp.predict(feature)
print_report(test_set, mlp)

              precision    recall  f1-score   support

           0       0.97      0.23      0.37     11805
           1       0.91      1.00      0.95     88195

    accuracy                           0.91    100000
   macro avg       0.94      0.61      0.66    100000
weighted avg       0.91      0.91      0.88    100000



In [22]:
#store_model(mlp, 'neural_net')
print('model stored')
add_result(nn_predict_result, 'neural network')

model stored


## Logistic Regression

In [31]:
import sklearn.linear_model as lm

def logistic_regression(data):
    label, feature = split_label(data)
    lr = lm.LogisticRegression()
    grid = {
        'penalty': ['l1', 'l2', 'elasticnet'],
        'max_iter': [50, 100, 200],
        'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
    }
    param = best_param(feature, label, lr, grid)
    lr_clf = lm.LogisticRegression(penalty=param['penalty'],
                                        max_iter=param['max_iter'],
                                        solver=param['solver'])
    lr_clf.fit(feature, label)
    return lr_clf

In [32]:
lr = logistic_regression(bal_df)

{'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 50}


In [23]:
label, feature = split_label(test_set)
lr_predict_result = lr.predict(feature)
print_report(test_set, lr)

              precision    recall  f1-score   support

           0       0.94      0.11      0.20     11805
           1       0.89      1.00      0.94     88195

    accuracy                           0.89    100000
   macro avg       0.92      0.56      0.57    100000
weighted avg       0.90      0.89      0.86    100000



In [24]:
#store_model(lr, 'lr')
print('model stored')
add_result(lr_predict_result, 'logistic regression')

model stored


## Combine results experiments

In [27]:
import collections
combine_result = []
for i in range(len(rf_predict_result)):
    results = collections.Counter([ada_predict_result[i], rf_predict_result[i], dt_predict_result[i]])
    combine_result.append(results.most_common(1)[0][0])
print('Accuracy: ' + str(mt.accuracy_score(label, combine_result)))
print('Precision: ' + str(mt.precision_score(label, combine_result)))
print('Recall: ' + str(mt.recall_score(label, combine_result)))

Accuracy: 0.7454059643311403
Precision: 0.9311957851161915
Recall: 0.7686881604630814


In [30]:
combine_result = []
for i in range(len(rf_predict_result)):
    if rf_predict_result[i] == 1 and dt_predict_result[i] == 1:
        combine_result.append(1)
    else:
        combine_result.append(0)
print('Accuracy: ' + str(mt.accuracy_score(label, combine_result)))
print('Precision: ' + str(mt.precision_score(label, combine_result)))
print('Recall: ' + str(mt.recall_score(label, combine_result)))

Accuracy: 0.6980445265960854
Precision: 0.9406422635179628
Recall: 0.7026338271004086


In [27]:
predict_results

{'decision tree': array([0, 1, 0, ..., 1, 1, 1]),
 'random forest': array([0, 1, 1, ..., 0, 0, 1]),
 'ada boost': array([1, 1, 0, ..., 1, 1, 1]),
 'nearest neighbor': array([1, 1, 0, ..., 1, 1, 1]),
 'neural network': array([0, 1, 1, ..., 1, 1, 1]),
 'logistic regression': array([0, 1, 0, ..., 1, 0, 1])}

In [32]:
store_results()