## Imports

In [1]:
%pylab inline
%config InlineBackend.figure_format = 'retina'
import pandas as pd
import seaborn as sns

Populating the interactive namespace from numpy and matplotlib


## Preprocessing

In [2]:
%%time
# Open data files
path = "./data/"

train = pd.read_csv(path+'train.csv', encoding='iso-8859-1')[::]
test = pd.read_csv(path+'test.csv')
test_ticket_id = np.array(test['ticket_id'])

train = train.set_index('ticket_id')
test = test.set_index('ticket_id')

# Drop the violators who were found not responsible
train.dropna(subset=['compliance'], inplace=True)

# Drop some uninformative features
for column_name in ['inspector_name', 'violator_name',
                    'violation_zip_code', 'violation_street_number', 'violation_street_name',
                    'mailing_address_str_number', 'mailing_address_str_name', 'city',
                    'state', 'zip_code', 'non_us_str_code', 'country',
                    'violation_description',
                    'admin_fee', 'state_fee', 'late_fee']:
    test.drop(column_name, axis=1, inplace=True)

# Convert datetime columns into years/months/days
for column_name in ['ticket_issued_date', 'hearing_date']:
    print('Converting datetime to years/months/days...', column_name)
    
    # test
    day_time = pd.to_datetime(test[column_name])
    test.drop(column_name, axis=1, inplace=True)
    test[column_name+'_month'] = np.array(day_time.dt.month)
    test[column_name+'_year'] = np.array(day_time.dt.year)
    test[column_name+'_day'] = np.array(day_time.dt.day)
    test[column_name+'_dayofweek'] = np.array(day_time.dt.dayofweek)
    
    # train
    day_time = pd.to_datetime(train[column_name])
    train.drop(column_name, axis=1, inplace=True)
    train[column_name+'_month'] = np.array(day_time.dt.month)
    train[column_name+'_year'] = np.array(day_time.dt.year)
    train[column_name+'_day'] = np.array(day_time.dt.day)
    train[column_name+'_dayofweek'] = np.array(day_time.dt.dayofweek)

# Convert string columns to categorical
cols = test.select_dtypes(exclude=['float', 'int']).columns
len_train = len(train)
temp_concat = pd.concat((train[cols], test[cols]), axis=0)

# Some filtering on violation_code to make it more manageable
temp_concat['violation_code'] = temp_concat['violation_code'].apply(lambda x: x.split(' ')[0])
temp_concat['violation_code'] = temp_concat['violation_code'].apply(lambda x: x.split('(')[0])
temp_concat['violation_code'][temp_concat['violation_code'].apply(lambda x: x.find('-')<=0)] = np.nan

# Make all codes with < 10 occurrences null
counts = temp_concat['violation_code'].value_counts()
temp_concat['violation_code'][temp_concat['violation_code'].isin(counts[counts < 10].index)] = np.nan

for column_name in cols:
    print('Converting to categorical...', column_name, '# variables:', len(temp_concat[column_name].unique()))
    dummies = pd.get_dummies(temp_concat[column_name])
    temp_concat[dummies.columns] = dummies
    temp_concat.drop(column_name, axis=1, inplace=True)
    train.drop(column_name, axis=1, inplace=True)
    test.drop(column_name, axis=1, inplace=True)

train[temp_concat.columns] = temp_concat.loc[train.index]
test[temp_concat.columns] = temp_concat.loc[test.index]

features = list( test.columns )
target = ['compliance']

print("Number of features:", len(features))

# Train Set
X = train[features]
y = np.array(train[target]).ravel()

# Normalize
mn = X.mean()
std = X.std()
X = (X - mn)/std

X = X.replace([np.inf, -np.inf], np.nan)
X[pd.isnull(X)] = 0

# Submissions Set
Xtest = (test[features] - mn) / std
Xtest = Xtest.replace([np.inf, -np.inf], np.nan)
Xtest[pd.isnull(Xtest)] = 0



Converting datetime to years/months/days... ticket_issued_date
Converting datetime to years/months/days... hearing_date
Converting to categorical... agency_name # variables: 5
Converting to categorical... violation_code # variables: 72
Converting to categorical... disposition # variables: 8
Converting to categorical... grafitti_status # variables: 2
Number of features: 97
CPU times: user 6.54 s, sys: 2.52 s, total: 9.07 s
Wall time: 8.86 s


## Define Models

In [3]:
# Define grid search
from time import time
from scipy.stats import randint as sp_randint
from scipy.stats import norm

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingClassifier


# Build the base classifier whose hyper-parameters are tuned below.
# A fixed random_state keeps runs repeatable: gradient boosting draws random
# feature subsets whenever max_features is smaller than the feature count.
clf = GradientBoostingClassifier(random_state=0)

In [4]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print a summary of the best-ranked candidates of a parameter search.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params'; 'rank_test_score' has to support
        element-wise comparison (e.g. a NumPy array).
    n_top : int, default 3
        Ranks 1..n_top are reported. Candidates tied on a rank are all
        printed, in index order.
    """
    ranks = results['rank_test_score']
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(ranks == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print()

## Search

In [5]:
%%time
# specify parameters and distributions to sample from
param_dict = {
    'n_estimators': [120, 300, 500, 800, 1200],
    'max_depth': [5, 8, 15, 25, 30, None],
    'min_samples_split': [2, 5, 10, 15, 100],
    'min_samples_leaf': [2, 5, 10],
    'max_features' : ['sqrt', 'log2', None]
}

# run randomized search
n_iter_search = 5
random_search = GridSearchCV(clf, param_dict, n_jobs=-1)

start = time()
random_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

KeyboardInterrupt: 

In [6]:
target_clf = random_search
%%time
## Submission
def make_submission(model_name, clf):
    """Fit ``clf`` on the training data and save its test-set predictions as CSV.

    Reads the notebook globals ``X`` and ``y`` (training matrix and labels),
    ``Xtest`` (submission matrix) and ``test_ticket_id``, and writes
    ``./data/submission_<model_name>.csv`` with the columns ``ticket_id`` and
    ``compliance``.

    Parameters
    ----------
    model_name : str
        Label used in the progress messages and in the output file name.
    clf : estimator
        Object providing ``fit`` and either ``predict_proba`` or ``predict``.
        It is (re)fitted on ``X``, ``y`` by this function.
    """
    print('Fitting %s ...' % model_name)
    # Train classifier
    clf.fit(X, y)

    # Predict
    print('Generating Predictions ...')
    # Only the attribute lookup decides which branch runs; an AttributeError
    # raised *inside* predict_proba is a real bug and is allowed to propagate
    # instead of silently switching to the regressor path.
    predict_proba = getattr(clf, 'predict_proba', None)
    if predict_proba is not None:
        # For classifiers, we want the predicted probabilities (second column)
        y_pred = predict_proba(Xtest)[:, 1]
    else:
        # If regressor, min-max scale the raw predictions into [0..1]
        y_pred = np.array(clf.predict(Xtest), dtype=float)
        y_pred = y_pred - y_pred.min()
        spread = y_pred.max()
        # Constant predictions have zero spread; dividing would yield NaN,
        # so they are left at 0.
        if spread > 0:
            y_pred = y_pred / spread

    # Save to CSV
    print('Saving to CSV ...')
    df = {"ticket_id":test_ticket_id, "compliance":y_pred}
    df = pd.DataFrame(df, columns=["ticket_id", "compliance"])
    df.to_csv("./data/submission_%s.csv" % model_name, index=False)
    print('finished!')
    print('---------------------')

def make_all_submissions(classifiers):
    """Create one submission file per entry of ``classifiers``.

    classifiers: dictionary mapping a model name to an sklearn estimator;
    each pair is forwarded to make_submission(name, estimator).
    """
    for entry in classifiers.items():
        make_submission(*entry)

# Make all submissions
# (disabled: no `classifiers` dictionary is defined in the cells visible here)
# make_all_submissions(classifiers)

# Make individual submission: fits `target_clf` and writes
# ./data/submission_GradBoost_C_GRID.csv

make_submission("GradBoost_C_GRID", target_clf)

SyntaxError: invalid syntax (<ipython-input-6-4abf10c165ee>, line 2)


***Previous Saved Result for GradientBoostingClassifier***
```
RandomizedSearchCV took 28230.49 seconds for 400 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.900 (std: 0.050)
Parameters: {'max_depth': 2, 'min_samples_leaf': 3, 'min_samples_split': 4, 'n_estimators': 95, 'max_features': 1}

Model with rank: 2
Mean validation score: 0.890 (std: 0.064)
Parameters: {'max_depth': 2, 'min_samples_leaf': 8, 'min_samples_split': 6, 'n_estimators': 155, 'max_features': 1}

Model with rank: 3
Mean validation score: 0.880 (std: 0.067)
Parameters: {'max_depth': 3, 'min_samples_leaf': 9, 'min_samples_split': 7, 'n_estimators': 77, 'max_features': 1}

```