## Imports

In [None]:
%pylab inline
%config InlineBackend.figure_format = 'retina'
import pandas as pd
import seaborn as sns
pd.options.display.max_columns = 5000

## Preprocessing

Feature Engineering Todo:
- Are there any repeat violators?
- Violation descriptions -- are these valuable?
- Is the fee amount predictive or not?
    - It occurs to me that the answer is "Probably".

In [None]:
%%time
# Load the raw CSVs from the data directory.
path = "./data/"

# train.csv is decoded as ISO-8859-1; test.csv is read with the pandas default.
train = pd.read_csv(path + 'train.csv', encoding='iso-8859-1')
test = pd.read_csv(path + 'test.csv')

# Keep the test ticket ids as a plain array before the column becomes the
# index; they are written back out with the predictions at submission time.
test_ticket_id = np.array(test['ticket_id'])

# Index both frames by ticket id.
train, test = (frame.set_index('ticket_id') for frame in (train, test))

# Keep only rows that carry a compliance label (per the original note, a
# missing label marks violators who were found not responsible).
train = train.dropna(subset=['compliance'])

# Columns judged uninformative; removed from both frames in one call each.
uninformative_columns = [
    'inspector_name', 'violator_name',
    'violation_zip_code', 'violation_street_number', 'violation_street_name',
    'mailing_address_str_number', 'mailing_address_str_name', 'city',
    'state', 'zip_code', 'non_us_str_code', 'country',
    'violation_description',
    'admin_fee', 'state_fee', 'late_fee',
]
test = test.drop(uninformative_columns, axis=1)
train = train.drop(uninformative_columns, axis=1)



# Convert datetime columns into years/months/days.
# Each date column is replaced by four integer-like part columns
# (<name>_month, <name>_year, <name>_day, <name>_dayofweek). The same steps run
# on test and train, so a single loop handles both instead of two pasted copies.
for column_name in ['ticket_issued_date', 'hearing_date']:
    print('Converting datetime to years/months/days...', column_name)

    for frame in (test, train):
        stamps = pd.to_datetime(frame[column_name])
        # Mutate in place: `frame` is only a loop alias, rebinding it would
        # not update the notebook-level `train` / `test` variables.
        frame.drop(column_name, axis=1, inplace=True)
        for part in ('month', 'year', 'day', 'dayofweek'):
            # np.array strips the index so the values are assigned by position.
            frame[column_name + '_' + part] = np.array(getattr(stamps.dt, part))

# Convert string columns to categorical.
# Exclude every numeric dtype (not only the default float/int widths) so the
# derived date-part columns are never mistaken for categoricals.
cols = test.select_dtypes(exclude=['number']).columns
len_train = len(train)
# Stack train on top of test so both share one category vocabulary.
temp_concat = pd.concat((train[cols], test[cols]), axis=0)

# Some filtering on violation_code to make it more manageable:
# keep only the text before the first space and before the first '('.
# The .str accessor passes missing codes through as NaN instead of raising.
violation_code = (temp_concat['violation_code']
                  .str.split(' ').str[0]
                  .str.split('(').str[0])
# Codes with no '-' after the first character are nulled out.
violation_code = violation_code.where(violation_code.str.find('-') > 0)

# Make all codes with < 10 occurrences null
counts = violation_code.value_counts()
rare_codes = counts[counts < 10].index
violation_code = violation_code.where(~violation_code.isin(rare_codes))

# Assign the cleaned column back in one step. The previous chained form
# (frame[col][mask] = value) may write to a temporary copy and leave the
# frame unchanged.
temp_concat['violation_code'] = violation_code

# One-hot encode every categorical column of the stacked train+test frame.
# Each indicator is prefixed with its source column so equal labels coming
# from different columns cannot overwrite one another.
dummy_frames = []
for column_name in cols:
    print('Converting to categorical...', column_name, '# variables:', len(temp_concat[column_name].unique()))
    dummy_frames.append(pd.get_dummies(temp_concat[column_name], prefix=column_name))

# temp_concat now holds only the indicator columns (the raw ones are gone).
temp_concat = pd.concat(dummy_frames, axis=1)

# Split back by position: the first len(train) rows came from train, the rest
# from test. A label lookup would duplicate rows if a ticket id ever appeared
# in both files.
n_train = len(train)
assert len(temp_concat) == n_train + len(test), "stacked frame does not match train + test"
train = pd.concat([train.drop(cols, axis=1), temp_concat.iloc[:n_train]], axis=1)
test = pd.concat([test.drop(cols, axis=1), temp_concat.iloc[n_train:]], axis=1)

# Every column of the processed test frame is a model input.
features = list(test.columns)
target = ['compliance']

print("Number of features:", len(features))

In [None]:
# TODO: Add this as a feature.
# from censusgeocode import CensusGeocode
# cg = CensusGeocode()

# def return_geoid(lon, lat):
#     # geoid = cg.address(address, city='Detroit', state='MI')
#     geoid = cg.coordinates(x=lon, y=lat)
#     try:
#        print(geoid[0]['Census Tracts'][0][u'GEOID'])
#     except:
#        print(lon, lat, geoid)
#     return geoid #int(geoid)

In [None]:
def _standardize(frame, center, scale):
    """Z-score `frame` with the given per-column center/scale; inf and NaN become 0."""
    scaled = (frame - center) / scale
    return scaled.replace([np.inf, -np.inf], np.nan).fillna(0)


# Train Set
y = np.array(train[target]).ravel()

# Normalisation statistics come from the training features only and are
# reused unchanged for the submission set.
mn = train[features].mean()
std = train[features].std()
X = _standardize(train[features], mn, std)

# Submissions Set
Xtest = _standardize(test[features], mn, std)

## Evaluation

### Define Models

In [None]:
# Imports
from sklearn.linear_model import SGDClassifier, Perceptron
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble     import RandomForestClassifier
from sklearn.ensemble     import RandomForestRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
# train_test_split is imported from model_selection (already used below for
# cross_val_score); the old sklearn.cross_validation module is gone in newer
# scikit-learn releases.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

# Seed shared by every stochastic model so repeated runs give the same scores.
SEED = 0

# Add classifiers
# Maps a short model name (also used in the submission file name) to an
# unfitted estimator. Despite the dict's name, it may hold regressors too.
classifiers = {
    "GradBoost_C": GradientBoostingClassifier(learning_rate=0.05,
                                        # 'sqrt' is what 'auto' meant for this classifier
                                        max_features='sqrt',
                                        max_depth=2,
                                        min_samples_leaf=3,
                                        min_samples_split=4,
                                        random_state=SEED),
    # NOTE(review): the key says "100x3" but the network is four hidden layers
    # of 50 units. Per the scikit-learn docs, learning_rate='adaptive' only
    # applies with solver='sgd', which is not the solver selected here.
    'MLP_C_100x3': MLPClassifier(hidden_layer_sizes=(50, 50, 50, 50), learning_rate='adaptive',
                                 random_state=SEED),
    "RF_R": RandomForestRegressor(n_estimators=25, random_state=SEED)
}

# Create Train/Test split for evaluation.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

### Evaluate with K-Fold Cross Validation

In [None]:
# Imports / helper functions
from sklearn.metrics import roc_auc_score

def roc_auc_scorer(y_true, y_pred):
    """ROC AUC for probability predictions, for use with make_scorer.

    y_true: true binary labels.
    y_pred: either the full (n_samples, 2) predict_proba output, from which
        the second column is scored, or an already-selected 1-D score vector.
        Both shapes are accepted because the shape handed to the scoring
        function appears to differ between scikit-learn releases
        (NOTE(review): confirm against the installed version).

    Returns the value of roc_auc_score(y_true, scores).
    """
    y_pred = np.asarray(y_pred)
    if y_pred.ndim == 2:
        # Second column is the one the original code scored.
        y_pred = y_pred[:, 1]
    return roc_auc_score(y_true, y_pred)

def accuracy_with_rescaling(y, y_pred):
    """Accuracy after min-max rescaling the predictions to [0, 1] and rounding.

    Potentially deprecated now that we understand that we shouldn't be
    rounding output.

    y: array-like of true labels (expected to be 0/1).
    y_pred: array-like of raw model outputs on any scale.

    Returns the fraction of rounded, rescaled predictions equal to `y`.
    Constant predictions have no range to rescale and are all treated as 0.
    """
    y = np.asarray(y)
    y_pred = np.asarray(y_pred, dtype=float)

    shifted = y_pred - y_pred.min()
    span = shifted.max()
    # Divide by the range (max - min), not by the raw maximum: only then does
    # the largest prediction map to exactly 1.
    scaled = shifted / span if span > 0 else shifted
    return np.mean(np.round(scaled) == y)

In [None]:
%%time

# Cross-validate every model and collect the per-fold ROC AUC scores.
scores = dict()
for classifier_type, clf in classifiers.items():
    # Choose the scorer up front instead of catching AttributeError around the
    # whole cross-validation run: that would hide unrelated AttributeErrors
    # and make models without predict_proba pay for a failed first pass.
    # If using cross_val_score, there is no need for train test split.
    if hasattr(clf, 'predict_proba'):
        # For the classifiers -- needs_proba=True and alternative scorer for classifiers
        scorer = make_scorer(roc_auc_scorer, needs_proba=True)
    else:
        # For the regressors -- score the raw predict() output
        scorer = make_scorer(roc_auc_score)

    model_score = cross_val_score(clf, X, y, cv=5, n_jobs=-1, scoring=scorer)

    # Record score
    scores[classifier_type] = model_score
    display(pd.DataFrame({classifier_type: model_score}))

# One column per model, one row per fold.
scores = pd.DataFrame(data=scores)
display(scores.describe())

In [None]:
# Three views of the same per-fold cross-validation scores, one figure each.
# Every figure gets a title and a y label so it can be read on its own.
score_plots = [
    (sns.stripplot, 'Comparison of Model Scores'),
    (sns.boxplot, 'Distribution of Model Scores'),
    (sns.barplot, 'Mean Model Scores'),
]
for draw, title in score_plots:
    plt.figure()
    draw(data=scores)
    plt.xticks(rotation=45)
    plt.ylabel('ROC AUC')
    plt.title(title)

In [None]:
%%time
## Submission
def make_submission(model_name, clf):
    """Fit `clf` on the full training set and write its test predictions to CSV.

    Reads the notebook globals X, y (training data), Xtest (submission
    features) and test_ticket_id (ids matching the rows of Xtest).

    model_name: label used in progress messages and in the output file name.
    clf: an unfitted sklearn-style estimator; it is fitted in place.

    Writes ./data/submission_<model_name>.csv with columns ticket_id and
    compliance. Returns None.
    """
    print('Fitting %s ...' % model_name)
    # Train classifier
    clf.fit(X, y)

    # Predict
    print('Generating Predictions ...')
    # Decide by capability rather than by catching AttributeError, so a real
    # AttributeError raised while predicting is not mistaken for "regressor".
    if hasattr(clf, 'predict_proba'):
        # For classifiers, we want the predicted probabilities (for label=True)
        y_pred = clf.predict_proba(Xtest)[:, 1]
    else:
        # If regressor, get the scaled predictions
        y_pred = np.asarray(clf.predict(Xtest), dtype=float)
        # If output is not in range [0..1], this converts to [0..1]
        y_pred = y_pred - y_pred.min()
        span = y_pred.max()
        # Constant predictions have zero range; skip the division so the
        # submission contains zeros rather than NaN.
        if span > 0:
            y_pred = y_pred / span

    # Save to CSV
    print('Saving to CSV ...')
    df = pd.DataFrame({"ticket_id": test_ticket_id, "compliance": y_pred},
                      columns=["ticket_id", "compliance"])
    df.to_csv("./data/submission_%s.csv" % model_name, index=False)
    print('finished!')
    print('---------------------')

def make_all_submissions(classifiers):
    """Write one submission file per model.

    classifiers: dict mapping a model name to an sklearn estimator; each pair
    is handed to make_submission in the dict's iteration order.
    """
    for model_name in classifiers:
        make_submission(model_name, classifiers[model_name])

# Make all submissions
make_all_submissions(classifiers)

# Make an individual submission (target_clf must be a key of `classifiers`)
# target_clf = "MLP_C_100x3"
# make_submission(target_clf, classifiers[target_clf])