## Objective - Understand and Predicting Property Maintenance Fines

This project is based on a data challenge from the Michigan Data Science Team ([MDST](http://midas.umich.edu/mdst/)). 

[Blight violations](http://www.detroitmi.gov/How-Do-I/Report/Blight-Complaint-FAQs) are issued by the city to individuals who allow their properties to remain in a deteriorated condition

In the data provided, each row corresponds to a single blight ticket, and includes information about when, why, and to whom each ticket was issued. 

I first analyze the data to understanding when and why a resident might fail to comply with a blight ticket. 

I then create a model using gradient boosted decision trees to predict whether a given blight ticket will be paid on time.

In [1]:
import pandas as pd
import numpy as np

def blight_model():
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV

    #import the data sets
    train = pd.read_csv('train.csv', encoding = "ISO-8859-1")
    test = pd.read_csv('test.csv', encoding = "ISO-8859-1")
    addresses = pd.read_csv('addresses.csv', encoding = "ISO-8859-1")

    #merge the addresses with the latitude/longitude file
    train = pd.merge(train,addresses, how = 'inner', left_on = 'ticket_id', right_on='ticket_id')
    test = pd.merge(test, addresses, how = 'inner', left_on = 'ticket_id', right_on='ticket_id')

    #remove the parts of the train data where compliance = NaN
    # subset: labels along other axis to consdier, e.g. if you are dropping rows these would be a list of columns to include
    train = train.dropna(subset = ['compliance'])

    #convert the compliance into labels for machine learning
    train['compliance'] = train['compliance'].astype(int)

    #create a dictionary of columns in the data that one wants to convert to category
    convert_columns={'country': 'category',
                     'non_us_str_code': 'category',
                     'compliance': 'category',
                     'state': 'category',
                     'zip_code': 'category'
                    }
    #conver the dictionary of columns
    for df in [test,train]:
        for col, col_type in convert_columns.items():
            if col in df:
                if col_type == 'category':
                    df[col] = df[col].replace(np.nan, "NA", regex = True).astype(col_type)
                elif col_type == 'int':
                    df[col] = df[col].replace(np.nan, 0, regex=True).astype(col_type)

    #remove the unneeded columns from X sets
    common_cols_to_drop = ['agency_name', 'inspector_name', 'mailing_address_str_number',
                               'violator_name', 'violation_street_number', 'violation_street_name',
                               'mailing_address_str_name', 'address', 'admin_fee', 'violation_zip_code',
                               'state_fee', 'late_fee', 'ticket_issued_date', 'hearing_date', 'violation_description',
                               'fine_amount', 'clean_up_cost', 'disposition', 'grafitti_status',
                               'violation_code', 'city']
    #create the column list to drop from training data
    train_cols_to_drop = ['payment_status', 'payment_date', 'balance_due', 'payment_amount'] + common_cols_to_drop


    #in the train data, drop to train_cols_to_drop
    train = train.drop(train_cols_to_drop, axis=1).set_index('ticket_id')

    # in the test data, drop common_cols_to_drop
    test = test.drop(common_cols_to_drop, axis=1).set_index('ticket_id')

    #get the train data labels
    y_train = train['compliance']

    #get the additional columns to drop from the training data
    X_train_cols_to_drop = ['compliance', 'compliance_detail', 'collection_status']

    #get the unlabeled training data
    train = train.drop(X_train_cols_to_drop, axis = 1)

    #get the category columns so that we can convert them to integers
    cat_columns = train.select_dtypes(['category']).columns

    #convert the category columns into categorical dummy variables
    for df in [test, train]:
        df[cat_columns] = df[cat_columns].apply(lambda x:x.cat.codes)

    #create the X_train data
    X_train = train.copy()

    #use a gradient boosted ensemble of decision trees
    grid_values = {'learning_rate': [0.01, 0.1, 1], 'max_depth': [3, 4, 5]}
    clf = GradientBoostingClassifier(random_state = 0)
    grid_clf_auc = GridSearchCV(clf, param_grid = grid_values, scoring = 'roc_auc')
    grid_clf_auc.fit(X_train, y_train)
    probs = grid_clf_auc.predict_proba(test)[:,1]
    result = pd.Series(probs, index = test.index)
    
    return result

In [2]:
blight_model()

  if self.run_code(code, result):


ticket_id
284932    0.205176
285362    0.094864
285361    0.228263
285338    0.205176
285346    0.228263
285345    0.308038
285347    0.173571
285342    0.608790
285530    0.279054
284989    0.048395
285344    0.176957
285343    0.094864
285340    0.094864
285341    0.176957
285349    0.181394
285348    0.145468
284991    0.106330
285532    0.119426
285406    0.119426
285001    0.119426
285006    0.094864
285405    0.065633
285337    0.439720
285496    0.176957
285497    0.205176
285378    0.039963
285589    0.107934
285585    0.205176
285501    0.228263
285581    0.092503
            ...   
376367    0.119426
376366    0.112573
376362    0.112573
376363    0.205176
376365    0.119426
376364    0.112573
376228    0.112573
376265    0.112573
376286    0.553783
376320    0.112573
376314    0.112573
376327    0.553783
376385    0.553783
376435    0.119426
376370    0.553783
376434    0.176957
376459    0.228263
376478    0.042838
376473    0.107662
376484    0.084503
376482    0.109218
37