# Boston_Potholes_DataRobot_Model

Boston Pothole age predictor

In [1]:
!pip install datarobot

[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import pandas_profiling as pdp
import matplotlib
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
df = pd.read_csv("../../data/311__Service_Requests.csv", parse_dates=True, encoding='UTF-8', infer_datetime_format=True)
print('Loaded 311 Service Requests RAW')
print('(Rows,Columns):{}'.format(df.shape))

Loaded 311 Service Requests RAW
(Rows,Columns):(973993, 33)


In [4]:
df.head(1)

Unnamed: 0,CASE_ENQUIRY_ID,OPEN_DT,TARGET_DT,CLOSED_DT,OnTime_Status,CASE_STATUS,CLOSURE_REASON,CASE_TITLE,SUBJECT,REASON,...,precinct,land_usage,LOCATION_STREET_NAME,LOCATION_ZIPCODE,Property_Type,Property_ID,LATITUDE,LONGITUDE,Source,Geocoded_Location
0,101000493034,10/16/2012 02:45:25 PM,10/18/2012 02:45:24 PM,10/17/2012 07:36:59 AM,ONTIME,Closed,Case Resolved,Sticker Request,Public Works Department,Recycling,...,1602,,36 Park St,2122.0,,,42.3011,-71.0543,Self Service,


In [5]:
# Find the PotHoles 
potdf = df.loc[df['TYPE'].str.contains('Pot')]
potdf.shape

(77677, 33)

In [6]:
# Only care about potholes reported by Citizen sources
non_employee_source = ['Citizens Connect App', 'Constituent Call', 'Self Service', 'Twitter']

# Fixed Potholes Data Parsing for Citizens
drdf = potdf.loc[(potdf['Source'].isin(non_employee_source)) ]



In [7]:
# Drop Unused Data Columns
drdf= drdf.drop('TARGET_DT', 1,errors='ignore')
drdf= drdf.drop('expected_age', 1,errors='ignore')
drdf= drdf.drop('SubmittedPhoto', 1,errors='ignore')
drdf= drdf.drop('ClosedPhoto', 1,errors='ignore') # cleaned_df['ClosedPhoto'][147]
drdf= drdf.drop('age_in_days', 1,errors='ignore') # cleaned_df['ClosedPhoto'][147]
drdf= drdf.drop('land_usage', 1,errors='ignore')
drdf= drdf.drop('Property_Type', 1,errors='ignore')
drdf= drdf.drop('Property_ID', 1,errors='ignore')
drdf= drdf.drop('age', 1,errors='ignore')
drdf= drdf.drop('performance', 1,errors='ignore')
drdf= drdf.drop('LATITUDE', 1,errors='ignore')
drdf= drdf.drop('LONGITUDE', 1,errors='ignore')
drdf= drdf.drop('Geocoded_Location', 1,errors='ignore')

# non_employee_source = ['Citizens Connect App','Constituent Call', 'Self Service', 'Twitter']
# drdf = drdf.loc[(potdf['CASE_STATUS'] == 'Closed' ) &(potdf['Source'].isin(non_employee_source)) ]
drdf = drdf.drop('CLOSURE_REASON',1,errors='ignore')

drdf= drdf.drop('CASE_STATUS', 1, errors='ignore') #we dont need this anymore
drdf.shape

(20853, 22)

In [8]:
# Compute Age

def get_closed_age(df):
    return pd.to_datetime(df['CLOSED_DT']) - pd.to_datetime(df['OPEN_DT'])

def get_todays_age(df):
    return pd.to_datetime(df['CLOSED_DT']) - pd.to_datetime(df['OPEN_DT'])

potdf['age'] = get_closed_age(df)

drdf= drdf.drop('CLOSED_DT', 1,errors='ignore')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [9]:
# Make Label

closedpotdf = potdf['age']
drdf['age_in_days'] = closedpotdf.apply(lambda x: x.days if isinstance(x, pd.Timedelta) else 0)
drdf['age_in_days'] = drdf['age_in_days'] #.dropna()# potdf['expected_age']

drdf['age_in_hours'] = closedpotdf.apply(lambda x: int(x.to_pytimedelta().total_seconds() /60 /60 ) if isinstance(x, pd.Timedelta) else 0)
drdf['age_in_hours'] = drdf['age_in_hours'] #.dropna()# potdf['expected_age']

In [10]:
drdf = drdf.loc[(potdf['CASE_STATUS'] == 'Closed' )] # filter out closed

In [11]:
# Regression: Age in Hours Label

drdfHours = drdf.loc[(potdf['CASE_STATUS'] == 'Closed' )] # filter out closed
drdf.head(10)
drdf.to_csv('../../data/dr_potholes_hours.csv')

In [12]:
# Binary Age Classifier (24, 48, 72, 168 hours old)

drdrAgeLabel = drdf.loc[(potdf['CASE_STATUS'] == 'Closed' )] # filter out closed
drdrAgeLabel= drdrAgeLabel.drop('age', 1,errors='ignore')
drdrAgeLabel= drdrAgeLabel.drop('age_in_days', 1,errors='ignore')

age_thresholds = [24, 48, 72, 96, 120, 144, 168]

for thresholds in age_thresholds:
    drdrAgeLabel['fast_fix'] = drdf['age_in_hours'] <= thresholds
    drdrAgeLabel= drdrAgeLabel.drop('age_in_hours', 1,errors='ignore')

    drdrAgeLabel.to_csv('../../data/dr_potholes_{}hrs.csv'.format(thresholds))

In [46]:
import datarobot as dr
dr.Client(token='VAB03kNIRA4tyHqvuvm_sJPmhKu0LVqQ', endpoint='https://uniapp.datarobot.com/api/v2')

<datarobot.rest.RESTClientObject at 0x7f11fe5419b0>

In [47]:
dr.Project.list()

[Project(Pothole_Fixed_In_24hr?),
 Project(Pothole_24hr_fix_model),
 Project(PotholeFixer_24hrs_old),
 Project(PotholeFixer_24hrs_old),
 Project(Pothole_24hr_fix_model),
 Project(PotholeFixer_24hrs_old),
 Project(Pothole_24hr_fix_model),
 Project(168hrFix-v1 Boston Potholes),
 Project(72hrFix-v1 Boston Potholes),
 Project(48hrFix-v1 Boston Potholes),
 Project(24hrFix-v1 Boston Potholes),
 Project(pots-v2 Accurate Boston Potholes),
 Project(pots-v1 Boston Potholes Age In Hours),
 Project(10k_),
 Project(cars)]

In [None]:
import time
projects = []
for threshold in age_thresholds:
    print(threshold)
    file = "../../data/dr_potholes_{}hrs.csv".format(threshold)
    print(file)
    name ="Pothole_Fixed_In_{}hr?".format(threshold)
    print(name)
    p = dr.Project.create(file, project_name=name)
    p.set_target('fast_fix')
    projects.append(p)

24
../../data/dr_potholes_24hrs.csv
Pothole_Fixed_In_24hr?
48
../../data/dr_potholes_48hrs.csv
Pothole_Fixed_In_48hr?


In [None]:
p.set_target('fast_fix')

In [22]:
p.advanced_options

{'response_cap': False, 'smart_downsampled': False}

In [43]:
file = "../../data/dr_potholes_{}hrs.csv".format(24)
name ="PotholeFixer_{}hrs_old".format(24)
print(file)
print(name)
p2 = dr.Project.start(file, "fast_fix", project_name=name)

../../data/dr_potholes_24hrs.csv
PotholeFixer_24hrs_old


ClientError: 422 client error: {'errors': {'target': 'is required'}, 'message': 'Invalid field data'}

In [29]:
p2.get_features()

[Feature(neighborhood),
 Feature(CASE_ENQUIRY_ID),
 Feature(LOCATION_ZIPCODE),
 Feature(QUEUE),
 Feature(Source),
 Feature(Location),
 Feature(city_council_district),
 Feature(Department),
 Feature(OPEN_DT (Year)),
 Feature(ward),
 Feature(LOCATION_STREET_NAME),
 Feature(neighborhood_services_district),
 Feature(Unnamed: 0),
 Feature(OPEN_DT (Day of Month)),
 Feature(OnTime_Status),
 Feature(fast_fix),
 Feature(police_district),
 Feature(CASE_TITLE),
 Feature(OPEN_DT (Day of Week)),
 Feature(OPEN_DT),
 Feature(REASON),
 Feature(precinct),
 Feature(pwd_district),
 Feature(TYPE),
 Feature(OPEN_DT (Month)),
 Feature(fire_district),
 Feature(SUBJECT)]

In [40]:
p2.get_featurelists()[0].id
p2.start?
