In [58]:
import pandas as pd
import numpy as np
from functools import partial

In [330]:
def load_data():
    '''
    Reads in several hundred csvs into pandas dataframes, concatenating them together.
    Then converts date fields into datetime format
    '''

    date_fields = [u'animalAdoptedDate','animalAvailableDate', u'animalBirthdate',
                u'animalFoundDate', u'animalKillDate',u'animalUpdatedDate']
    df = pd.read_csv('/Users/tracylee/raw_dogs/0.csv', parse_dates=date_fields)
    for i in xrange(1,100):
        df2 = pd.read_csv('/Users/tracylee/raw_dogs/{}.csv'.format(i), parse_dates=date_fields)
        df = pd.concat([df, df2])
    df = df.sort(['animalOrgID','animalID'])
    df = df.reset_index()
    return df

def orgs_of_interest(df):
    org_df = df['animalOrgID'].value_counts()
    org_df = pd.DataFrame(org_df)
    org_df = org_df.reset_index()
    org_df.columns = ['OrgID', 'count']
    orgs_of_interest = org_df[org_df['count']>=1000]['OrgID'].values
    return orgs_of_interest

def start_end_date(df):
    df['start_date'] = df.apply(start_date,axis=1)
    df['end_date'] = df.apply(end_date,axis=1)
    return df

def impute_dates_all(df, orgs_of_interest):
    temp_df = impute_dates_one(df,orgs_of_interest[0])

    for org in orgs_of_interest[1:]:
        temp_df2 = impute_dates_one(df,org)
        temp_df = pd.concat([temp_df,temp_df2])

    return temp_df

def start_date(row):
    avail_date = row['animalAvailableDate']
    found_date = row['animalFoundDate']

    if (not pd.isnull(avail_date) and not pd.isnull(found_date)):
        date_diff = (avail_date - found_date)
        if (date_diff.days>-10 and date_diff.days<300):
            return avail_date
    elif not pd.isnull(avail_date):
        return avail_date
    elif pd.isnull(found_date)==False:
        return found_date
    else:
        return np.nan

def end_date(row):
    adopt_date = row['animalAdoptedDate']
    kill_date = row['animalKillDate']

    if not pd.isnull(adopt_date):
        return adopt_date
    elif not pd.isnull(kill_date):
        return kill_date
    else:
        return np.nan

def impute_dates_one(df, org):
    temp_df = df[df['animalOrgID']==org]
    dates = temp_df[pd.isnull(temp_df['start_date'])==False]['start_date'].values
    ids = temp_df[pd.isnull(temp_df['start_date'])==False]['animalID'].values
    
    if len(ids)<2:
        return None
    else:
        temp_df=temp_df[(temp_df['animalID']>ids.min()) & (temp_df['animalID']<ids.max())]
        temp_df['start_lower']=temp_df['animalID'].apply(lambda x: dates[np.argmax(x<ids)-1])
        temp_df['start_higher']=temp_df['animalID'].apply(lambda x: dates[np.argmax(x<=ids)])
        temp_df['start_diff'] = temp_df['start_higher'] - temp_df['start_lower']
        temp_df['start_diff'] = temp_df['start_diff'].apply(lambda x: x.days)
        temp_df['start_add'] = temp_df['start_diff']
        temp_df = temp_df[abs(temp_df['start_diff'])<=30]
        temp_df['start_add'] = pd.to_timedelta((temp_df['start_add']/2),unit='D')
        try:
            temp_df['start_date'] = temp_df['start_lower'] + temp_df['start_add']
        except:
            return None

    return temp_df
    

def middle_date(row):
    lower = min(row['start_lower'],row['start_higher'])
    diff = abs(row['start_higher'] - row['start_lower'])/2
    return lower + diff

def labels(df):
    df['killed'] = df[u'animalKillDate'].apply(lambda x: 1 if not pd.isnull(x) else 0)
    df['adopted']= df['animalStatus'].apply(lambda x: 1 if x=='Adopted' else 0)
    df['censored'] = df['animalStatus'].apply(lambda x: 1 if x=='Available' else 0)
    df['time_range'] = (df['end_date']-df['start_date'])
    df['time_range'] = df['time_range'].apply(lambda x: x.days)
    return df

In [331]:
df = load_data()



In [338]:
df.shape

(7084, 67)

In [333]:
orgs = orgs_of_interest(df)

In [334]:
df = start_end_date(df)

In [335]:
df = impute_dates_all(df, orgs)

In [337]:
df = labels(df)

TypeError: ufunc subtract cannot use operands with types dtype('O') and dtype('<M8[ns]')