In [2]:
import pandas as pd, re, datetime as dt, numpy as np

In [3]:
## Handy stuff to break out into a module at some point

# Replace non-alpha with _ and lowercase column names
def clean_cols (df):
    df.columns = map (lambda x: re.sub(r'[^a-z]+', '_', x.lower().strip()), df.columns)
    
# Convert a date field
def conv_date(df, col):
    df[col] = pd.to_datetime(df[col])
    
def days_from_timedelta (td):
    return td.astype('timedelta64[D]') / np.timedelta64(1, 'D')


In [41]:
smallbiz = pd.read_csv("data/Small_Business_Express_Tabulated_Data (1).csv")
clean_cols(smallbiz)
conv_date(smallbiz, "contract_requirement_target_date")
conv_date(smallbiz, "contract_execution_date")
smallbiz.dtypes



fiscal_year                                                  int64
company                                                     object
address                                                     object
municipality                                                object
state                                                       object
zip_code                                                     int64
industry                                                    object
naics_code                                                  object
minority_or_women_owned                                     object
contract_execution_date                             datetime64[ns]
grant_amount                                                object
loan_amount                                                 object
total_assistance                                            object
total_project_cost                                          object
amount_leveraged                                            ob

In [42]:
smallbiz["grant_amount"] = smallbiz["grant_amount"].str.replace("$","")
smallbiz["loan_amount"] = smallbiz["loan_amount"].str.replace("$","")
smallbiz["total_assistance"] = smallbiz["total_assistance"].str.replace("$","")

In [43]:
pd.to_numeric(smallbiz["total_assistance"]).sum()

186642220.0

In [44]:
pd.to_numeric(smallbiz["grant_amount"]).sum() + pd.to_numeric(smallbiz["loan_amount"]).sum() 

186437222.0

### How many companies remain active?

In [45]:
smallbiz["status"].value_counts()

Active             1203
Out of Business      32
Default               1
Name: status, dtype: int64

### How many companies with past target dates have been reviewed?

In [46]:
# Use March 1 as cut-off date

today = dt.datetime(2016,3,1)

In [47]:
passed = smallbiz[smallbiz["contract_requirement_target_date"] < today]
passed["job_obligation_status"].value_counts()

Pending          647
Met              115
Not Met           62
Waived             1
Partially Met      1
Name: job_obligation_status, dtype: int64

In [50]:
passed["days_passed"] = map (lambda x : (today - x).days, passed["contract_requirement_target_date"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [51]:
passed["days_passed"].describe()

count     826.000000
mean      411.044794
std       215.302655
min         2.000000
25%       258.000000
50%       420.000000
75%       578.000000
max      1344.000000
Name: days_passed, dtype: float64

In [52]:
unreviewed = passed[passed["job_obligation_status"] == "Pending"]

def older_than(n):
    return unreviewed[unreviewed["days_passed"] > n]["days_passed"].count()
older_than (30)

626

In [56]:
unreviewed["days_passed"].describe()

count     647.000000
mean      385.989181
std       211.005447
min         2.000000
25%       219.000000
50%       374.000000
75%       552.000000
max      1069.000000
Name: days_passed, dtype: float64

In [53]:
days_col = []
counts_col = []
steps = 10
stepsize = 90
minus = 0 
for i in range (steps):
    days = (steps - 1 - i) * stepsize
    days_col.append (days)
    count = older_than (days) - minus
    minus = minus + count
    #print str(days) + "\t" + str(count)
    #print minus

    counts_col.append (count)

older = pd.Series(counts_col, index=days_col).to_frame().reset_index()
older.columns = ["days past target", "project count"]
older

Unnamed: 0,days past target,project count
0,810,15
1,720,11
2,630,56
3,540,93
4,450,79
5,360,76
6,270,115
7,180,75
8,90,71
9,0,56


In [54]:
older["project count"].sum()

647

In [55]:
older_than (180)

520

In [57]:
older_than (730)

25

### How many met or failed to meet job creation and retention goals?


In [138]:
passed["job_obligation_status"].value_counts()

Pending          647
Met              115
Not Met           62
Waived             1
Partially Met      1
Name: job_obligation_status, dtype: int64

In [139]:
passed["job_obligation_status"].value_counts().sum()

826

In [145]:
met_count = 115
not_met_count = 62
success_rate = met_count * 100 / (not_met_count + met_count)
fail_rate = not_met_count * 100 / (not_met_count + met_count)
print "success: " + str(round(success_rate)) + "%"
print "failure: " + str(round(fail_rate)) + "%"

success: 64.0%
failure: 35.0%


In [142]:
## We have some supplemental data on partner-audited projects
partners = pd.read_excel("data/Partners Job Audit.xlsx", skiprows=1)
partners["Met/Not Met/Pending"].value_counts()

Met        27
Not Met    16
Pending     6
Name: Met/Not Met/Pending, dtype: int64

In [143]:
## So it appears there are 115+27 met versus 62+16 not met when combining the two data sets
met_count = 115 + 27 
not_met_count = 62+16
success_rate = met_count * 100 / (not_met_count + met_count)
fail_rate = not_met_count * 100 / (not_met_count + met_count)
print "success: " + str(round(success_rate)) + "%"
print "failure: " + str(round(fail_rate)) + "%"

success: 64.0%
failure: 35.0%


### How many jobs were created and how does that compare with the total goal?


In [134]:
def jobs_at_start (df):
    return df["per_application_full_time_ct_jobs_at_application"].sum() +\
df["per_application_part_time_ct_jobs_at_application"].sum()

def jobs_goal (df):
    return df["contract_requirement_jobs_to_be_retained"].sum() + df["contract_requirement_jobs_to_be_created"].sum()

def jobs_at_end (df):
    return df["actual_jobs_at_time_of_review"].sum()

def jobs_gain (df):
    return jobs_at_end (df) - jobs_at_start(df)

In [128]:
jobs_goal(smallbiz)

19184.0

In [129]:
jobs_goal(passed)

11966.0

In [130]:
jobs_at_end(passed)

2774.0

In [131]:
reviewed = passed[passed["job_obligation_status"] != "Pending"]
jobs_at_end(reviewed)

2774.0

In [132]:
jobs_goal(reviewed)

2744.0

In [133]:
jobs_at_start(reviewed)

2212.0

In [135]:
jobs_gain(reviewed)

562.0

## Conclusions


As of March 1 there were 826 projects with target dates that had passed.

Out of those, 647 had no recorded audit result.

Out of those that had been audited, 115 met their goals and 62 failed to meet their goals. Adding in projects audited by partners, the total is (115+27) and (62+16). In either case, the failure/success rate is 64%/35%.

The total obligations for jobs created and retained is 19,184, not including partner projects.

Focusing only on the jobs that have been reviewed, the total job goal was 2,744 and the total jobs at time of reviews was 2,774. So even with a 35% rate of businesses failing to meet job goals, more jobs were created or retained overall than required. Of those reviewed projects, there was a net gain of 562 part time and full time jobs.
