In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import string
import time

# Raise the column display limit to 30 for DataFrame output.
pd.options.display.max_columns = 30

In [2]:
# Load the raw CSV into `df`.

# Columns named in the dtype map: each is read as a string, except the two
# cost columns, which are parsed as floats.
cols = [
    'PROJECT_ID', 'ABSTRACT', 'FY.x', 'PROJECT_TERMS', 'PROJECT_TITLE',
    'DEPARTMENT', 'AGENCY', 'IC_CENTER', 'PROJECT_NUMBER',
    'PROJECT_START_DATE', 'PROJECT_END_DATE', 'CONTACT_PI_PROJECT_LEADER',
    'OTHER_PIS', 'CONGRESSIONAL_DISTRICT', 'DUNS_NUMBER', 'ORGANIZATION_NAME',
    'ORGANIZATION_CITY', 'ORGANIZATION_STATE', 'ORGANIZATION_ZIP',
    'ORGANIZATION_COUNTRY', 'BUDGET_START_DATE', 'BUDGET_END_DATE',
    'CFDA_CODE', 'FY.y', 'FY_TOTAL_COST', 'FY_TOTAL_COST_SUB_PROJECTS',
]
dtypes = dict.fromkeys(cols, 'str')
dtypes.update({"FY_TOTAL_COST": 'float', "FY_TOTAL_COST_SUB_PROJECTS": 'float'})

df = pd.read_csv(
    '../../data/original/working_federal_reporter_2020.csv',
    dtype=dtypes,
    engine='python',
)
# Report (rows, columns) of the loaded frame.
print(df.shape)

(1156137, 26)


In [3]:
# Keep only the rows that have a non-null ABSTRACT, and report how many rows
# were dropped by comparing the row count before and after the filter.

l1 = df.shape[0]
df = df.loc[df['ABSTRACT'].notna()]
l2 = df.shape[0]

print(l1 - l2, "null ABSTRACTs removed")

42380 null ABSTRACTs removed


In [4]:
# Drop rows whose ABSTRACT is exactly one of the two placeholder strings
# 'ABSTRACT NOT PROVIDED' or 'No abstract provided'.
# The comparison is an exact string match: variants that differ in case or in
# leading/trailing whitespace are NOT removed by this cell.

l1 = df.shape[0]
is_placeholder = df['ABSTRACT'].isin(['ABSTRACT NOT PROVIDED', 'No abstract provided'])
df = df.loc[~is_placeholder]
l2 = df.shape[0]

print(l1 - l2, "ABSTRACT NOT PROVIDED and \"No abstract provided\" removed")

1324 ABSTRACT NOT PROVIDED and "No abstract provided" removed


In [5]:
# Of the two fiscal-year columns (FY.x, FY.y), FY.x is treated as the reliable
# one, so it becomes the canonical FY column. FY.y is left in place unchanged.

df = df.rename({'FY.x': 'FY'}, axis='columns')

In [7]:
# Count of missing values in each column of df.
df.isna().sum()

PROJECT_ID                          0
ABSTRACT                            0
FY                                  0
PROJECT_TERMS                    3536
PROJECT_TITLE                       0
DEPARTMENT                          0
AGENCY                              0
IC_CENTER                      595857
PROJECT_NUMBER                      0
PROJECT_START_DATE             149391
PROJECT_END_DATE               141193
CONTACT_PI_PROJECT_LEADER          43
OTHER_PIS                      969227
CONGRESSIONAL_DISTRICT          69263
DUNS_NUMBER                     12047
ORGANIZATION_NAME                1626
ORGANIZATION_CITY                5548
ORGANIZATION_STATE              12162
ORGANIZATION_ZIP                46167
ORGANIZATION_COUNTRY             5341
BUDGET_START_DATE              632754
BUDGET_END_DATE                632805
CFDA_CODE                      192234
FY.y                               19
FY_TOTAL_COST                  275416
FY_TOTAL_COST_SUB_PROJECTS    1041539
dtype: int64

In [8]:
# Subset of df limited to the rows that have no PROJECT_START_DATE.
temp = df.loc[df["PROJECT_START_DATE"].isna()]

In [10]:
# Number of rows in temp.
temp.shape[0]

149391

In [9]:
# Count of missing values in each column of temp.
temp.isna().sum()

PROJECT_ID                         0
ABSTRACT                           0
FY                                 0
PROJECT_TERMS                    816
PROJECT_TITLE                      0
DEPARTMENT                         0
AGENCY                             0
IC_CENTER                      55952
PROJECT_NUMBER                     0
PROJECT_START_DATE            149391
PROJECT_END_DATE              134650
CONTACT_PI_PROJECT_LEADER          0
OTHER_PIS                     146953
CONGRESSIONAL_DISTRICT         41637
DUNS_NUMBER                     3363
ORGANIZATION_NAME                196
ORGANIZATION_CITY               3374
ORGANIZATION_STATE              3938
ORGANIZATION_ZIP               41014
ORGANIZATION_COUNTRY            3374
BUDGET_START_DATE              79267
BUDGET_END_DATE                79267
CFDA_CODE                     149388
FY.y                               0
FY_TOTAL_COST                 112030
FY_TOTAL_COST_SUB_PROJECTS     88016
dtype: int64

In [None]:
# Back-fill missing project dates in two passes; each pass only touches values
# that are still null:
#   1. use the matching budget date (BUDGET_START_DATE / BUDGET_END_DATE);
#   2. use the row's FY value.
# NOTE(review): pass 2 fills PROJECT_END_DATE wherever it is still null,
# independently of whether PROJECT_START_DATE was missing - confirm that this
# is the intended rule.
# NOTE(review): FY presumably holds a fiscal year rather than a full date, so
# values filled in pass 2 may not share the format of the other dates - verify
# before parsing these columns as dates.
date_fallbacks = (
    ('PROJECT_START_DATE', 'BUDGET_START_DATE'),
    ('PROJECT_END_DATE', 'BUDGET_END_DATE'),
)
for project_col, budget_col in date_fallbacks:
    df[project_col] = df[project_col].fillna(df[budget_col]).fillna(df['FY'])