In [1]:
# %load_ext autoreload
# %autoreload 2

# Version Notes: 

### v1: 
* add Data_HourMinute for all exported datasources


# Tip for quick search

* Needs attention: the place where needs update or better logic
* question to be answered: the place where things are still not clear
* Manual Check: Unit test where you can drill in to find the data that leads to the check results for a specific project and specific check
* TODO: things needs to be done
* bookmark: stop point from last visit


# Admin Notes:


1. The AMTool dataset is archived daily as csv files and used for the project book check. 
The csv files are located at: 
r'\\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Tableau Dashboards\DataLake'

2. The excel input files are checked daily and archived with datestamp whenever it is modified.
The continuously updated excel input files are located at: r'\\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Projectbook_WorkingFolder\excel'
The excel input file are archived at: r'\\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Tableau Dashboards\Data_MiscInput'
To recover the archived excel file used in project book check for a target date, select the excel file with latest datestamp but is still earlier than the target date.

3. The check summary export action is logged daily. It can be used for daily monitoring. 
The file export log is located at: \\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Projectbook_WorkingFolder\output_internal\log

4. The published data are at:

    * csv files for district asset manager: http://svgcshopp.dot.ca.gov/DataLake/ProjectBookCheck/
    * csv files for HQ AM: \\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Projectbook_WorkingFolder\output_internal
    * tableau workbook with live data source: https://tableau.dot.ca.gov/#/site/AssetManagement/workbooks/1815/views


# General Approach

use Minor raw data as basis for data checks. 
Each project only occupies one line

can expand columns, only if it will not create duplicate rows in the SHOPP raw dataset. 


# Data clean process

* funding amount: remove dollar sign, 
* fill missing value, string, numerical, 
* remove leading single quote for string value
* strip off leading and trailing space 

* regulate column names




# Import common modules

<a id='TableOfContents'></a>

# Table Of Contents

## Data Preprocessing

### [Global Constants](#GlobalConstants)


### [Load and cleanup source data](#Read_Data)


## Add fields to SHOPP raw data (calculate and join)
* [Calculated Fields](#AddDataColumns)
* [Join Tables](#DataJoining)



## Data Check and Export


## [Data Check List](#Issue_Table1)
The main table of check issues, 
one issue per row, 


* [Will_this_project_be_included_in_the_Project_Book](#Will_this_project_be_included_in_the_Project_Book)
* [Does_project_cost_exceed_Minor_Program_limits](#Does_project_cost_exceed_Minor_Program_limits)



## [Export Internal Check Summary](#Export_internal_check_summary)
* internal check summary (csv)


## [Final Clean Up](#FinalCleanUp)


In [2]:

from datetime import datetime
import os.path

# import requests
import pandas as pd

import numpy as np
import re

import shutil

In [3]:
import time
start_time = time.time()

In [4]:
#show dataframe without skip column
pd.options.display.max_columns = 100

In [5]:
# from config_datasource import *
import projectbookcheck_utilityfunction as uf

You are using the Extract API 2.0, please save the output as .hyper format


<a id='GlobalConstants'></a>
## Global Constants

In [6]:
# # use 'csv' to read data from data lake, use 'live' to read data directly from AmTool Server
# DATA_SOURCE_TYPE = 'csv'

# # DATALAKE_FOLDER = r'\\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Tableau Dashboards\DataLake'

# #input data
# DATALAKE_FOLDER = r'\\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Tableau Dashboards\DataLake'
# PROJECTBOOKCHECK_INPUT_FOLDER = r'\\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Projectbook_WorkingFolder\excel'

# #output data
# DATALAKE_HTTPSEVER_FOLDER = 'C:\inetpub\wwwroot\DataLake\ProjectBookCheck'
# PROJECTBOOKCHECK_OUTPUT_FOLDER = r'\\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Projectbook_WorkingFolder\output_internal'

# #log data
# log_folder = r'\\ct.dot.ca.gov\dfshq\DIROFC\Asset Management\4e Project Book\Projectbook_WorkingFolder\output_internal\log'

# TARGET_FY = 2021


# # CURRENT_FY

# TARGETDATE = datetime.today().strftime("%m-%d-%Y")

In [7]:
from constants import *

In [8]:
filename = 'TenYrShopp_RawData_'
path_to_file = r'{}\{}.csv'.format(DATALAKE_HTTPSERVER_FOLDER, filename)
t = os.path.getmtime(path_to_file)

# File_TimeStamp = datetime.fromtimestamp(t).strftime("%m-%d-%Y_%H-%M")
Data_TimeStamp = datetime.fromtimestamp(t).strftime("%m-%d-%Y %H:%M:%S")

TARGETDATE = datetime.fromtimestamp(t).strftime("%m-%d-%Y")

CURRENT_FY = uf.fiscalyear(datetime.today())

<a id='Read_Data'></a>

# Read Data


In [9]:
File_TimeStamp = ''

if DATA_SOURCE_TYPE == 'csv':
    filename = 'Minor_Project_Details_Raw_Data_'
    df_Minor_raw_data = pd.read_csv(r'{}\{}{}.csv'.format(DATALAKE_FOLDER, filename, File_TimeStamp))

    filename = 'Minor_Performance_Raw_Data_'
    df_Minor_perf_raw_data = pd.read_csv(r'{}\{}{}.csv'.format(DATALAKE_FOLDER, filename, File_TimeStamp))

    filename = 'Programming_Summary_'
    df_Programming_Summary = pd.read_csv(r'{}\{}{}.csv'.format(DATALAKE_FOLDER, filename, File_TimeStamp))

    filename = 'Minor_Project_Postmile_Check_'
    df_Minor_pm_check = pd.read_csv(r'{}\{}{}.csv'.format(DATALAKE_FOLDER, filename, File_TimeStamp), header = 0)
    
    filename = 'Minor_Rawdata_Pavement_Worksheet_'
    df_Minor_Pavement_WS = pd.read_csv(r'{}\{}{}.csv'.format(DATALAKE_FOLDER, filename, File_TimeStamp), header = 0, skiprows =2)
    
    filename = 'Minor_Rawdata_TMS_Worksheet_'
    df_Minor_TMS_WS = pd.read_csv(r'{}\{}{}.csv'.format(DATALAKE_FOLDER, filename, File_TimeStamp), header = 0)


else:
    print('skip getting csv data.')

In [10]:
filename = 'Minor_Activity_Crosswork.xlsx'

df_Activity_CrossWalk = pd.read_excel(r'{}\{}'.format(PROJECTBOOKCHECK_INPUT_FOLDER, filename))

temp = df_Activity_CrossWalk.groupby('Main Activity Category')['ActID'].agg(list).reset_index()

dict_Activity_CW = dict(zip(temp['Main Activity Category'], temp['ActID']))

In [11]:
#question answered: 2021 and 2022 approved list project id duplication will be resolved with later excel files

# Data quality check and cleaning

<a id='Minor_Raw_Data'></a>
## Minor Raw Data

In [12]:
dict_rename = {'Project ID':'EFIS',
               'ID': 'AMT_ID', 
              'FY.1': 'FY_ALN',
               'Prog Appr Date': 'Prog Appr Date_ALN',
               'FY': 'FY_WP',
               'Prog Approval Date': 'Prog Appr Date_WP',
              }
df_Minor_raw_data = df_Minor_raw_data.rename(dict_rename, axis = 1)

In [13]:
# df_Minor_raw_data.shape

In [14]:
# for programmed FY year of 9999, skip all the checks

# No need to check, since the raw data is filtered before download

In [15]:
df_Minor_raw_data['District'] = df_Minor_raw_data['District'].apply(uf.remove_punction)
df_Minor_raw_data['District'] = df_Minor_raw_data['District'].astype(int)

In [16]:
df_Minor_raw_data['Section'] = df_Minor_raw_data['Section In Use']

In [17]:
#rename

dict_rename = {
    'ID': 'AMT_ID',
    'Plan Year': 'Pavement_PlanYear'
}

df_Minor_Pavement_WS.rename(dict_rename, axis = 1, inplace=True)

df_Minor_Pavement_WS['Pavement_PlanYear'] = df_Minor_Pavement_WS['Pavement_PlanYear']%100

In [18]:
dict_rename = {
    'ID': 'AMT_ID',
    'RTL Plan Year': 'TMS_PlanYear'
}

df_Minor_TMS_WS.rename(dict_rename, axis = 1, inplace=True)

df_Minor_TMS_WS['TMS_PlanYear'] = df_Minor_TMS_WS['TMS_PlanYear']%100

<a id='Minor_Perf_RawData'></a>
## Minor_Perf_RawData

In [19]:
#rename columns
dict_rename_perf_rawdata = {
                           'ID': 'AMT_ID',
#                             'ProjectedRTL FY': 'Projected RTL FY',
              }

df_Minor_perf_raw_data = df_Minor_perf_raw_data.rename(dict_rename_perf_rawdata, axis = 1)

In [20]:
cols_strip = ['EA','EFIS']
for c in cols_strip :
    df_Minor_perf_raw_data[c] = df_Minor_perf_raw_data[c].str.strip("'")

In [21]:
#data clean 
#data type regulation

df_Minor_perf_raw_data['Quantity'] = df_Minor_perf_raw_data['Quantity'].fillna(0)
df_Minor_perf_raw_data['Assets in Good Cond'] = df_Minor_perf_raw_data['Assets in Good Cond'].fillna(0)
df_Minor_perf_raw_data['Assets in Fair Cond'] = df_Minor_perf_raw_data['Assets in Fair Cond'].fillna(0)
df_Minor_perf_raw_data['Assets in Poor Cond'] = df_Minor_perf_raw_data['Assets in Poor Cond'].fillna(0)
df_Minor_perf_raw_data['New Assets Added'] = df_Minor_perf_raw_data['New Assets Added'].fillna(0)

# df_Minor_perf_raw_data['EFIS'] = df_Minor_perf_raw_data['EFIS'].apply(regulate_EFIS)
df_Minor_perf_raw_data['EFIS'] = pd.to_numeric(df_Minor_perf_raw_data['EFIS'], errors='coerce')


In [22]:
#data trimming
#row
df_Minor_perf_raw_data= df_Minor_perf_raw_data[df_Minor_perf_raw_data['District'] != 56]
#column
df_Minor_perf_raw_data.drop(['PID Cycle', 'TYP','ProjectedSHOPP Cycle','RequestedRTL FY','DistrictPriority'],
  axis='columns', inplace=True, errors='ignore')

In [23]:
df_Minor_perf_raw_data.name = 'df_Minor_perf_raw_data'

<a id='Counties'></a>
## Counties


In [24]:
filename = 'Counties.xlsx'

df_counties = pd.read_excel(r'{}\{}'.format(PROJECTBOOKCHECK_INPUT_FOLDER, filename))

In [25]:
df_counties['Co. Name Abbr.'] = df_counties['Co. Name Abbr.'].str.upper()

In [26]:
# df_counties.shape

In [27]:
df_counties.name = 'df_counties'

In [28]:
# df_perf_raw_prog_county = df_perf_raw_prog_candidate.merge(df_counties, how = 'left', left_on = 'County', right_on = 'Co. Name Abbr.')

In [29]:
#no need for the following, already added to the df_Minor_perf_raw_data

# #rename columns
# dict_rename_4= {
#                'Performance Objective':'Performance Objective Original', 
#               }

# df_perf_raw_prog_county = df_perf_raw_prog_county.rename(dict_rename_4, axis = 1)

<a id='Postmile_Check'></a>
## Postmile Check

In [30]:
dict_PM_ck_rename = {
 'ID': 'AMT_ID',
 '№': 'No'                            }
df_Minor_pm_check.rename(dict_PM_ck_rename, axis = 1, inplace = True)

In [31]:
df_Minor_pm_check['District'] = df_Minor_pm_check['District'].str.strip("'")
df_Minor_pm_check['District'] =df_Minor_pm_check['District'].astype(int)
df_Minor_pm_check = df_Minor_pm_check[df_Minor_pm_check['District']!= 56]

In [32]:
df_Minor_pm_check.name = 'df_Minor_pm_check'
# df_Minor_pm_check.shape

<a id='ProgrammingSummary'></a>
## Programming Summary

In [33]:
dict_rename = {'ID': 'AMT_ID',
              }
df_Programming_Summary.rename(dict_rename, axis = 1, inplace = True)

In [34]:
cols_strip = ['EA','EFIS']
for c in cols_strip :
    df_Programming_Summary[c] = df_Programming_Summary[c].str.strip("'")
    
df_Programming_Summary['EFIS'] = df_Programming_Summary['EFIS'].apply(uf.regulate_EFIS)
df_Programming_Summary['EFIS'] = pd.to_numeric(df_Programming_Summary['EFIS'], errors='coerce')

# Approved Project List

In [35]:
filename = 'Minor Approved list.xlsx'

df_approved = pd.read_excel(r'{}\{}'.format(PROJECTBOOKCHECK_INPUT_FOLDER, filename, sheet_name='Minor Approved'))

dict_rename = {
    'Project ID':'EFIS',
              }
df_approved = df_approved.rename(dict_rename, axis = 1)

df_approved['EFIS'] = df_approved['EFIS'].apply(uf.regulate_EFIS)
df_approved['EFIS'] = pd.to_numeric(df_approved['EFIS'], errors='coerce')

df_approved['FY of the Approved List'] = df_approved['FY'].str[-2:].astype(int)

target_cols = ['EFIS','EA','Minor', 'FY of the Approved List', 'Performance Value','Performance Measure','Program Code','Construction Capital Cost ($K)']

df_approved = df_approved.sort_values(by =['EFIS','FY of the Approved List'], ascending = True)
df_approved = df_approved.groupby('EFIS').first().reset_index()

df_approved['In the Approved List?'] = 'Yes'

<a id='AddDataColumns'></a>
## Calculate and join additional fields


In [36]:
#this logic needs to consider the programming list

df_Minor_raw_data['Unique EA'] = df_Minor_raw_data.apply(uf.calc_unique_EA, axis = 1)

df_Minor_raw_data['FY In Use'] = df_Minor_raw_data['FY.2'].str[-2:]
df_Minor_raw_data['FY In Use'].fillna(0, inplace=True)
df_Minor_raw_data['FY In Use'] = df_Minor_raw_data['FY In Use'].astype(int)

## Check if the pavement and TMS plan year matches 

In [37]:
def is_all_ok(series):
    for v in series.values:
        if v != 'OK':
            return v
    return 'OK'

In [38]:
#check the PlanYear matches the FY In Use.
ck_col = 'Is Pavement Plan Year Matching Project FY?'

temp = pd.merge(df_Minor_Pavement_WS[['AMT_ID','Section','Pavement_PlanYear']],  
                df_Minor_raw_data[['AMT_ID','Section','EFIS', 'FY In Use']],
                how = 'inner', left_on = ['AMT_ID','Section'], right_on = ['AMT_ID','Section']
               )

# temp = pd.merge(temp, df_approved[['EFIS','FY of the Approved List']],
#                 how = 'left', left_on = 'EFIS', right_on = 'EFIS'
#                )

def ck_pavement_planyear(df):
    if df['Pavement_PlanYear'] == df['FY In Use']:
        return 'OK'
    else:
        return 'Pavement worksheet plan year does not match FY of the Approved List.'
    
    
temp[ck_col] = temp.apply(ck_pavement_planyear, axis = 1 )

temp = temp.groupby(['AMT_ID'])[ck_col].agg(is_all_ok).reset_index()

df_Minor_raw_data = pd.merge(df_Minor_raw_data, temp, how = 'left', left_on = 'AMT_ID', right_on = 'AMT_ID')

df_Minor_raw_data[ck_col].fillna('OK', inplace=True)

In [39]:
# df_Minor_Pavement_WS[df_Minor_Pavement_WS.AMT_ID == 'MA000493']

In [40]:
# #UnitTest

# AMT_IDs = ['MA000518', 'MA000898', 'MA000493', 'MA001276' ]
# ck_col = 'Is Pavement Plan Year Matching Project FY?'

# df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'].isin(AMT_IDs)) ][[
#     'AMT_ID','Section','FY In Use', ck_col]]


In [41]:
ck_col = 'Is TMS Plan Year Matching Project FY?'

temp = pd.merge(df_Minor_TMS_WS[['AMT_ID','Section','TMS_PlanYear']], 
                df_Minor_raw_data[['AMT_ID','Section','EFIS', 'FY In Use']],
                how = 'inner', left_on = ['AMT_ID','Section'], right_on = ['AMT_ID','Section']
               )

# temp = pd.merge(temp, df_approved[['EFIS','FY of the Approved List']],
#                 how = 'left', left_on = 'EFIS', right_on = 'EFIS'
#                )

def ck_TMS_planyear(df):
    if df['TMS_PlanYear'] == df['FY In Use']:
        return 'OK'
    else:
        return 'TMS worksheet plan year does not match FY of the Approved List.'
    
    
temp[ck_col] = temp.apply(ck_TMS_planyear, axis = 1 )

temp = temp.groupby(['AMT_ID'])[ck_col].agg(is_all_ok).reset_index()

df_Minor_raw_data = pd.merge(df_Minor_raw_data, temp, how = 'left', left_on = 'AMT_ID', right_on = 'AMT_ID')

df_Minor_raw_data[ck_col].fillna('OK', inplace=True)

In [42]:

# #UnitTest
# df_Minor_TMS_WS[df_Minor_TMS_WS.AMT_ID == 'MB000500']

# AMT_IDs = ['MB000500', 'MA000349', 'MA000518']
# ck_col = 'Is TMS Plan Year Matching Project FY?'

# df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'].isin(AMT_IDs)) ][[
#     'AMT_ID','Section','FY In Use', ck_col]]


# Check programming summary

In [43]:
#filter data to keep Minor program and active section only.
# df_Programming_Summary
print(df_Programming_Summary.shape)
df_Programming_Summary_filtered = pd.merge(df_Programming_Summary, df_Minor_raw_data[['AMT_ID','Section',]],
               how= 'inner', left_on = ['AMT_ID','Section',], right_on = ['AMT_ID','Section',])
print(df_Programming_Summary_filtered.shape)

(15350, 25)
(1031, 25)


In [44]:
print(df_Programming_Summary_filtered.shape)
df_Programming_Summary_filtered = pd.merge(df_Programming_Summary_filtered, df_approved,
               how= 'left', left_on = ['EFIS'], right_on = ['EFIS'],
               suffixes=['','_ApprovedList'])
print(df_Programming_Summary_filtered.shape)
df_Programming_Summary_filtered['In the Approved List?'].fillna('No', inplace=True)

(1031, 25)
(1031, 36)


In [45]:
ck_col = 'Matches Minor Approved List Performance Measure?'

def ck_performance_measure(df):
    if pd.isna(df['Performance Measure_ApprovedList']):
        return 'Not in the Approved Lists'
    else:
        if df['Performance Measure_ApprovedList'] == df['Performance Measure']:
            return 'Yes'
        else:
            return 'No'

df_Programming_Summary_filtered[ck_col]= df_Programming_Summary_filtered.apply(ck_performance_measure, axis = 1)

In [46]:

ck_col = 'Matches Minor Approved List Performance Value?'
def ck_performance_value(df):
    if pd.isna(df['Performance Value_ApprovedList']):
        return 'Not in the Approved Lists'
    else:
        if df['Performance Value_ApprovedList'] == df['Performance Value']:
            return 'Yes'
        else:
            return 'No'

df_Programming_Summary_filtered[ck_col]= df_Programming_Summary_filtered.apply(ck_performance_value, axis = 1)

In [47]:
ck_col = 'Does the performance summary match the performance value and measure reported to CTC?'
def ck_performance(df):
    if df['Matches Minor Approved List Performance Value?'] == 'Not in the Approved Lists':
        return 'Not in the Approved Lists'
    elif (df['Matches Minor Approved List Performance Value?'] == 'Yes') and (df['Matches Minor Approved List Performance Measure?'] == 'Yes'):
        return 'Yes'
    else:
        return 'No'
    

df_Programming_Summary_filtered[ck_col]= df_Programming_Summary_filtered.apply(ck_performance, axis = 1)

<a id='Export_programming_summary'></a>

### Export Programming Summary

In [48]:
out_col =df_Programming_Summary_filtered.columns

filename = 'Minor_Programming_Summary'
df_out = df_Programming_Summary_filtered[out_col]
df_out['Data_TimeStamp'] = Data_TimeStamp

uf.export_csv(df_out, filename, PROJECTBOOKCHECK_HTTPSERVER_FOLDER, LOG_FILE)

uf.export_hyper(df_out, filename, LOG_FILE)

processing table: 1031it [00:00, 5154.85it/s]


Signing into AssetManagement at https://tableau.dot.ca.gov
Publishing Minor_Programming_Summary.hyper to Sandbox_ProjectBookCheck_Automation...


# Check Minor Data

In [49]:
df_Minor_raw_data['Program Code In Use'] = df_Minor_raw_data.apply(lambda x: x['Program Code'] if x['Section'] == 'WP' else x['Program Code.1'], axis = 1)
df_Minor_raw_data['Construction Capital In Use ($K)'] = df_Minor_raw_data.apply(lambda x: x['Construction Capital ($K)'] if x['Section'] == 'WP' else x['Total Capital Project Cost ($K)'], axis = 1)

In [50]:
df_Minor_raw_data['Activity Category In Use'] = df_Minor_raw_data.apply(
    lambda x: x['Activity Category'] if x['Section'] == 'WP' else x['Activity Category.1'],
    axis = 1)

In [51]:
df_Minor_raw_data['FY In Use'] = pd.to_numeric(df_Minor_raw_data['FY In Use'], errors='coerce')

In [52]:
df_Minor_raw_data['Is FY missing?'] = df_Minor_raw_data['FY In Use'].apply(lambda x: 'FY is missing' if x == 0 else 'OK' )

In [53]:
df_Minor_raw_data['Legacy Project?'] = df_Minor_raw_data.apply(lambda x: 'Yes' if float(x['FY In Use']) < 21 else 'No', axis = 1)
#for legacy project, do not flag the project if the information is missing. 


In [54]:
# df_approved

In [55]:
#question answered: we focus on checking the data only in the Section

In [56]:
# df_Minor_raw_data_backup = df_Minor_raw_data.copy()

# df_Minor_raw_data = df_Minor_raw_data_backup

In [57]:
print(df_Minor_raw_data.shape)

df_Minor_raw_data = pd.merge(df_Minor_raw_data, df_approved[['EFIS','Minor','EA','In the Approved List?','FY of the Approved List','Program Code','Construction Capital Cost ($K)' ]],
                            how = 'left', left_on = 'EFIS', right_on = 'EFIS', suffixes=['','_ApprovedList'])

print(df_Minor_raw_data.shape)

df_Minor_raw_data['In the Approved List?'].fillna('No', inplace= True)


(1273, 88)
(1273, 94)


In [58]:
# temp = pd.merge(df_Minor_raw_data, df_approved[['EFIS','Minor','EA','In the Approved List?','FY of the Approved List','Program Code','Construction Capital Cost ($K)' ]],
#                             how = 'outer', left_on = 'EFIS', right_on = 'EFIS', suffixes=['','_ApprovedList'])

# print(temp.shape)

# # df_Minor_raw_data['In the Approved List?'].fillna('No', inplace= True)

# temp = pd.merge(df_Minor_raw_data, df_approved[['EFIS','Minor','EA','In the Approved List?','FY of the Approved List','Program Code','Construction Capital Cost ($K)' ]],
#                             how = 'left', left_on = 'EFIS', right_on = 'EFIS', suffixes=['','_ApprovedList'])

# print(temp.shape)

# df_Minor_raw_data['In the Approved List?'].fillna('No', inplace= True)

In [59]:
df_Missing_projects = df_approved[~df_approved['EFIS'].isin(df_Minor_raw_data['EFIS'].values)][['EFIS','District','Minor', 'EA', 'FY of the Approved List','Program Code','In the Approved List?']]

df_Missing_projects['AMT_ID'] = 'M0'

In [60]:
df_approved.columns

Index(['EFIS', 'FY', 'Minor', 'District', 'Location/Description', 'EA',
       'Construction Capital Cost ($K)', 'Program Code', 'Performance Value',
       'Performance Measure', 'FY of the Approved List',
       'In the Approved List?'],
      dtype='object')

### is EFIS duplicate within Minor raw data?

In [61]:
#Mark project if EFIS is missing

def is_EFIS_missing(df):
    if pd.isna(df['EFIS']) or df['EFIS'] == 0:
        return 'This project is missing EFIS (Project ID).'
    else:
        return 'OK'

df_Minor_raw_data['Was Project ID (EFIS) completed?'] = df_Minor_raw_data.apply(is_EFIS_missing, axis =1 ) 

In [62]:
# Was Project ID (EFIS) completed?	1	Was Project ID (EFIS) completed?

In [63]:
temp = df_Minor_raw_data.groupby(['EFIS'])['AMT_ID'].agg([pd.Series.nunique, list]).reset_index()
temp['AMT_IDs'] = temp['list'].apply(lambda l: ','.join(l))
duplicated_EFIS= temp[temp['nunique']> 1]

df_Minor_raw_data.drop(columns=['nunique','AMT_IDs'],inplace=True , errors='ignore')
print(df_Minor_raw_data.shape)
df_Minor_raw_data = pd.merge(df_Minor_raw_data, duplicated_EFIS, 
                             how = 'left', left_on = ['EFIS'], right_on=['EFIS'])
print(df_Minor_raw_data.shape)

def ck_EFIS_Uniqueness(df):
    if pd.isna(df['nunique']):
        return 'OK'
    elif df['Was Project ID (EFIS) completed?'] != 'OK':
        return 'OK'  #skip this since it is marked seperately
    else:
        return 'Duplicate EFIS {} is found in the following projects: {}'.format(df['EFIS'], df['AMT_IDs'])
    
df_Minor_raw_data['Is EFIS Repeated in Minor Profile?'] = df_Minor_raw_data.apply(ck_EFIS_Uniqueness, axis = 1)

(1273, 95)
(1273, 98)


In [64]:
def is_EA_missing(df):
    if pd.isna(df['EA']) or df['EA'] == '':
        return 'This project is missing EA.'
    else:
        return 'OK'

df_Minor_raw_data['Is EA missing?'] = df_Minor_raw_data.apply(is_EA_missing, axis =1 ) 

In [65]:
# df_Minor_raw_data['Is EA missing?'].unique()

### flag if District + EA duplicate within Minor raw data

In [66]:
temp = df_Minor_raw_data.groupby(['Unique EA'])['AMT_ID'].nunique().reset_index(name = 'UnqiueEA_Counts')
duplicated_EA= temp[temp['UnqiueEA_Counts']> 1]

df_Minor_raw_data.drop(columns=['UnqiueEA_Counts'],inplace=True , errors='ignore')

df_Minor_raw_data = pd.merge(df_Minor_raw_data, duplicated_EA[['Unique EA','UnqiueEA_Counts']].drop_duplicates(), 
                             how = 'left', left_on = ['Unique EA'], right_on=['Unique EA'])

def ck_EA_Uniqueness(df):
    if pd.isna(df['UnqiueEA_Counts']):
        return 'OK'
    elif df['Is EA missing?'] != 'OK':
        return 'OK'  #skip this since it is marked seperately
    else:
        return 'Duplicate District_EA {} is found in the following projects: {}'.format(df['Unique EA'], df['AMT_IDs'])
    
df_Minor_raw_data['Is EA Repeated in Minor Profile?'] = df_Minor_raw_data.apply(ck_EA_Uniqueness, axis = 1)

In [67]:
ck_col =  'Does EA Need Updates?'
#
def ck_EA_consistancy(df):
    if df['Is EA missing?'] != 'OK':
        return 'EA information is missing.'
    elif df['In the Approved List?'] != 'Yes':
        return 'OK'
    else: 
        #DEBUG: need to make all upper case
        if str(df['EA']).upper() == str(df['EA_ApprovedList']).upper():
            return 'OK'
        else:
            return 'Update EA. It does not match EA in Approved List of year {}'.format(df['FY of the Approved List'])
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_EA_consistancy, axis = 1)

In [68]:
# #UnitTest

# AMT_IDs = ['MB000373', 'MA000119']
# ck_col = 'Does EA Need Updates?'

# df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'].isin(AMT_IDs)) ][[
#     'AMT_ID','EA','EA_ApprovedList', ck_col]]

Unnamed: 0,AMT_ID,EA,EA_ApprovedList,Does EA Need Updates?
99,MA000119,48420,48420,OK
315,MB000373,1l410,1L410,OK


In [69]:
ck_col = 'Does FY Need Updates?'

def ck_FY_consistancy(df):
    if df['FY In Use'] == 0:
        return 'FY information is missing.'
    elif df['In the Approved List?'] != 'Yes':
        return 'OK'
    else:    
        if df['FY In Use'] == df['FY of the Approved List']:
            return 'OK'
        else:
            return 'Please update FY. It is in the {} Approved List'.format(df['FY of the Approved List'])
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_FY_consistancy, axis = 1)

In [70]:
ck_col = 'Is the project with FY In Use before 2023 included in the approved list?'

def ck_FY_all_approved_projects(df):
    if df['FY In Use'] == 0:
        return 'OK'
    elif df['FY In Use'] < 23 and df['In the Approved List?'] != 'Yes':
        return 'The project has FY before 2023 but is not included in the approved list. Please update the FY or work with the Minor program to included it or obselete it.'
    else:
        return 'OK'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_FY_all_approved_projects, axis = 1)

In [71]:
# #UnitTest
# AMT_IDs = ['MB000111']
# ck_col = 'Is the project with FY In Use before 2023 included in the approved list?'

# df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'].isin(AMT_IDs)) ][['AMT_ID','FY In Use', ck_col]]

In [72]:
ck_col = 'Does Program Code Need Updates?'

def ck_program_code_update(df):
    if df['In the Approved List?'] != 'Yes':
        return 'OK'
        #if it is not on the approved list, do not check.
    else: 
        if pd.isna(df['Program Code_ApprovedList']) or (df['Program Code In Use'] == df['Program Code_ApprovedList']):
            return 'OK'
        else:
            return 'The program code for Section In Use does not match Approved project list.'

df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_program_code_update, axis = 1)

In [73]:
ck_col = 'Does Minor Type(A:B) Match Approved Project List?'

def ck_minor_category(df):
    if df['In the Approved List?'] != 'Yes':
        return 'OK'
    else: 
        if (df['Minor'] == df['Minor_ApprovedList']):
            return 'OK'
        else:
            return 'The Minor Type for Section In Use does not match Approved project list. Please work with the Minor HQ to reconcile.'

df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_minor_category, axis = 1)

In [74]:
#UnitTest
AMT_ID = 'MB000370' 
AMT_ID = 'MB001220' 
AMT_ID = 'MA000278' 
ck_col = 'Does Minor Type(A:B) Match Approved Project List?'

df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'] == AMT_ID) ][['AMT_ID','In the Approved List?','Minor', 'Minor_ApprovedList', 
                                                             ck_col]]

Unnamed: 0,AMT_ID,In the Approved List?,Minor,Minor_ApprovedList,Does Minor Type(A:B) Match Approved Project List?
234,MA000278,Yes,A,B,The Minor Type for Section In Use does not mat...


In [75]:
ck_col = 'Project Has Qualified Activity?'

df_Activity_CrossWalk['QualifiedActivity'] = 'Yes'

temp = pd.merge(df_Minor_perf_raw_data, df_Activity_CrossWalk, 
         how='left', left_on=['Main Activity Category', 'ActID'], right_on=['Main Activity Category', 'ActID'])

temp['QualifiedActivity'].fillna('No', inplace=True)

temp = temp.groupby(['AMT_ID', 'Section'])['QualifiedActivity'].agg(list).reset_index()

temp[ck_col] = temp['QualifiedActivity'].apply(
    lambda x: 'Yes' if 'Yes' in x else 'No'
)

print(df_Minor_raw_data.shape)
df_Minor_raw_data = pd.merge(df_Minor_raw_data, temp[['AMT_ID', 'Section', ck_col]], 
         how='left', left_on=['AMT_ID', 'Section'], right_on=['AMT_ID', 'Section'])
print(df_Minor_raw_data.shape)

(1273, 107)
(1273, 108)


In [76]:
ck_name = 'Does project have at least one performance activity related to the Activity Category?'

def ck_qualified_activity(df):
    if pd.isna(df['Project Has Qualified Activity?']):
        return 'The performance data is missing for this project.'
    elif df['Project Has Qualified Activity?'] == 'No':
        try: 
            act_list = ','.join(dict_Activity_CW[df['Activity Category In Use']])
        except:
            act_list = 'please find out from HQ AM for the latest list.'
        return 'Please update the performance to include at least one qualified activity. The qualified activity for {} are {}'.format(df['Activity Category In Use'], act_list)
    else:
        return 'OK'

df_Minor_raw_data[ck_name] = df_Minor_raw_data.apply(ck_qualified_activity, axis=1)

In [77]:
ck_col = 'Does Construction Capital Cost ($K) Need Updates?'

def ck_construction_capital_cost(df):
    if df['In the Approved List?'] != 'Yes':
        return 'OK'
    else:
        if abs(df['Construction Capital Cost ($K)'] - df['Construction Capital In Use ($K)']) < 0.01:
            return 'OK'
        else:
            if (df['Section'] == 'ALN' and df['FY of the Approved List'] == 22):
                #exception for construction cost does not match.
                return 'OK'
            else: 
                return 'Update Capital Cost. It does not match Approved List'

        
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_construction_capital_cost, axis = 1)

### flag if no performance
 performance value can be zero

In [78]:
ck_col = 'Was Performance Tab Completed in Section In Use?'

temp = df_Minor_perf_raw_data.groupby(['AMT_ID','Section']).first().reset_index()
temp['Was performance tab completed?'] = 'Yes'

df_Minor_raw_data.drop(columns=['Was performance tab completed?',],inplace=True , errors='ignore')
print(df_Minor_raw_data.shape)

df_Minor_raw_data = pd.merge(df_Minor_raw_data, temp[['AMT_ID','Section','Was performance tab completed?']].drop_duplicates(), 
                             how = 'left', left_on = ['AMT_ID','Section'], right_on=['AMT_ID','Section'])

df_Minor_raw_data['Was performance tab completed?'].fillna('No', inplace=True)

print(df_Minor_raw_data.shape)


def ck_performance_availability(df):
    if df['Was performance tab completed?'] == 'No':
        return '"Please complete Performance Tab in Section {}'.format(df['Section'])
    else:
        return 'OK'

df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_performance_availability, axis = 1)

(1273, 110)
(1273, 111)


In [79]:
#TODO: to add a column 'Was Performance Tab Completed in ALN Band?'
ck_col = 'Was performance tab completed in ALN Band?'
temp_ALN = temp[temp['Section'] == 'ALN']
temp_ALN[ck_col] = 'Yes'

df_Minor_raw_data.drop(columns=[ck_col],inplace=True , errors='ignore')
print(df_Minor_raw_data.shape)

df_Minor_raw_data = pd.merge(df_Minor_raw_data, temp_ALN[['AMT_ID',ck_col]].drop_duplicates(), 
                             how = 'left', left_on = ['AMT_ID'], right_on=['AMT_ID'])

df_Minor_raw_data[ck_col].fillna('No', inplace=True)

print(df_Minor_raw_data.shape)

(1273, 112)
(1273, 113)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_ALN[ck_col] = 'Yes'


In [80]:
col_name = 'Does the performance summary match the performance value and measure reported to CTC?'

print(df_Minor_raw_data.shape)

df_Minor_raw_data.drop(columns=[col_name],inplace=True , errors='ignore')

#join
df_Minor_raw_data = pd.merge(
    df_Minor_raw_data, 
    df_Programming_Summary_filtered[['AMT_ID', 'Section', col_name]],
    how='left', left_on=['AMT_ID', 'Section'], right_on=['AMT_ID', 'Section']
)
print(df_Minor_raw_data.shape)

(1273, 113)
(1273, 114)


In [81]:
ck_col = 'For approved Minor Projects, does the performance summary match the performance value and measure reported to CTC?'

def ck_performance_value(df):
    if df['In the Approved List?'] != 'Yes':
        return 'OK'
    else:
        if df['Does the performance summary match the performance value and measure reported to CTC?'] =='No':
            return 'Please update the performance tab to match CTC.'
        else:
            return 'OK'

        
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_performance_value, axis = 1)


# For approved Minor Projects, 
# #fill na
# #question to be answered: for projects not in the programming summary list, we assigned the performance value and measure check to No

# df_Minor_raw_data[col_name].fillna('No', inplace=True)

# print(df_Minor_raw_data.shape)

### flag if total project cost is zero

In [82]:
ck_col = 'Was the Project Cost Completed?'

def ck_total_project_cost(df):
    if pd.isna(df['Total Project Cost ($K)']) or df['Total Project Cost ($K)'] == 0:
        return 'Total project cost can not Empty or zero.'
    else:
        return 'OK'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_total_project_cost, axis = 1)

### flag if project description is blank


In [83]:
ck_col = 'Was Project Location/Description Completed?'

def ck_project_description(df):
    if pd.isna(df['Project Location/Description']) or df['Project Location/Description'] == '':
        return 'Project Location/Description can not empty.'
    else:
        return 'OK'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_project_description, axis = 1)

### check pm validation

In [84]:
ck_col = 'Are all Project Locations with a Valid Postmile?'

df_Minor_pm_invalid = df_Minor_pm_check[df_Minor_pm_check['Valid PM'] != 'Yes']

AMT_IDs_withInvalidPM = df_Minor_pm_invalid['AMT_ID'].unique()

def ck_invalid_pm(df):
    if df['AMT_ID'] in AMT_IDs_withInvalidPM:
        return 'The PM is invalid.'
    else:
        return 'OK'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_invalid_pm, axis = 1)

In [85]:
# df_Minor_raw_data

In [86]:

ck_col = 'Data Needs Review Other Than Close-out?'

datacheck_cols = [
    'Was Project ID (EFIS) completed?',
    'Is EFIS Repeated in Minor Profile?',
    'Is EA Repeated in Minor Profile?',
    'Does EA Need Updates?',
    'Does FY Need Updates?', 
    'Is the project with FY In Use before 2023 included in the approved list?',
    'Does Program Code Need Updates?', 
    'Does Minor Type(A:B) Match Approved Project List?',
    'Does project have at least one performance activity related to the Activity Category?',
    'Does Construction Capital Cost ($K) Need Updates?', 
    'Was Performance Tab Completed in Section In Use?',
    'For approved Minor Projects, does the performance summary match the performance value and measure reported to CTC?',
    'Was the Project Cost Completed?', 
    'Was Project Location/Description Completed?', 
    'Are all Project Locations with a Valid Postmile?',
    'Is Pavement Plan Year Matching Project FY?',
    'Is TMS Plan Year Matching Project FY?',
    # 'Project in the approved list, but not in the AMTool.', 
]

def ck_review_needs_2(df, datacheck_cols):
    for col in datacheck_cols:
        if pd.notna(df[col]) and df[col] != 'OK':
            return 'District needs to review project data (Profile and/or RTL)'
    return 'OK'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_review_needs_2, args = [datacheck_cols], axis=1)

In [87]:
# #DEBUG
# # AMT_ID = 'MB000370' 
# # AMT_ID = 'MB001220' 
# AMT_ID = 'MB000490' 
# ck_col = 'Data Needs Review Other Than Close-out?'

# df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'] == AMT_ID) ][['AMT_ID', 'FY In Use', 
#                                                              'Was Project ID (EFIS) completed?',
#     'Is EFIS Repeated in Minor Profile?',
#     'Is EA Repeated in Minor Profile?',
#     'Does EA Need Updates?',
#     'Does FY Need Updates?', 
#     'Is the project with FY In Use before 2023 included in the approved list?',
#     'Does Program Code Need Updates?', 
#     'Does Minor Type(A:B) Match Approved Project List?',
#     'Does project have at least one performance activity related to the Activity Category?',
#     'Does Construction Capital Cost ($K) Need Updates?', 
#     'Was Performance Tab Completed in Section In Use?',
#     'For approved Minor Projects, does the performance summary match the performance value and measure reported to CTC?',
#     'Was the Project Cost Completed?', 
#     'Was Project Location/Description Completed?', 
#     'Are all Project Locations with a Valid Postmile?',
#     'Is Pavement Plan Year Matching Project FY?',
#     'Is TMS Plan Year Matching Project FY?', ck_col]]

In [88]:
ck_col = 'Was project with FY Before 2021/22 Closed-Out?'

def ck_project_closeout_status(df):
    if pd.isna(df['FY In Use']) or df['FY In Use'] == 0:
        return 'OK'
    elif int(df['FY In Use']) < 22 and df['Section'] == 'ALN':
        return 'OK'
    elif int(df['FY In Use']) > 21:
        return 'OK'
    else:
        return 'Please work with HQ Minor Program to Close-out Project'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_project_closeout_status, axis=1)

In [89]:
#DEBUG
AMT_ID = 'MB000370' 
AMT_ID = 'MB001220' 
AMT_ID = 'MA000084' 
ck_col = 'Was project with FY Before 2021/22 Closed-Out?'

df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'] == AMT_ID) ][['AMT_ID', 'FY In Use', ck_col]]

Unnamed: 0,AMT_ID,FY In Use,Was project with FY Before 2021/22 Closed-Out?
71,MA000084,99,OK


In [90]:
ck_col = 'Data Needs Review?'

input_cols = [
    'Data Needs Review Other Than Close-out?',
    'Was project with FY Before 2021/22 Closed-Out?'
]

def ck_review_needs(df, input_cols):
    for col in input_cols:
        if pd.notna(df[col]) and df[col] != 'OK:':
            return 'District needs to review project data (Profile and/or RTL)'
    return 'OK'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_review_needs, args = [input_cols], axis=1)


In [91]:
datacheck_cols_all = datacheck_cols + ['Was project with FY Before 2021/22 Closed-Out?']

<a id='Export_Data'></a>
# Export Data

In [92]:
file_export_log = open(LOG_FILE, "a")  # append mode
file_export_log.write("#####Minor Data Check:{} \n".format(Data_TimeStamp))
file_export_log.close()

In [93]:
df_Minor_raw_data['Data_TimeStamp'] = Data_TimeStamp

In [94]:
# for c in df_Minor_raw_data.columns:
#     print(c)

## Export Check Flags

In [95]:
#export all projects with all checks in matrix
project_info_cols = [
    #project information
    'AMT_ID', 'Minor', 'EFIS', 'EA', 'District', 
    'Data_TimeStamp',
    'Data Needs Review?',
    'Data Needs Review Other Than Close-out?',
    'FY of the Approved List',
    'FY In Use',
    'Section In Use',
    'Program Code In Use', 
    'Activity Category In Use',
    'Project Cost In Use ($K)'
]

out_cols = project_info_cols + datacheck_cols_all

filename = 'Minor_Datacheck_Matrix'
df_out = df_Minor_raw_data[out_cols]

uf.export_csv(df_out, filename, PROJECTBOOKCHECK_HTTPSERVER_FOLDER, LOG_FILE)

In [96]:
# for c in datacheck_cols_all:
#     if df_Minor_raw_data[c].isna().any():
#         print(c)
# #     df_Minor_raw_data[c].fillna('OK', inplace=True)

In [97]:
df_melted = pd.melt(df_Minor_raw_data, 
                    id_vars=['AMT_ID'], 
                    value_vars=datacheck_cols_all, var_name = 'Check Description')

df_melted.columns = ['AMT_ID','Check Description','Check Summary']
df_melted_filtered = df_melted[df_melted['Check Summary']!= 'OK']

df_punchlist = pd.merge(df_melted_filtered, df_Minor_raw_data[project_info_cols],
                  how = 'left', left_on = 'AMT_ID', right_on = 'AMT_ID')


In [98]:
# df_Minor_raw_data.AMT_ID.nunique()

# Get district level summary of projects w/o data check flags

In [99]:
temp = df_punchlist[df_punchlist['FY In Use'] != 99].groupby(['District'])['AMT_ID'].nunique().reset_index()

In [100]:
dict_rename = {
    'AMT_ID': 'Number of Projects with Data Check Flag'
}
temp.rename(dict_rename, axis = 1, inplace=True)    

In [101]:
temp2 = df_Minor_raw_data.groupby(['District'])['AMT_ID'].nunique().reset_index()

In [102]:
dict_rename = {
    'AMT_ID': 'Number of Projects in Minor'
}
temp2.rename(dict_rename, axis = 1, inplace=True)

In [103]:
df_datacheck_summary = pd.merge(temp, temp2, how ='left', left_on = 'District' , right_on = 'District') 

In [104]:
df_datacheck_summary['Data_TimeStamp'] = Data_TimeStamp

In [105]:
filename = 'Minor_DataCheckSummary'

#This summary is based on EFIS, includes the projects that are in approved list but not yet in AMTool.

df_out = df_datacheck_summary
uf.export_csv(df_out, filename, PROJECTBOOKCHECK_HTTPSERVER_FOLDER, LOG_FILE)
uf.export_hyper(df_out, filename, LOG_FILE)

processing table: 12it [00:00, 6001.15it/s]


Signing into AssetManagement at https://tableau.dot.ca.gov
Publishing Minor_DataCheckSummary.hyper to Sandbox_ProjectBookCheck_Automation...


In [106]:
# #question to be answered: Will the projects in the approved list but not in the AMTool be counted?

#append the missing AMTool from the approved list
# df_Missing_projects['']
df_Missing_projects['Check Description'] = 'Project in the approved list, but not in the AMTool.'
df_Missing_projects['Check Summary'] =  'The project is in the approved list but not in AMTool.'
df_punchlist_all = df_punchlist.append(df_Missing_projects, ignore_index = True)
# df_punchlist_all['EA'].fillna('_', inplace=True)
df_punchlist_all['EA'] = df_punchlist_all['EA'].apply(lambda x: "'"+ str(x) if x else "'")

filename = 'Minor_Datacheck_Punchlist'
uf.export_csv(df_punchlist_all, filename, PROJECTBOOKCHECK_HTTPSERVER_FOLDER, LOG_FILE)
uf.export_hyper(df_punchlist_all, filename, LOG_FILE)

processing table: 2064it [00:00, 12899.61it/s]


Signing into AssetManagement at https://tableau.dot.ca.gov
Publishing Minor_Datacheck_Punchlist.hyper to Sandbox_ProjectBookCheck_Automation...


## Action items for Minor District Engineer

In [107]:
ck_col = 'Was information Entered in the Allocation Band?'

def ck_ALN_band_info_completeness(df):
    if (pd.isna(df['FY_ALN']) 
        or df['Was performance tab completed in ALN Band?'] == 'No' 
        or pd.isna(df['Total Capital Project Cost ($K)'])
       ):
        return 'No'
    else: 
        return 'Yes'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_ALN_band_info_completeness, axis=1)

In [108]:
ck_col = 'Is Project ready to enter data in the Allocation Band?'

def ck_readiness_to_enter_ALN_band(df):
    if pd.notna(df['Prog Appr Date_WP']):  #has approval date in WP band
        if df['Section'] == 'ALN':
            return 'Project was closed-out'
        elif df['Was information Entered in the Allocation Band?'] == 'Yes':
            return 'Allocation Band needs review by HQ Minor Program. If all data Accurate HQ Minor will enter the approval date'
        else:
            return 'Project ready to enter data in the Allocation Band (Cost, Schedule, RTL, And/Or Performance Tab)'
    else: 
        return 'Workplan Band needs review by HQ Minor Program. If all data Accurate HQ Minor will enter the approval date'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_readiness_to_enter_ALN_band, axis=1)

In [109]:
# #UnitTest

# AMT_IDs = ['MB000633', 'MB000092']
# ck_col = 'Is Project ready to enter data in the Allocation Band?'

# df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'].isin(AMT_IDs)) ][[
#     'AMT_ID','Section', 'Prog Appr Date_WP','Was information Entered in the Allocation Band?', ck_col]]

In [110]:
filename = 'Minor_District_Action'

out_cols = project_info_cols + [
    'Is Project ready to enter data in the Allocation Band?'
                    ]
df_out = df_Minor_raw_data[out_cols]
uf.export_csv(df_out, filename, PROJECTBOOKCHECK_HTTPSERVER_FOLDER, LOG_FILE)

uf.export_hyper(df_out, filename, LOG_FILE)

processing table: 1273it [00:00, 13989.40it/s]


Signing into AssetManagement at https://tableau.dot.ca.gov
Publishing Minor_District_Action.hyper to Sandbox_ProjectBookCheck_Automation...


## Action items for Minor HQ Engineer

In [111]:
ck_col = 'Is Project Ready for Review and Approval Date?'

def ck_readiness_for_review(df):
    if df['Data Needs Review Other Than Close-out?'] != 'OK':
        return 'No'
    elif df['FY In Use'] > 22: 
        return 'No'
    elif pd.isna(df['Prog Appr Date_WP']):
        return 'HQ Needs to review Workplan band and enter Approval Date if data is accurate'
    
    elif df['Was information Entered in the Allocation Band?'] == 'Yes':
        if pd.notna(df['Prog Appr Date_ALN']):
            return 'No'
        else:
            return 'HQ Needs to review Allocation band and enter Approval Date if data is accurate'
    else:
        return 'No'
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_readiness_for_review, axis=1)

In [112]:
#UnitTest

AMT_IDs = ['MA000008', 'MB000490']
ck_col = 'Is Project Ready for Review and Approval Date?'

df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'].isin(AMT_IDs)) ][[
    'AMT_ID','Section', 
    'Data Needs Review Other Than Close-out?',
    'FY In Use','Prog Appr Date_WP','Was information Entered in the Allocation Band?','Prog Appr Date_ALN',
    ck_col,]]

Unnamed: 0,AMT_ID,Section,Data Needs Review Other Than Close-out?,FY In Use,Prog Appr Date_WP,Was information Entered in the Allocation Band?,Prog Appr Date_ALN,Is Project Ready for Review and Approval Date?
7,MA000008,WP,District needs to review project data (Profile...,20,,No,,No
404,MB000490,WP,OK,21,07/01/20,Yes,,HQ Needs to review Allocation band and enter A...


In [113]:
ck_col = 'Does Workplan Band needs Approval Removal?'

def ck_WP_data_error(df):
    if pd.isna(df['Prog Appr Date_WP']):  #has no approval data in WP band
        return 'No'
    elif (pd.isna(df['FY In Use'])
        or int(df['FY In Use']) > 22 
        or (df['FY In Use'] in ['21', '22'] and df['In the Approved List?'] == 'No')
         ): 
        return 'HQ Minor Program needs to remove Approval date fromWorkplan Band, so District can updated the project FY. Project not in Approved lists or in the future'
    else:
        return 'No'
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_WP_data_error, axis=1)

In [114]:
ck_col = 'Does Allocation Band Needs Approval Removal?'

def ck_ALN_data_error(df):
    if pd.isna(df['Prog Appr Date_ALN']):  #has no approval data in ALN band
        return 'No'
    elif df['Data Needs Review Other Than Close-out?'] == 'OK': 
        return 'No'
    else:
        return 'HQ Minor Program needs to remove Approval date from Allocation Band, so District can updated the project data'

df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_ALN_data_error, axis=1)

In [115]:
ck_col = 'HQ Minor Program Needs Review?'

def ck_review_needs_HQ_Minor(df):
    if (df['Is Project Ready for Review and Approval Date?'] == "No"
        and df['Does Workplan Band needs Approval Removal?'] == 'No'
        and df['Does Allocation Band Needs Approval Removal?'] == 'No'
       ):
        return 'No'
    else: 
        return "HQ Minor Needs Review"
    
df_Minor_raw_data[ck_col] = df_Minor_raw_data.apply(ck_review_needs_HQ_Minor, axis=1)

In [120]:
# #UnitTest

# AMT_IDs = ['MA000008', 'MB000490']

# df_Minor_raw_data[(df_Minor_raw_data['AMT_ID'].isin(AMT_IDs)) ][[
#     'AMT_ID','Section', 
#     'Data Needs Review Other Than Close-out?',
#     'FY In Use','Prog Appr Date_WP','Was information Entered in the Allocation Band?','Prog Appr Date_ALN',
#     'Is Project Ready for Review and Approval Date?','Does Workplan Band needs Approval Removal?', 'Does Allocation Band Needs Approval Removal?','HQ Minor Program Needs Review?']]

Unnamed: 0,AMT_ID,Section,Data Needs Review Other Than Close-out?,FY In Use,Prog Appr Date_WP,Was information Entered in the Allocation Band?,Prog Appr Date_ALN,Is Project Ready for Review and Approval Date?,Does Workplan Band needs Approval Removal?,Does Allocation Band Needs Approval Removal?,HQ Minor Program Needs Review?
7,MA000008,WP,District needs to review project data (Profile...,20,,No,,No,No,No,No
404,MB000490,WP,OK,21,07/01/20,Yes,,HQ Needs to review Allocation band and enter A...,No,No,HQ Minor Needs Review


In [117]:
filename = 'Minor_HQ_Action'

out_cols = project_info_cols + [
    'Is Project Ready for Review and Approval Date?',
    'Does Workplan Band needs Approval Removal?',
    'Does Allocation Band Needs Approval Removal?',
    'HQ Minor Program Needs Review?',
                    ]
df_out = df_Minor_raw_data[out_cols]

uf.export_csv(df_out, filename, PROJECTBOOKCHECK_HTTPSERVER_FOLDER, LOG_FILE)

uf.export_hyper(df_out, filename, LOG_FILE)

processing table: 1273it [00:00, 12122.98it/s]


Signing into AssetManagement at https://tableau.dot.ca.gov
Publishing Minor_HQ_Action.hyper to Sandbox_ProjectBookCheck_Automation...



<a id='FinalCleanUp'></a>
## Final Clean Up

In [118]:
#clean up tableau publishing log file

import os
import glob
# get a recursive list of file paths that matches pattern
fileList = glob.glob('./*.log')
# Iterate over the list of filepaths & remove each file.
for filePath in fileList:
    try:
        os.remove(filePath)
    except OSError:
        print("Error while deleting file")


In [119]:
end_time =  time.time()
elapsed = end_time - start_time
print('time elapsed : {} seconds'.format(elapsed))

time elapsed : 22.070443630218506 seconds
