#### Merge three files to create the file to be analyzed. "LaborRSCCost" contains planned and actual dollars by Job. "jobAttributes" contains an expanded set of Job attributes (features). "Planning" has a few more attributes. Actual dollars will serve as the label.

In [10]:
import pandas as pd
import numpy as np
from getConfig  import getConfig

In [11]:
# Read the labor cost extract; it carries both the Plan and the Actual data
config = getConfig()
labor_cost_csv = config["dataLoc"] + "LaborRSCCost.csv"
jobs = pd.read_csv(labor_cost_csv)

In [12]:
def processPlan(df, col):
    """Return the Plan view of ``df``: the job key columns plus one amount column.

    Parameters:
        df:  DataFrame containing the columns 'Division', 'Program',
             'Planning Job', 'RSC', 'Pool', 'ARG' and the column named by ``col``.
        col: name of the column whose values become the 'Plan' column.

    Returns:
        A new DataFrame (independent of ``df``) with the columns
        'Div', 'Program', 'Job', 'RSC', 'Pool', 'ARG', 'Plan'.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    keep = ['Division', 'Program', 'Planning Job', 'RSC', 'Pool', 'ARG', col]
    # Copy explicitly: a bare df[keep] is tracked by pandas as a slice of the
    # caller's frame, so adding a column to the result later (e.g. "Year")
    # would emit SettingWithCopyWarning.
    out = df[keep].copy()
    # Renaming is positional and follows the order of ``keep``.
    out.columns = ['Div', 'Program', 'Job', 'RSC', 'Pool', 'ARG', 'Plan']
    return out

In [13]:
# Only the "Actual" column is used; the remaining columns are the keys for joining with Plan
def processActual(df, col):
    """Return the Actual view of ``df``: the join key columns plus one amount column.

    Parameters:
        df:  DataFrame containing the columns 'Planning Job', 'RSC', 'Pool',
             'ARG' and the column named by ``col``.
        col: name of the column whose values become the 'Actual' column.

    Returns:
        A new DataFrame (independent of ``df``) with the columns
        'Job', 'RSC', 'Pool', 'ARG', 'Actual'.

    Raises:
        KeyError: if any of the required columns is missing from ``df``.
    """
    keep = ['Planning Job', 'RSC', 'Pool', 'ARG', col]
    # Copy explicitly so the result is not tracked as a slice of the caller's
    # frame; later modifications then cannot trigger SettingWithCopyWarning.
    out = df[keep].copy()
    # Renaming is positional and follows the order of ``keep``.
    out.columns = ['Job', 'RSC', 'Pool', 'ARG', 'Actual']
    return out

In [14]:
# Merge Plan and Actual by unique key
def mergeDFs(plan, actual):
    """Inner-join ``plan`` and ``actual`` on the (Job, RSC, Pool, ARG) key.

    Parameters:
        plan:   DataFrame with the columns 'Job', 'RSC', 'Pool', 'ARG' plus
                its own value columns.
        actual: DataFrame with the same four key columns plus its own value
                columns.

    Returns:
        A DataFrame indexed by (Job, RSC, Pool, ARG) holding the non-key
        columns of both inputs, restricted to keys present in both.

    Neither input is modified, so the same frames can be merged again or
    inspected afterwards.
    """
    idx = ['Job', "RSC", 'Pool', 'ARG']
    # set_index without inplace=True returns new frames; mutating the caller's
    # frames would move the key columns into the index and make a second call
    # on the same frames fail with KeyError.
    keyed_plan = plan.set_index(idx)
    keyed_actual = actual.set_index(idx)
    return pd.merge(keyed_plan, keyed_actual, how='inner', left_index=True, right_index=True)

##### Match up the Plan and Actual in one record

In [15]:
# Get Plan and Actual for each of 2016 and 2017
dfList = []

# The Plan for a year is read from scenario "LRBP<year>_Final"; the Actual for
# that same year is read from the following year's scenario,
# "LRBP<year+1>_Final". Both use the " <year> 12 YTD " column.
for year in (2016, 2017):
    ytd_col = " {} 12 YTD ".format(year)

    plan = jobs.loc[jobs["Scenario"] == "LRBP{}_Final".format(year)]
    # assign() returns a new frame, so Year is never written onto a slice of
    # ``jobs`` (which would emit SettingWithCopyWarning).
    plan = processPlan(plan, ytd_col).assign(Year=year)

    actual = jobs.loc[jobs["Scenario"] == "LRBP{}_Final".format(year + 1)]
    actual = processActual(actual, ytd_col)

    dfList.append(mergeDFs(plan, actual))

# Stack the per-year results into a single frame
merge1 = pd.concat(dfList)

##### Append more Job attributes

In [16]:
# Move the current index levels back into columns, then index the rows by Job alone
merge1 = merge1.reset_index().set_index("Job")

In [17]:
attributes = pd.read_csv(config["dataLoc"]+"jobAttributes.csv")

# Columns carried forward from the attributes file; "Planning Job" is the join key
keep = [
    "Planning Job", "Category", "Tool Functionality", "Parent", "Work Site",
    "Divisional Support Code", "Authorization Status", "Customer Type",
    "Country", "BD Status", "Market Segment", "End User Service",
    "Contract Type", "Major EAC", "Income Statement Type", "Calc Type",
    "Revenue Calc Type", "Auto Earn Adj", "Balance Sheet Type", "Award Fee",
    "ROC", "ROS", "Bow Wave",
]
attributes = attributes[keep].set_index("Planning Job")

# For each Job, append the attributes
before = merge1.shape[0]
merge2 = merge1.merge(attributes, left_index=True, right_index=True)
after = merge2.shape[0]

# Count the rows whose Job has no attributes record directly. The difference
# before - after is only correct when every Job appears at most once in
# ``attributes``; duplicate keys there multiply rows in the merge and would
# hide (or even turn negative) the number of unmatched rows.
missing = (~merge1.index.isin(attributes.index)).sum()
print("{} jobs not found".format(missing))

0 jobs not found


##### More Job attributes. This file is year-specific, so make Year part of the key

In [18]:
# Read the Planning attributes file from the configured data location
planning_csv = config["dataLoc"] + "Planning-formatted.csv"
attributes = pd.read_csv(planning_csv)

In [19]:
# The merge dropped the index name, so label the index "Job" again
merge2.index.names = ['Job']

# Re-key the rows on the (Job, Year) pair
merge2 = merge2.reset_index().set_index(["Job", "Year"])

In [20]:
# Set same index for these new attributes
attributes = attributes.set_index(["Job", "Year"])

# For each Job/Year, get the attributes
before = merge2.shape[0]
final = merge2.merge(attributes, left_index=True, right_index=True)
after = final.shape[0]

# Count the rows whose (Job, Year) key has no attributes record directly.
# The difference before - after is only correct when each key appears at most
# once in ``attributes``; duplicate keys multiply rows in the merge and would
# distort the reported number.
missing = (~merge2.index.isin(attributes.index)).sum()
print("{} jobs not found".format(missing))

648 jobs not found


In [21]:
# Turn the index levels back into ordinary columns
final = final.reset_index()

In [22]:
# The label column ("Actual") goes last; all other columns keep their relative order
label = "Actual"
cols = [name for name in final.columns if name != label] + [label]
final = final[cols]

In [23]:
# Write the assembled data set next to the input files, without the row index
output_csv = config["dataLoc"] + "final.csv"
final.to_csv(output_csv, index=False)