In [1]:
# chavez & coombs 
# 13976(v1).ipynb
# 2020-08-27
# data management for deltek 13976

Steps:
1. Download the deltek data as an Excel file
2. Save 'AP2 Regular & OT Hours for Adv by CCC_#505ap.xlsx' on U:\
3. Run the code below

In [2]:
# to determine conda version or conda and python versions, from a cmd window at the ">" prompt:
# > conda -V
#   conda 4.2.9
# > python --version
#   Python 3.5.2 :: Anaconda 4.2.0 (64-bit)  

In [3]:
# import modules
import pandas
import os
import sys
import gc

In [4]:
# current working directory
print(os.getcwd())

\\ain2\dfsroot\userdata48\willitc0\Data


"read_csv() vs read_excel() in pandas: When to use which and why" by Ashwin A. Vardhan

Have you ever wondered if that excel file that you have, can be made to read faster instead of sitting idle for 10 minutes while your code scans through it? I’ll try to answer it here with a personal experience of mine.

So, I was performing some operations on an excel file using pandas, whose dimensions were 509579 x 240, and it had a size of 295 MB. However, reading that file took about 528 seconds (average over 10 iterations), whereas, on converting it to csv and reading it (using pandas) took just 13 seconds (again, average over 10 iterations), which is an improvement of about 40 times. As you can observe, the file was too huge, and read_excel is just slower in performance. This has been mentioned on stackoverflow too.

So, when will you actually need to convert an excel file to csv before processing it? Usually, if your excel file is small (~100,000 rows, ~50 columns), there won’t be much of a performance issue, unless, you need to run that process very frequently, because, that small delay may get compunded and bite you! But, if your excel file is massive, and you need to process it frequently (or not), it’s better to first convert the excel into a csv, and voila! see the magic happen.

In [5]:
# read data
# df = pd.read_csv (r'Path where the CSV file is stored\File name.csv')
data = pandas.read_excel (r'U:\AP2 Regular & OT Hours for Adv by CCC_#505ap.xlsx')

In [6]:
# Pandas has the following data types:
#    object
#    int64
#    float64
#    bool
#    datetime64
#    timedelta[ns]
#    category

In [7]:
# check metadata
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231795 entries, 0 to 231794
Data columns (total 23 columns):
User                 231795 non-null object
User Id              231795 non-null object
Work Date            231795 non-null datetime64[ns]
Month                231795 non-null datetime64[ns]
Description          231795 non-null object
REF ID               231795 non-null int64
CLIN                 231795 non-null object
WBS                  39929 non-null object
TASK                 231795 non-null object
TPID                 231795 non-null object
ProjNo               231795 non-null object
WB2                  231795 non-null object
Project Id           231795 non-null object
Pay Type             231795 non-null object
Corporation          231795 non-null object
Company Name         231795 non-null object
Approver             231795 non-null object
Status               231795 non-null object
Straight             231795 non-null float64
Overtime             231795 non-null f

In [8]:
# trim data
df = pandas.DataFrame(data, columns= ['User','Work Date', 'Description', 'TPID', 'Straight', 'Overtime', 'Total'])
df.head(2)

Unnamed: 0,User,Work Date,Description,TPID,Straight,Overtime,Total
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,13976.00.01.0111.999,0.0,0.0,0.0
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,13976.00.01.0111.999,0.0,0.0,0.0


In [9]:
# create option year column based on work date
# df1['date'] = pandas.to_datetime(df1['date'])
oy0_mask = (df['Work Date']>='2009-08-03') & (df['Work Date']<='2010-08-02')
oy1_mask = (df['Work Date']>='2010-08-03') & (df['Work Date']<='2011-08-02')
oy2_mask = (df['Work Date']>='2011-08-03') & (df['Work Date']<='2012-08-02')
oy3_mask = (df['Work Date']>='2012-08-03') & (df['Work Date']<='2013-08-02')
oy4_mask = (df['Work Date']>='2013-08-03') & (df['Work Date']<='2014-08-02')
oy5_mask = (df['Work Date']>='2014-08-03') & (df['Work Date']<='2015-08-02')
oy6_mask = (df['Work Date']>='2015-08-03') & (df['Work Date']<='2016-08-02')
oy7_mask = (df['Work Date']>='2016-08-03') & (df['Work Date']<='2017-08-02')
oy8_mask = (df['Work Date']>='2017-08-03') & (df['Work Date']<='2018-08-02')
oy9_mask = (df['Work Date']>='2018-08-03') & (df['Work Date']<='2019-08-02')
bridge_mask = (df['Work Date']>='2019-08-03')

df.loc[oy0_mask, 'OptYr'] = 'OY0'
df.loc[oy1_mask, 'OptYr'] = 'OY1'
df.loc[oy2_mask, 'OptYr'] = 'OY2'
df.loc[oy3_mask, 'OptYr'] = 'OY3'
df.loc[oy4_mask, 'OptYr'] = 'OY4'
df.loc[oy5_mask, 'OptYr'] = 'OY5'
df.loc[oy6_mask, 'OptYr'] = 'OY6'
df.loc[oy7_mask, 'OptYr'] = 'OY7'
df.loc[oy8_mask, 'OptYr'] = 'OY8'
df.loc[oy9_mask, 'OptYr'] = 'OY9'
df.loc[bridge_mask, 'OptYr'] = 'OY.'

df

Unnamed: 0,User,Work Date,Description,TPID,Straight,Overtime,Total,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,13976.00.01.0111.999,0.0,0.0,0.0,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,13976.00.01.0111.999,0.0,0.0,0.0,OY.
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,13976.00.01.0111.999,8.0,0.0,8.0,OY.
3,"SCHNEIDER, LAURA E",2020-08-07,Standby Only - COVID-19,13976.00.01.0111.999,8.0,0.0,8.0,OY.
4,"SCHNEIDER, LAURA E",2020-08-10,Software Services,13976.00.00.0011.509,0.0,0.0,0.0,OY.
5,"SCHNEIDER, LAURA E",2020-08-10,Software Services,13976.00.01.0111.509,8.0,0.0,8.0,OY.
6,"SCHNEIDER, LAURA E",2020-08-24,Software Services,13976.00.01.0111.509,8.0,0.0,8.0,OY.
7,"SCHNEIDER, LAURA E",2020-07-15,Software Services,13976.00.01.0111.509,8.0,0.0,8.0,OY.
8,"SCHNEIDER, LAURA E",2020-07-23,Standby Only - COVID-19,13976.00.01.0111.999,8.0,0.0,8.0,OY.
9,"SCHNEIDER, LAURA E",2020-07-28,Software Services,13976.00.01.0111.509,8.0,0.0,8.0,OY.


In [10]:
# count number of rows associated with different values of id
df['OptYr'].value_counts()

OY.    231787
OY9         8
Name: OptYr, dtype: int64

In [11]:
df[df['OptYr'] == 'OY9']

Unnamed: 0,User,Work Date,Description,TPID,Straight,Overtime,Total,OptYr
8118,"RICE, SCOTT C",2019-08-01,Overhead Req Mgmt,13976.00.00.0012.OH2,0.0,0.0,0.0,OY9
62738,"RONEY-DEMARY, MELISSA W",2019-08-02,Software Services,13976.00.00.0011.509,0.0,0.0,0.0,OY9
105030,"JACKSON, NAKIA",2019-08-02,Software Services,13976.00.00.0011.509,0.0,0.0,0.0,OY9
110635,"BARNES, RONELL",2019-08-02,Software Services,13976.00.00.0011.509,0.0,0.0,0.0,OY9
129997,"MIRANDO, JAMES A",2019-08-01,Warehouse Operations,13976.00.00.0011.516,0.0,0.0,0.0,OY9
130102,"MIRANDO, JAMES A",2019-08-02,Warehouse Operations,13976.00.00.0011.516,0.0,0.0,0.0,OY9
191843,"Sykes, Loyda",2019-08-02,Warehouse Operations,13976.00.00.0011.516,0.0,0.0,0.0,OY9
192029,"Sykes, Loyda",2019-08-01,Warehouse Operations,13976.00.00.0011.516,0.0,0.0,0.0,OY9


In [12]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231795 entries, 0 to 231794
Data columns (total 8 columns):
User           231795 non-null object
Work Date      231795 non-null datetime64[ns]
Description    231795 non-null object
TPID           231795 non-null object
Straight       231795 non-null float64
Overtime       231795 non-null float64
Total          231795 non-null float64
OptYr          231795 non-null object
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 14.1+ MB


In [13]:
# check dataframe dimensions (rows x columns)
df.shape

(231795, 8)

In [14]:
# format data
# round floating numbers to two decimal places in python pandas 
pandas.options.display.float_format = '{:.2f}'.format

In [15]:
# check data types
df.dtypes

User                   object
Work Date      datetime64[ns]
Description            object
TPID                   object
Straight              float64
Overtime              float64
Total                 float64
OptYr                  object
dtype: object

In [16]:
# add clin placeholder column
df.insert(3, 'CLIN', '')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,TPID,Straight,Overtime,Total,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,,13976.00.01.0111.999,0.0,0.0,0.0,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,,13976.00.01.0111.999,0.0,0.0,0.0,OY.
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,,13976.00.01.0111.999,8.0,0.0,8.0,OY.


In [17]:
# add wbs placeholder column
df.insert(4, 'WBS', '')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,OY.
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,,,13976.00.01.0111.999,8.0,0.0,8.0,OY.


In [18]:
# add SR placeholder column
df.insert(9, 'SR_temp', '')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,,,13976.00.01.0111.999,8.0,0.0,8.0,,OY.


In [19]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231795 entries, 0 to 231794
Data columns (total 11 columns):
User           231795 non-null object
Work Date      231795 non-null datetime64[ns]
Description    231795 non-null object
CLIN           231795 non-null object
WBS            231795 non-null object
TPID           231795 non-null object
Straight       231795 non-null float64
Overtime       231795 non-null float64
Total          231795 non-null float64
SR_temp        231795 non-null object
OptYr          231795 non-null object
dtypes: datetime64[ns](1), float64(3), object(7)
memory usage: 19.5+ MB


In [20]:
# column a
df['User'].head(2)

0    SCHNEIDER, LAURA E
1    SCHNEIDER, LAURA E
Name: User, dtype: object

In [21]:
# column b
# modify workdate format
df['Work Date'] = pandas.to_datetime(df['Work Date']).dt.strftime('%Y-%m-%d')
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.


In [22]:
# column c
df['Description'].head(2)

0    Standby Only - COVID-19
1    Standby Only - COVID-19
Name: Description, dtype: object

In [23]:
# column d
# get clin from tpid
df['CLIN'] = data.TPID.str.split(".", expand=True,)[3]
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.


In [24]:
# column e
# get wbs from tpid
df['WBS'] = data.TPID.str.split(".", expand=True,)[4]
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.


In [25]:
# column f
# tpid
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.


In [26]:
# save syntax to modfiy column order
# df = df[['User', 'Work Date', 'Description', 'CLIN', 'WBS', 'TPID', 'Straight', 'Overtime', 'Total', 'SR', 'OptYr']]

In [27]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231795 entries, 0 to 231794
Data columns (total 11 columns):
User           231795 non-null object
Work Date      231795 non-null object
Description    231795 non-null object
CLIN           231795 non-null object
WBS            231795 non-null object
TPID           231795 non-null object
Straight       231795 non-null float64
Overtime       231795 non-null float64
Total          231795 non-null float64
SR_temp        231795 non-null object
OptYr          231795 non-null object
dtypes: float64(3), object(8)
memory usage: 19.5+ MB


In [28]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 231795 entries, 0 to 231794
Data columns (total 23 columns):
User                 231795 non-null object
User Id              231795 non-null object
Work Date            231795 non-null datetime64[ns]
Month                231795 non-null datetime64[ns]
Description          231795 non-null object
REF ID               231795 non-null int64
CLIN                 231795 non-null object
WBS                  39929 non-null object
TASK                 231795 non-null object
TPID                 231795 non-null object
ProjNo               231795 non-null object
WB2                  231795 non-null object
Project Id           231795 non-null object
Pay Type             231795 non-null object
Corporation          231795 non-null object
Company Name         231795 non-null object
Approver             231795 non-null object
Status               231795 non-null object
Straight             231795 non-null float64
Overtime             231795 non-null f

In [29]:
# what size are some of the objects?
print('data', sys.getsizeof(data))
print('df', sys.getsizeof(df))
print('Straight', sys.getsizeof(df['Straight']))
print('TPID', sys.getsizeof(df['TPID']))

data 252265296
df 130662941
Straight 1854464
TPID 18167751


In [30]:
# delete original dataframe to free up memory
del[data]

In [31]:
# save delete datafrome column syntax
# df = df.drop('Company Name', 1)

In [32]:
gc.collect()

73

In [33]:
# add tpid character count column
df['tpid_length'] = df['TPID'].str.len()
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,,OY.,20


In [34]:
df['tpid_length'].value_counts()

20    191866
28     39929
Name: tpid_length, dtype: int64

In [35]:
# where tpid_length = 20
df[df['tpid_length'] == 20].head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,,OY.,20


In [36]:
# where tpid_length = 28
df[df['tpid_length'] == 28].head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
69,"CHERRY, CHRISTOPHER H",2020-08-24,Installation-Fixed Price,114,0,13976.00.01.0114.000.5913720,8.0,0.0,8.0,,OY.,28
2427,"TIMMONS, CARL A",2019-08-22,Installation-Fixed Price,14,0,13976.00.00.0014.000.5522685,8.0,0.0,8.0,,OY.,28
2428,"TIMMONS, CARL A",2019-09-12,Installation-Fixed Price,14,0,13976.00.00.0014.000.5364379,8.0,0.0,8.0,,OY.,28


In [37]:
# filter on tpid_length
df20 = df[df['tpid_length'] == 20]
df20.shape

(191866, 12)

In [38]:
# filter on tpid_length
df28 = df[df['tpid_length'] == 28]
df28.shape

(39929, 12)

In [39]:
df.shape

(231795, 12)

In [40]:
del[df]
gc.collect()

398

In [41]:
df20.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,,OY.,20


In [42]:
# SR column
df20.insert(10, 'SR', 'SR-0000000')
df20.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,SR-0000000,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,SR-0000000,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,,SR-0000000,OY.,20


In [43]:
# delete SR_temp column
df20 = df20.drop('SR_temp', 1)
df20.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,SR-0000000,OY.,20


In [44]:
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
69,"CHERRY, CHRISTOPHER H",2020-08-24,Installation-Fixed Price,114,0,13976.00.01.0114.000.5913720,8.0,0.0,8.0,,OY.,28
2427,"TIMMONS, CARL A",2019-08-22,Installation-Fixed Price,14,0,13976.00.00.0014.000.5522685,8.0,0.0,8.0,,OY.,28
2428,"TIMMONS, CARL A",2019-09-12,Installation-Fixed Price,14,0,13976.00.00.0014.000.5364379,8.0,0.0,8.0,,OY.,28


In [45]:
# get sr from tpid
df28['SR_no'] = df28.TPID.str.split(".", expand=True,)[5]
df28.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length,SR_no
69,"CHERRY, CHRISTOPHER H",2020-08-24,Installation-Fixed Price,114,0,13976.00.01.0114.000.5913720,8.0,0.0,8.0,,OY.,28,5913720
2427,"TIMMONS, CARL A",2019-08-22,Installation-Fixed Price,14,0,13976.00.00.0014.000.5522685,8.0,0.0,8.0,,OY.,28,5522685
2428,"TIMMONS, CARL A",2019-09-12,Installation-Fixed Price,14,0,13976.00.00.0014.000.5364379,8.0,0.0,8.0,,OY.,28,5364379


In [46]:
df28.insert(10, 'SR_prefix', 'SR-')
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR_prefix,OptYr,tpid_length,SR_no
69,"CHERRY, CHRISTOPHER H",2020-08-24,Installation-Fixed Price,114,0,13976.00.01.0114.000.5913720,8.0,0.0,8.0,,SR-,OY.,28,5913720
2427,"TIMMONS, CARL A",2019-08-22,Installation-Fixed Price,14,0,13976.00.00.0014.000.5522685,8.0,0.0,8.0,,SR-,OY.,28,5522685
2428,"TIMMONS, CARL A",2019-09-12,Installation-Fixed Price,14,0,13976.00.00.0014.000.5364379,8.0,0.0,8.0,,SR-,OY.,28,5364379


In [47]:
df28['SR'] = df28['SR_prefix'] + df28['SR_no']
df28.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR_prefix,OptYr,tpid_length,SR_no,SR
69,"CHERRY, CHRISTOPHER H",2020-08-24,Installation-Fixed Price,114,0,13976.00.01.0114.000.5913720,8.0,0.0,8.0,,SR-,OY.,28,5913720,SR-5913720
2427,"TIMMONS, CARL A",2019-08-22,Installation-Fixed Price,14,0,13976.00.00.0014.000.5522685,8.0,0.0,8.0,,SR-,OY.,28,5522685,SR-5522685
2428,"TIMMONS, CARL A",2019-09-12,Installation-Fixed Price,14,0,13976.00.00.0014.000.5364379,8.0,0.0,8.0,,SR-,OY.,28,5364379,SR-5364379


In [48]:
# delete SR_temp column
df28 = df28.drop('SR_temp', 1)
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_prefix,OptYr,tpid_length,SR_no,SR
69,"CHERRY, CHRISTOPHER H",2020-08-24,Installation-Fixed Price,114,0,13976.00.01.0114.000.5913720,8.0,0.0,8.0,SR-,OY.,28,5913720,SR-5913720
2427,"TIMMONS, CARL A",2019-08-22,Installation-Fixed Price,14,0,13976.00.00.0014.000.5522685,8.0,0.0,8.0,SR-,OY.,28,5522685,SR-5522685
2428,"TIMMONS, CARL A",2019-09-12,Installation-Fixed Price,14,0,13976.00.00.0014.000.5364379,8.0,0.0,8.0,SR-,OY.,28,5364379,SR-5364379


In [49]:
# delete SR_prefix column
df28 = df28.drop('SR_prefix', 1)
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,OptYr,tpid_length,SR_no,SR
69,"CHERRY, CHRISTOPHER H",2020-08-24,Installation-Fixed Price,114,0,13976.00.01.0114.000.5913720,8.0,0.0,8.0,OY.,28,5913720,SR-5913720
2427,"TIMMONS, CARL A",2019-08-22,Installation-Fixed Price,14,0,13976.00.00.0014.000.5522685,8.0,0.0,8.0,OY.,28,5522685,SR-5522685
2428,"TIMMONS, CARL A",2019-09-12,Installation-Fixed Price,14,0,13976.00.00.0014.000.5364379,8.0,0.0,8.0,OY.,28,5364379,SR-5364379


In [50]:
# delete SR_no column
df28 = df28.drop('SR_no', 1)
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,OptYr,tpid_length,SR
69,"CHERRY, CHRISTOPHER H",2020-08-24,Installation-Fixed Price,114,0,13976.00.01.0114.000.5913720,8.0,0.0,8.0,OY.,28,SR-5913720
2427,"TIMMONS, CARL A",2019-08-22,Installation-Fixed Price,14,0,13976.00.00.0014.000.5522685,8.0,0.0,8.0,OY.,28,SR-5522685
2428,"TIMMONS, CARL A",2019-09-12,Installation-Fixed Price,14,0,13976.00.00.0014.000.5364379,8.0,0.0,8.0,OY.,28,SR-5364379


In [51]:
# standardize column order
df20 = df20[['User', 'Work Date', 'Description', 'CLIN', 'WBS', 'TPID', 'Straight', 'Overtime', 'Total', 'SR', 'OptYr', 'tpid_length']]
df28 = df28[['User', 'Work Date', 'Description', 'CLIN', 'WBS', 'TPID', 'Straight', 'Overtime', 'Total', 'SR', 'OptYr', 'tpid_length']]

In [52]:
# concatenate vertical dataframes
df = pandas.concat([df20, df28], axis=0)

In [53]:
df20.shape

(191866, 12)

In [54]:
df28.shape

(39929, 12)

In [55]:
# check columns
df.shape

(231795, 12)

In [56]:
del[df20, df28]

In [57]:
gc.collect()

308

In [58]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 231795 entries, 0 to 231751
Data columns (total 12 columns):
User           231795 non-null object
Work Date      231795 non-null object
Description    231795 non-null object
CLIN           231795 non-null object
WBS            231795 non-null object
TPID           231795 non-null object
Straight       231795 non-null float64
Overtime       231795 non-null float64
Total          231795 non-null float64
SR             231795 non-null object
OptYr          231795 non-null object
tpid_length    231795 non-null int64
dtypes: float64(3), int64(1), object(8)
memory usage: 23.0+ MB


In [59]:
# data check
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,SR-0000000,OY.,20


In [60]:
# delete tpid_length column
df = df.drop('tpid_length', 1)
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,SR-0000000,OY.


In [61]:
df.to_excel (r'U:\AP2 Regular & OT Hours for Adv by CCC_#505ap-modified.xlsx', index = False, header=True)