In [1]:
# chavez & coombs 
# cognos_data_management_613976(v1).ipynb
# 2020-10-14
# this script makes cognos-output data compatible with chavez-created analytic processes

Steps:
1. Download the deltek data as an Excel file
2. Save 'AP2 Regular & OT Hours for Adv by CCC_#505ap.xlsx' as a CSV file
3. Run the code below

In [2]:
# to determine conda version or conda and python versions, from a cmd window at the ">" prompt:
# > conda -V
#   conda 4.2.9
# > python --version
#   Python 3.5.2 :: Anaconda 4.2.0 (64-bit)  

In [3]:
# import modules
import pandas
import os
import sys
import gc
# import numpy
# import matplotlib
# import pylab

In [4]:
# current working directory
print(os.getcwd())

\\ain2\dfsroot\userdata48\willitc0\Data


"read_csv() vs read_excel() in pandas: When to use which and why" by Ashwin A. Vardhan

Have you ever wondered if that excel file that you have, can be made to read faster instead of sitting idle for 10 minutes while your code scans through it? I’ll try to answer it here with a personal experience of mine.

So, I was performing some operations on an excel file using pandas, whose dimensions were 509579 x 240, and it had a size of 295 MB. However, reading that file took about 528 seconds (average over 10 iterations), whereas, on converting it to csv and reading it (using pandas) took just 13 seconds (again, average over 10 iterations), which is an improvement of about 40 times. As you can observe, the file was too huge, and read_excel is just slower in performance. This has been mentioned on stackoverflow too.

So, when will you actually need to convert an excel file to csv before processing it? Usually, if your excel file is small (~100,000 rows, ~50 columns), there won’t be much of a performance issue, unless you need to run that process very frequently, because that small delay may get compounded and bite you! But if your excel file is massive, and you need to process it frequently (or not), it’s better to first convert the excel file into a csv and, voila, see the magic happen.

In [5]:
# Load the CSV copy of the workbook (step 2 above) into `data`.
# Template: pandas.read_csv(r'<folder holding the CSV>\<file name>.csv')
# low_memory=False: parse the file in a single pass rather than in chunks.
data = pandas.read_csv(
    filepath_or_buffer=r'U:\AP2 Regular & OT Hours for Adv by CCC_#505ap.csv',
    low_memory=False
)

Pandas has the following data types:
    object
    int64
    float64
    bool
    datetime64[ns]
    timedelta64[ns]
    category

In [6]:
# check metadata
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5408 entries, 0 to 5407
Data columns (total 23 columns):
User                 5408 non-null object
User Id              5408 non-null object
Work Date            5408 non-null object
Month                5408 non-null object
Description          5408 non-null object
REF ID               5408 non-null int64
CLIN                 5408 non-null float64
WBS                  826 non-null object
TASK                 5408 non-null object
TPID                 5408 non-null object
ProjNo               5408 non-null int64
WB2                  5408 non-null object
Project Id           5408 non-null object
Pay Type             5408 non-null object
Corporation          5408 non-null object
Company Name         5408 non-null object
Approver             5407 non-null object
Status               5408 non-null object
Straight             5408 non-null float64
Overtime             5408 non-null float64
Total                5408 non-null float64
Date Time 

In [7]:
# trim data: keep only the columns the later cells use.
# Label-based selection raises KeyError if the export is missing one of these
# columns (DataFrame(data, columns=...) would silently create an all-NaN column),
# and .copy() makes df independent of data before columns are inserted into it.
df = data[['User', 'Work Date', 'Description', 'TPID', 'Straight', 'Overtime', 'Total']].copy()
df.head(2)

Unnamed: 0,User,Work Date,Description,TPID,Straight,Overtime,Total
0,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,613976.00.01.0111.509,10.0,0.0,10.0
1,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,613976.00.01.0111.509,10.0,0.0,10.0


In [8]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5408 entries, 0 to 5407
Data columns (total 7 columns):
User           5408 non-null object
Work Date      5408 non-null object
Description    5408 non-null object
TPID           5408 non-null object
Straight       5408 non-null float64
Overtime       5408 non-null float64
Total          5408 non-null float64
dtypes: float64(3), object(4)
memory usage: 295.8+ KB


In [9]:
# check dataframe dimensions (rows x columns)
df.shape

(5408, 7)

In [10]:
# display formatting only: show floats with two decimal places
# (the stored values are not rounded)
pandas.set_option('display.float_format', '{:.2f}'.format)

In [11]:
# check data types
df.dtypes

User            object
Work Date       object
Description     object
TPID            object
Straight       float64
Overtime       float64
Total          float64
dtype: object

In [12]:
# add an empty CLIN placeholder as the fourth column (zero-based position 3);
# it is filled from TPID in a later cell
df.insert(loc=3, column='CLIN', value='')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,TPID,Straight,Overtime,Total
0,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,,613976.00.01.0111.509,10.0,0.0,10.0
1,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,,613976.00.01.0111.509,10.0,0.0,10.0
2,"SCHNEIDER, LAURA E","Oct 7, 2020",Software Services,,613976.00.01.0111.509,10.0,0.0,10.0


In [13]:
# add an empty WBS placeholder right after CLIN (zero-based position 4);
# it is filled from TPID in a later cell
df.insert(loc=4, column='WBS', value='')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total
0,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0
1,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0
2,"SCHNEIDER, LAURA E","Oct 7, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0


In [14]:
# add an empty SR placeholder after Total (zero-based position 9)
df.insert(loc=9, column='SR_temp', value='')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp
0,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,
1,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,
2,"SCHNEIDER, LAURA E","Oct 7, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,


In [15]:
# add the option-year column as the last column, pre-filled with the 'OY..' placeholder
df.insert(loc=10, column='OptYr', value='OY..')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"SCHNEIDER, LAURA E","Oct 6, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
2,"SCHNEIDER, LAURA E","Oct 7, 2020",Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [16]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5408 entries, 0 to 5407
Data columns (total 11 columns):
User           5408 non-null object
Work Date      5408 non-null object
Description    5408 non-null object
CLIN           5408 non-null object
WBS            5408 non-null object
TPID           5408 non-null object
Straight       5408 non-null float64
Overtime       5408 non-null float64
Total          5408 non-null float64
SR_temp        5408 non-null object
OptYr          5408 non-null object
dtypes: float64(3), object(8)
memory usage: 464.8+ KB


In [17]:
# column a
df['User'].head(2)

0    SCHNEIDER, LAURA E
1    SCHNEIDER, LAURA E
Name: User, dtype: object

In [18]:
# column b
# parse the text dates (e.g. 'Oct 6, 2020') and store them back as 'YYYY-MM-DD' strings
df['Work Date'] = df['Work Date'].pipe(pandas.to_datetime).dt.strftime('%Y-%m-%d')
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [19]:
# column c
df['Description'].head(2)

0    Software Services
1    Software Services
Name: Description, dtype: object

In [20]:
# column d
# get clin from tpid: CLIN is the fourth dot-separated segment of TPID
# (e.g. '613976.00.01.0111.509' -> '0111').
# Read TPID from df itself rather than from data, so this cell no longer depends
# on the original frame still being in memory.
df['CLIN'] = df['TPID'].str.split('.', expand=True)[3]
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [21]:
# column e
# get wbs from tpid: WBS is the fifth dot-separated segment of TPID
# (e.g. '613976.00.01.0111.509' -> '509').
# Read TPID from df itself rather than from data, so this cell no longer depends
# on the original frame still being in memory.
df['WBS'] = df['TPID'].str.split('.', expand=True)[4]
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [22]:
# column f
# tpid
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [23]:
# save syntax to modify column order
# df = df[['User', 'Work Date', 'Description', 'CLIN', 'WBS', 'TPID', 'Straight', 'Overtime', 'Total', 'SR', 'OptYr']]

In [24]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5408 entries, 0 to 5407
Data columns (total 11 columns):
User           5408 non-null object
Work Date      5408 non-null object
Description    5408 non-null object
CLIN           5408 non-null object
WBS            5408 non-null object
TPID           5408 non-null object
Straight       5408 non-null float64
Overtime       5408 non-null float64
Total          5408 non-null float64
SR_temp        5408 non-null object
OptYr          5408 non-null object
dtypes: float64(3), object(8)
memory usage: 464.8+ KB


In [25]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5408 entries, 0 to 5407
Data columns (total 23 columns):
User                 5408 non-null object
User Id              5408 non-null object
Work Date            5408 non-null object
Month                5408 non-null object
Description          5408 non-null object
REF ID               5408 non-null int64
CLIN                 5408 non-null float64
WBS                  826 non-null object
TASK                 5408 non-null object
TPID                 5408 non-null object
ProjNo               5408 non-null int64
WB2                  5408 non-null object
Project Id           5408 non-null object
Pay Type             5408 non-null object
Corporation          5408 non-null object
Company Name         5408 non-null object
Approver             5407 non-null object
Status               5408 non-null object
Straight             5408 non-null float64
Overtime             5408 non-null float64
Total                5408 non-null float64
Date Time 

In [26]:
# report the size in bytes (sys.getsizeof) of both frames and two of df's columns,
# one "label size" pair per line
print(
    *(
        '{} {}'.format(label, sys.getsizeof(obj))
        for label, obj in [
            ('data', data),
            ('df', df),
            ('Straight', df['Straight']),
            ('TPID', df['TPID']),
        ]
    ),
    sep='\n'
)

data 5986481
df 3018692
Straight 43368
TPID 428536


In [27]:
# the trimmed copy df is all that is needed from here on;
# unbind the original frame so its memory can be reclaimed
del data

In [28]:
# save delete dataframe column syntax
# df = df.drop('Company Name', axis=1)

In [29]:
gc.collect()

19

In [30]:
# append a column holding the number of characters in each TPID;
# the length is used below to tell TPIDs with an SR segment from those without
df.insert(len(df.columns), 'tpid_length', df['TPID'].str.len())
df['tpid_length'].head(3)

0    21
1    21
2    21
Name: tpid_length, dtype: int64

In [31]:
# preview the first rows whose TPID is 29 characters long
df.loc[df['tpid_length'] == 29, 'tpid_length'].head(3)

128    29
129    29
268    29
Name: tpid_length, dtype: int64

In [32]:
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21


In [33]:
# preview the first rows whose TPID is NOT 29 characters long
df.loc[df['tpid_length'].ne(29)].head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
2,"SCHNEIDER, LAURA E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21


In [34]:
# filter on tpid_length: rows whose TPID is not 29 characters long, i.e. without
# a trailing SR segment (21 characters in the sample rows shown above).
# .copy() makes df21 an independent frame, so the column inserts/drops below
# never act on a slice of df.
df21 = df[df['tpid_length'] != 29].copy()
df21.shape

(4582, 12)

In [35]:
df21.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
2,"SCHNEIDER, LAURA E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21


In [36]:
# filter on tpid_length: rows whose TPID is 29 characters long, i.e. with a
# trailing 7-digit SR segment (e.g. '613976.00.01.0112.200.5808704').
# .copy() makes df29 an independent frame, so the column assignments below
# (df29['SR_no'] = ..., df29['SR'] = ...) never write to a slice of df and
# cannot trigger SettingWithCopyWarning.
df29 = df[df['tpid_length'] == 29].copy()
df29.shape

(826, 12)

In [38]:
# sanity check: every row of df landed in exactly one of the two subsets,
# so this difference should be zero
len(df) - len(df21) - len(df29)

0

In [39]:
# memory management: df21 and df29 now hold all rows, so unbind df
# and run a collection pass (the number displayed is gc.collect()'s return value)
del df
gc.collect()

483

In [40]:
# SR column: these TPIDs carry no SR segment, so every row gets the
# placeholder value 'SR-0000000'
df21.insert(loc=10, column='SR', value='SR-0000000')
df21.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,SR-0000000,OY..,21
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,SR-0000000,OY..,21
2,"SCHNEIDER, LAURA E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,SR-0000000,OY..,21


In [41]:
# delete SR_temp column (replaced by SR above).
# axis is passed by keyword: the positional form drop(label, 1) is rejected by
# pandas >= 2.0, while axis=1 works on old and new versions alike.
df21 = df21.drop('SR_temp', axis=1)
df21.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
2,"SCHNEIDER, LAURA E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21


In [42]:
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
128,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,,OY..,29
129,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,,OY..,29
268,"MOCABEE, KEVIN S",2020-10-05,Installation-Fixed Price,114,0,613976.00.01.0114.000.5853692,0.5,0.0,0.5,,OY..,29


In [43]:
# get sr from tpid: the SR number is the sixth dot-separated segment
# (e.g. '613976.00.01.0112.200.5808704' -> '5808704')
df29['SR_no'] = df29['TPID'].str.split('.', expand=True)[5]
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length,SR_no
128,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,,OY..,29,5808704
129,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,,OY..,29,5808704
268,"MOCABEE, KEVIN S",2020-10-05,Installation-Fixed Price,114,0,613976.00.01.0114.000.5853692,0.5,0.0,0.5,,OY..,29,5853692


In [44]:
# helper column holding the constant 'SR-' prefix; joined with SR_no in the next cell
df29.insert(loc=10, column='SR_prefix', value='SR-')
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR_prefix,OptYr,tpid_length,SR_no
128,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,,SR-,OY..,29,5808704
129,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,,SR-,OY..,29,5808704
268,"MOCABEE, KEVIN S",2020-10-05,Installation-Fixed Price,114,0,613976.00.01.0114.000.5853692,0.5,0.0,0.5,,SR-,OY..,29,5853692


In [45]:
# build the final SR value by element-wise concatenation: 'SR-' + '5808704' -> 'SR-5808704'
df29['SR'] = df29['SR_prefix'].add(df29['SR_no'])
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR_prefix,OptYr,tpid_length,SR_no,SR
128,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,,SR-,OY..,29,5808704,SR-5808704
129,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,,SR-,OY..,29,5808704,SR-5808704
268,"MOCABEE, KEVIN S",2020-10-05,Installation-Fixed Price,114,0,613976.00.01.0114.000.5853692,0.5,0.0,0.5,,SR-,OY..,29,5853692,SR-5853692


In [46]:
# delete SR_temp column (replaced by SR).
# axis is passed by keyword: the positional form drop(label, 1) is rejected by
# pandas >= 2.0, while axis=1 works on old and new versions alike.
df29 = df29.drop('SR_temp', axis=1)
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_prefix,OptYr,tpid_length,SR_no,SR
128,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,SR-,OY..,29,5808704,SR-5808704
129,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,SR-,OY..,29,5808704,SR-5808704
268,"MOCABEE, KEVIN S",2020-10-05,Installation-Fixed Price,114,0,613976.00.01.0114.000.5853692,0.5,0.0,0.5,SR-,OY..,29,5853692,SR-5853692


In [47]:
# delete SR_prefix column (helper used only to build SR).
# axis is passed by keyword: the positional form drop(label, 1) is rejected by
# pandas >= 2.0, while axis=1 works on old and new versions alike.
df29 = df29.drop('SR_prefix', axis=1)
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,OptYr,tpid_length,SR_no,SR
128,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,OY..,29,5808704,SR-5808704
129,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,OY..,29,5808704,SR-5808704
268,"MOCABEE, KEVIN S",2020-10-05,Installation-Fixed Price,114,0,613976.00.01.0114.000.5853692,0.5,0.0,0.5,OY..,29,5853692,SR-5853692


In [48]:
# delete SR_no column (helper used only to build SR).
# axis is passed by keyword: the positional form drop(label, 1) is rejected by
# pandas >= 2.0, while axis=1 works on old and new versions alike.
df29 = df29.drop('SR_no', axis=1)
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,OptYr,tpid_length,SR
128,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,OY..,29,SR-5808704
129,"TIMMONS, CARL A",2020-10-08,Installation-Complex I,112,200,613976.00.01.0112.200.5808704,8.0,0.0,8.0,OY..,29,SR-5808704
268,"MOCABEE, KEVIN S",2020-10-05,Installation-Fixed Price,114,0,613976.00.01.0114.000.5853692,0.5,0.0,0.5,OY..,29,SR-5853692


In [49]:
# standardize column order: give both subsets the same column sequence
# so they line up when stacked in the next cell
df21 = df21.loc[:, [
    'User', 'Work Date', 'Description', 'CLIN', 'WBS', 'TPID',
    'Straight', 'Overtime', 'Total', 'SR', 'OptYr', 'tpid_length',
]]
df29 = df29.loc[:, [
    'User', 'Work Date', 'Description', 'CLIN', 'WBS', 'TPID',
    'Straight', 'Overtime', 'Total', 'SR', 'OptYr', 'tpid_length',
]]

In [50]:
# stack the two subsets vertically (row-wise is concat's default axis);
# rows keep the index labels they had before the split, df21 rows first
df = pandas.concat([df21, df29])

In [51]:
df21.shape

(4582, 12)

In [52]:
df29.shape

(826, 12)

In [53]:
# check columns
df.shape

(5408, 12)

In [54]:
# sanity check: the combined frame has exactly as many rows as the two
# subsets together, so this difference should be zero
len(df) - len(df21) - len(df29)

0

In [55]:
# memory management: the subsets are merged into df, so unbind both
del df21, df29

In [56]:
gc.collect()

222

In [57]:
# write data to csv (default is U:\Documents)
# df.to_csv('AP2 Regular & OT Hours for Adv by CCC_#505ap-modified.csv')

In [58]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5408 entries, 0 to 5406
Data columns (total 12 columns):
User           5408 non-null object
Work Date      5408 non-null object
Description    5408 non-null object
CLIN           5408 non-null object
WBS            5408 non-null object
TPID           5408 non-null object
Straight       5408 non-null float64
Overtime       5408 non-null float64
Total          5408 non-null float64
SR             5408 non-null object
OptYr          5408 non-null object
tpid_length    5408 non-null int64
dtypes: float64(3), int64(1), object(8)
memory usage: 549.2+ KB


In [59]:
# data check
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
2,"SCHNEIDER, LAURA E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21


In [60]:
# tip 1
# create dataframe with mixed data types (float, float, string, datetime).
# Built explicitly: pandas.util.testing.makeMixedDataFrame() was a private test
# helper and is not available in current pandas releases.
pandas.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
    'D': pandas.bdate_range('2009-01-01', periods=5),  # five consecutive business days
}, columns=['A', 'B', 'C', 'D'])

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [61]:
# tip 2
# insert a new column which is a function of another column.
# The sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# was a private test helper and is not available in current pandas releases.
dftest = pandas.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
    'D': pandas.bdate_range('2009-01-01', periods=5),
}, columns=['A', 'B', 'C', 'D'])
# A2 = 2 * A, placed at zero-based position 3 (between C and D)
dftest.insert(3, 'A2', dftest['A'] * 2)
dftest

Unnamed: 0,A,B,C,A2,D
0,0.0,0.0,foo1,0.0,2009-01-01
1,1.0,1.0,foo2,2.0,2009-01-02
2,2.0,0.0,foo3,4.0,2009-01-05
3,3.0,1.0,foo4,6.0,2009-01-06
4,4.0,0.0,foo5,8.0,2009-01-07


In [62]:
# tip 3
# insert a new column with a constant value at zero-based position 3
# (i.e., as the fourth column).
# The sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# was a private test helper and is not available in current pandas releases.
dftest = pandas.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
    'D': pandas.bdate_range('2009-01-01', periods=5),
}, columns=['A', 'B', 'C', 'D'])
dftest.insert(3, 'Country', 'USA')
dftest

Unnamed: 0,A,B,C,Country,D
0,0.0,0.0,foo1,USA,2009-01-01
1,1.0,1.0,foo2,USA,2009-01-02
2,2.0,0.0,foo3,USA,2009-01-05
3,3.0,1.0,foo4,USA,2009-01-06
4,4.0,0.0,foo5,USA,2009-01-07


In [63]:
# tip 4
# change column names.
# The sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# was a private test helper and is not available in current pandas releases.
dftest = pandas.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
    'D': pandas.bdate_range('2009-01-01', periods=5),
}, columns=['A', 'B', 'C', 'D'])
# rename() returns a new frame, which is what this cell displays;
# dftest itself keeps its original column names
dftest.rename(columns={'A':'ID', 'D':'Date'})

Unnamed: 0,ID,B,C,Date
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [64]:
# tip 5
# duplicate a column.
# The sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# was a private test helper and is not available in current pandas releases.
dftest = pandas.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
    'D': pandas.bdate_range('2009-01-01', periods=5),
}, columns=['A', 'B', 'C', 'D'])
# assigning to a new label appends the copy as the last column
dftest['E'] = dftest['C']
dftest

Unnamed: 0,A,B,C,D,E
0,0.0,0.0,foo1,2009-01-01,foo1
1,1.0,1.0,foo2,2009-01-02,foo2
2,2.0,0.0,foo3,2009-01-05,foo3
3,3.0,1.0,foo4,2009-01-06,foo4
4,4.0,0.0,foo5,2009-01-07,foo5


In [65]:
# tip 6
# determine dataframe datatypes.
# The sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# was a private test helper and is not available in current pandas releases.
dftest = pandas.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
    'D': pandas.bdate_range('2009-01-01', periods=5),
}, columns=['A', 'B', 'C', 'D'])
# .dtypes is a Series indexed by column name, one dtype per column
dataTypeSeries = dftest.dtypes
print(dataTypeSeries)

A           float64
B           float64
C            object
D    datetime64[ns]
dtype: object


In [66]:
# tip 7
# delete a dataframe column

# df = df.drop('column_name', axis=1)
# where axis is 0 for rows and 1 for columns; pass it by keyword, because the
# positional form drop('column_name', 1) is rejected by pandas >= 2.0.

# To delete the column without having to reassign df you can do:
# df.drop('column_name', axis=1, inplace=True)

# To drop by column number instead of by column label, try this to delete, e.g. the 1st, 2nd and 4th columns:
# df = df.drop(df.columns[[0, 1, 3]], axis=1)
# df.columns is a zero-based pandas Index

# Working with "text" syntax for the columns:
# df.drop(['column_nameA', 'column_nameB'], axis=1, inplace=True)

# The sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# was a private test helper and is not available in current pandas releases.
dftest = pandas.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
    'D': pandas.bdate_range('2009-01-01', periods=5),
}, columns=['A', 'B', 'C', 'D'])
print(dftest)
dftest = dftest.drop('C', axis=1)
print(dftest)

     A    B     C          D
0 0.00 0.00  foo1 2009-01-01
1 1.00 1.00  foo2 2009-01-02
2 2.00 0.00  foo3 2009-01-05
3 3.00 1.00  foo4 2009-01-06
4 4.00 0.00  foo5 2009-01-07
     A    B          D
0 0.00 0.00 2009-01-01
1 1.00 1.00 2009-01-02
2 2.00 0.00 2009-01-05
3 3.00 1.00 2009-01-06
4 4.00 0.00 2009-01-07


In [67]:
# tip 8
# reorder columns (flipflop column a and column d).
# The sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# was a private test helper and is not available in current pandas releases.
dftest = pandas.DataFrame({
    'A': [0.0, 1.0, 2.0, 3.0, 4.0],
    'B': [0.0, 1.0, 0.0, 1.0, 0.0],
    'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
    'D': pandas.bdate_range('2009-01-01', periods=5),
}, columns=['A', 'B', 'C', 'D'])
print(dftest)
# selecting with a list of labels returns the columns in the listed order
dftest = dftest[['D','B','C','A']]
print(dftest)

     A    B     C          D
0 0.00 0.00  foo1 2009-01-01
1 1.00 1.00  foo2 2009-01-02
2 2.00 0.00  foo3 2009-01-05
3 3.00 1.00  foo4 2009-01-06
4 4.00 0.00  foo5 2009-01-07
           D    B     C    A
0 2009-01-01 0.00  foo1 0.00
1 2009-01-02 1.00  foo2 1.00
2 2009-01-05 0.00  foo3 2.00
3 2009-01-06 1.00  foo4 3.00
4 2009-01-07 0.00  foo5 4.00


In [68]:
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
2,"SCHNEIDER, LAURA E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21


In [69]:
# delete tpid_length column (helper used only to split the rows by TPID length).
# axis is passed by keyword: the positional form drop(label, 1) is rejected by
# pandas >= 2.0, while axis=1 works on old and new versions alike.
df = df.drop('tpid_length', axis=1)
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr
0,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..
1,"SCHNEIDER, LAURA E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..
2,"SCHNEIDER, LAURA E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..


In [70]:
# write the finished frame to an Excel workbook: column headers are written,
# the row index is not
df.to_excel(
    r'U:\613976-AP2 Regular & OT Hours for Adv by CCC_#505ap-modified-20201014.xlsx',
    header=True,
    index=False
)