In [6]:
# chavez & coombs 
# cognos_data_management_13976(v3).ipynb
# 2020-08-14
# this script makes cognos-output data compatible with chavez-created analytic processes

Steps:
1. Download the Deltek data as an Excel file
2. Save 'AP2 Regular & OT Hours for Adv by CCC_#505ap.xlsx' as a CSV file (same name, .csv extension) in U:\, which is where the code below reads it from
3. Run the code below

In [7]:
# to determine conda version or conda and python versions, from a cmd window at the ">" prompt:
# > conda -V
#   conda 4.2.9
# > python --version
#   Python 3.5.2 :: Anaconda 4.2.0 (64-bit)  

In [8]:
# import modules
import pandas
import os
import sys
import gc
# import numpy
# import matplotlib
# import pylab

In [9]:
# show the directory that relative file paths will resolve against
working_dir = os.getcwd()
print(working_dir)

\\ain2\dfsroot\userdata48\willitc0\Data


"read_csv() vs read_excel() in pandas: When to use which and why" by Ashwin A. Vardhan

Have you ever wondered whether that Excel file of yours could be read faster, instead of leaving you sitting idle for 10 minutes while your code scans through it? I’ll try to answer that here with a personal experience of mine.

So, I was performing some operations with pandas on an Excel file whose dimensions were 509579 x 240 and whose size was 295 MB. Reading that file took about 528 seconds (average over 10 iterations), whereas converting it to CSV and reading it (using pandas) took just 13 seconds (again, average over 10 iterations), which is an improvement of about 40 times. As you can see, the file was huge, and read_excel is simply slower. This has been mentioned on Stack Overflow too.

So, when will you actually need to convert an Excel file to CSV before processing it? Usually, if your Excel file is small (~100,000 rows, ~50 columns), there won’t be much of a performance issue unless you need to run that process very frequently, because that small delay may get compounded and bite you! But if your Excel file is massive, and you need to process it frequently (or not), it’s better to first convert the Excel file into a CSV, and voilà! See the magic happen.

In [10]:
# read data
# general form: frame = pandas.read_csv(r'Path where the CSV file is stored\File name.csv')
# low_memory=False asks pandas to parse the file in one pass rather than in chunks
source_csv = r'U:\AP2 Regular & OT Hours for Adv by CCC_#505ap.csv'
data = pandas.read_csv(source_csv, low_memory=False)

Pandas has the following data types:
    object
    int64
    float64
    bool
    datetime64[ns]
    timedelta64[ns]
    category

In [11]:
# check metadata
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224524 entries, 0 to 224523
Data columns (total 23 columns):
User                 224524 non-null object
User Id              224524 non-null object
Work Date            224524 non-null object
Month                224524 non-null object
Description          224524 non-null object
REF ID               224524 non-null int64
CLIN                 224524 non-null object
WBS                  38891 non-null object
TASK                 224524 non-null object
TPID                 224524 non-null object
ProjNo               224524 non-null object
WB2                  224524 non-null object
Project Id           224524 non-null object
Pay Type             224524 non-null object
Corporation          224524 non-null object
Company Name         224524 non-null object
Approver             224524 non-null object
Status               224524 non-null object
Straight             224524 non-null float64
Overtime             224524 non-null float64
Total    

In [12]:
# trim data: keep only the seven columns used by the rest of the notebook
keep_columns = ['User', 'Work Date', 'Description', 'TPID', 'Straight', 'Overtime', 'Total']
df = data.reindex(columns=keep_columns)
df.head(2)

Unnamed: 0,User,Work Date,Description,TPID,Straight,Overtime,Total
0,"SCHNEIDER, LAURA E","Jul 14, 2020",Standby Only - COVID-19,13976.00.01.0111.999,0.0,0.0,0.0
1,"SCHNEIDER, LAURA E","Jul 17, 2020",Standby Only - COVID-19,13976.00.01.0111.999,0.0,0.0,0.0


In [13]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224524 entries, 0 to 224523
Data columns (total 7 columns):
User           224524 non-null object
Work Date      224524 non-null object
Description    224524 non-null object
TPID           224524 non-null object
Straight       224524 non-null float64
Overtime       224524 non-null float64
Total          224524 non-null float64
dtypes: float64(3), object(4)
memory usage: 12.0+ MB


In [14]:
# check dataframe dimensions (rows x columns)
df.shape

(224524, 7)

In [15]:
# display formatting only: show floats with two decimal places
# (affects how frames are rendered, not the stored values)
pandas.set_option('display.float_format', '{:.2f}'.format)

In [16]:
# check data types
df.dtypes

User            object
Work Date       object
Description     object
TPID            object
Straight       float64
Overtime       float64
Total          float64
dtype: object

In [17]:
# add an empty CLIN placeholder as the fourth column (position 3)
# note: re-running this cell fails because the column then already exists
df.insert(loc=3, column='CLIN', value='')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,TPID,Straight,Overtime,Total
0,"SCHNEIDER, LAURA E","Jul 14, 2020",Standby Only - COVID-19,,13976.00.01.0111.999,0.0,0.0,0.0
1,"SCHNEIDER, LAURA E","Jul 17, 2020",Standby Only - COVID-19,,13976.00.01.0111.999,0.0,0.0,0.0
2,"SCHNEIDER, LAURA E","Jul 20, 2020",Standby Only - COVID-19,,13976.00.01.0111.999,8.0,0.0,8.0


In [18]:
# add an empty WBS placeholder as the fifth column (position 4)
# note: re-running this cell fails because the column then already exists
df.insert(loc=4, column='WBS', value='')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total
0,"SCHNEIDER, LAURA E","Jul 14, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0
1,"SCHNEIDER, LAURA E","Jul 17, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0
2,"SCHNEIDER, LAURA E","Jul 20, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,8.0,0.0,8.0


In [19]:
# add an empty SR_temp placeholder at position 9 (after Total)
# note: re-running this cell fails because the column then already exists
df.insert(loc=9, column='SR_temp', value='')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp
0,"SCHNEIDER, LAURA E","Jul 14, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,
1,"SCHNEIDER, LAURA E","Jul 17, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,
2,"SCHNEIDER, LAURA E","Jul 20, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,8.0,0.0,8.0,


In [20]:
# add the option year column at position 10, filled with the constant 'OY.'
# note: re-running this cell fails because the column then already exists
df.insert(loc=10, column='OptYr', value='OY.')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E","Jul 14, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E","Jul 17, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
2,"SCHNEIDER, LAURA E","Jul 20, 2020",Standby Only - COVID-19,,,13976.00.01.0111.999,8.0,0.0,8.0,,OY.


In [21]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224524 entries, 0 to 224523
Data columns (total 11 columns):
User           224524 non-null object
Work Date      224524 non-null object
Description    224524 non-null object
CLIN           224524 non-null object
WBS            224524 non-null object
TPID           224524 non-null object
Straight       224524 non-null float64
Overtime       224524 non-null float64
Total          224524 non-null float64
SR_temp        224524 non-null object
OptYr          224524 non-null object
dtypes: float64(3), object(8)
memory usage: 18.8+ MB


In [22]:
# column a
df['User'].head(2)

0    SCHNEIDER, LAURA E
1    SCHNEIDER, LAURA E
Name: User, dtype: object

In [23]:
# column b
# parse the work dates (e.g. "Jul 14, 2020") and store them back as ISO-style
# 'YYYY-MM-DD' text; the column stays a string column, not datetime64
df['Work Date'] = (
    pandas.to_datetime(df['Work Date'])
    .dt.strftime('%Y-%m-%d')
)
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.


In [24]:
# column c
df['Description'].head(2)

0    Standby Only - COVID-19
1    Standby Only - COVID-19
Name: Description, dtype: object

In [25]:
# column d
# CLIN is the fourth dot-separated segment of TPID (segment index 3);
# the split runs on the raw frame and is matched to df through the shared row index
df['CLIN'] = data['TPID'].str.split(pat=".", expand=True)[3]
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,,13976.00.01.0111.999,0.0,0.0,0.0,,OY.


In [26]:
# column e
# WBS is the fifth dot-separated segment of TPID (segment index 4);
# the split runs on the raw frame and is matched to df through the shared row index
df['WBS'] = data['TPID'].str.split(pat=".", expand=True)[4]
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.


In [27]:
# column f
# tpid
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.


In [28]:
# saved syntax for modifying column order (at this point the SR column is still named 'SR_temp')
# df = df[['User', 'Work Date', 'Description', 'CLIN', 'WBS', 'TPID', 'Straight', 'Overtime', 'Total', 'SR_temp', 'OptYr']]

In [29]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224524 entries, 0 to 224523
Data columns (total 11 columns):
User           224524 non-null object
Work Date      224524 non-null object
Description    224524 non-null object
CLIN           224524 non-null object
WBS            224524 non-null object
TPID           224524 non-null object
Straight       224524 non-null float64
Overtime       224524 non-null float64
Total          224524 non-null float64
SR_temp        224524 non-null object
OptYr          224524 non-null object
dtypes: float64(3), object(8)
memory usage: 18.8+ MB


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224524 entries, 0 to 224523
Data columns (total 23 columns):
User                 224524 non-null object
User Id              224524 non-null object
Work Date            224524 non-null object
Month                224524 non-null object
Description          224524 non-null object
REF ID               224524 non-null int64
CLIN                 224524 non-null object
WBS                  38891 non-null object
TASK                 224524 non-null object
TPID                 224524 non-null object
ProjNo               224524 non-null object
WB2                  224524 non-null object
Project Id           224524 non-null object
Pay Type             224524 non-null object
Corporation          224524 non-null object
Company Name         224524 non-null object
Approver             224524 non-null object
Status               224524 non-null object
Straight             224524 non-null float64
Overtime             224524 non-null float64
Total    

In [31]:
# report the size in bytes, as given by sys.getsizeof, of the two frames and two sample columns
# (sizes are collected inside a comprehension so no extra reference to the data lingers)
sizes = [
    (label, sys.getsizeof(obj))
    for label, obj in (
        ('data', data),
        ('df', df),
        ('Straight', df['Straight']),
        ('TPID', df['TPID']),
    )
]
for label, nbytes in sizes:
    print(label, nbytes)

data 287816297
df 124849967
Straight 1796296
TPID 17599580


In [32]:
# the raw frame is no longer needed; drop the name so its memory can be reclaimed
del data

In [33]:
# saved syntax for deleting a dataframe column (pass axis by keyword)
# df = df.drop('Company Name', axis=1)

In [34]:
# run a garbage-collection pass now that the raw dataframe has been deleted
# (the number shown below the cell is gc.collect()'s return value)
gc.collect()

61

In [35]:
# add a column holding the number of characters in each TPID
df['tpid_length'] = df.TPID.str.len()
df.tpid_length.head(3)

0    20
1    20
2    20
Name: tpid_length, dtype: int64

In [36]:
# preview the tpid_length values of rows whose TPID is 28 characters long
df.loc[df['tpid_length'] == 28, 'tpid_length'].head(3)

2338    28
2339    28
2340    28
Name: tpid_length, dtype: int64

In [37]:
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20


In [38]:
# preview rows whose TPID is not 28 characters long
df.loc[df['tpid_length'] != 28].head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,,OY.,20


In [39]:
# df20: every row whose TPID is NOT 28 characters long
# (the name reflects the 20-character TPIDs seen in the previews)
is_long_tpid = df['tpid_length'] == 28
df20 = df[~is_long_tpid]
df20.shape

(185633, 12)

In [40]:
df20.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,,OY.,20


In [41]:
# df28: rows whose TPID is 28 characters long; in the previewed rows these
# carry a sixth dot-separated segment, which later cells turn into the SR number
# .copy() gives df28 its own data, so the column assignments made to it in
# later cells do not raise SettingWithCopyWarning
df28 = df[df['tpid_length'] == 28].copy()
df28.shape

(38891, 12)

In [42]:
# the combined frame has been split into df20 and df28; release it and collect
del df
gc.collect()

461

In [43]:
# SR column: rows in df20 all get the constant placeholder 'SR-0000000',
# inserted at position 10 (right after SR_temp)
df20.insert(loc=10, column='SR', value='SR-0000000')
df20.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,SR-0000000,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,,SR-0000000,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,,SR-0000000,OY.,20


In [44]:
# delete SR_temp column
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by newer pandas releases
df20 = df20.drop('SR_temp', axis=1)
df20.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,SR-0000000,OY.,20


In [45]:
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length
2338,"TIMMONS, CARL A",2019-09-23,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,OY.,28
2339,"TIMMONS, CARL A",2019-09-24,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,OY.,28
2340,"TIMMONS, CARL A",2019-09-27,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,OY.,28


In [46]:
# the SR number is the sixth dot-separated segment of TPID (segment index 5)
df28['SR_no'] = df28['TPID'].str.split(pat=".", expand=True)[5]
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,OptYr,tpid_length,SR_no
2338,"TIMMONS, CARL A",2019-09-23,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,OY.,28,5262550
2339,"TIMMONS, CARL A",2019-09-24,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,OY.,28,5262550
2340,"TIMMONS, CARL A",2019-09-27,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,OY.,28,5262550


In [47]:
# helper column holding the constant 'SR-' prefix, inserted at position 10
df28.insert(loc=10, column='SR_prefix', value='SR-')
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR_prefix,OptYr,tpid_length,SR_no
2338,"TIMMONS, CARL A",2019-09-23,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,SR-,OY.,28,5262550
2339,"TIMMONS, CARL A",2019-09-24,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,SR-,OY.,28,5262550
2340,"TIMMONS, CARL A",2019-09-27,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,SR-,OY.,28,5262550


In [48]:
# build the final SR label by joining prefix and number, e.g. 'SR-' + '5262550'
df28['SR'] = df28.SR_prefix + df28.SR_no
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_temp,SR_prefix,OptYr,tpid_length,SR_no,SR
2338,"TIMMONS, CARL A",2019-09-23,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,SR-,OY.,28,5262550,SR-5262550
2339,"TIMMONS, CARL A",2019-09-24,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,SR-,OY.,28,5262550,SR-5262550
2340,"TIMMONS, CARL A",2019-09-27,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,,SR-,OY.,28,5262550,SR-5262550


In [49]:
# delete SR_temp column
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by newer pandas releases
df28 = df28.drop('SR_temp', axis=1)
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR_prefix,OptYr,tpid_length,SR_no,SR
2338,"TIMMONS, CARL A",2019-09-23,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,SR-,OY.,28,5262550,SR-5262550
2339,"TIMMONS, CARL A",2019-09-24,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,SR-,OY.,28,5262550,SR-5262550
2340,"TIMMONS, CARL A",2019-09-27,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,SR-,OY.,28,5262550,SR-5262550


In [50]:
# delete SR_prefix column (only needed to build SR)
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by newer pandas releases
df28 = df28.drop('SR_prefix', axis=1)
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,OptYr,tpid_length,SR_no,SR
2338,"TIMMONS, CARL A",2019-09-23,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,OY.,28,5262550,SR-5262550
2339,"TIMMONS, CARL A",2019-09-24,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,OY.,28,5262550,SR-5262550
2340,"TIMMONS, CARL A",2019-09-27,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,OY.,28,5262550,SR-5262550


In [51]:
# delete SR_no column (only needed to build SR)
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by newer pandas releases
df28 = df28.drop('SR_no', axis=1)
df28.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,OptYr,tpid_length,SR
2338,"TIMMONS, CARL A",2019-09-23,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,OY.,28,SR-5262550
2339,"TIMMONS, CARL A",2019-09-24,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,OY.,28,SR-5262550
2340,"TIMMONS, CARL A",2019-09-27,Installation-Fixed Price,14,0,13976.00.00.0014.000.5262550,8.0,0.0,8.0,OY.,28,SR-5262550


In [52]:
# put both halves into the same column order so they can be stacked
final_columns = [
    'User', 'Work Date', 'Description', 'CLIN', 'WBS', 'TPID',
    'Straight', 'Overtime', 'Total', 'SR', 'OptYr', 'tpid_length',
]
df20 = df20[final_columns]
df28 = df28[final_columns]

In [53]:
# stack the two halves vertically (df20 rows first, then df28 rows);
# each row keeps its original index label, so the index is no longer in order
df = pandas.concat([df20, df28])

In [54]:
df20.shape

(185633, 12)

In [55]:
df28.shape

(38891, 12)

In [56]:
# check columns
df.shape

(224524, 12)

In [57]:
# the two halves now live in df; release their names
del df20, df28

In [58]:
# run a garbage-collection pass after deleting df20 and df28
# (the number shown below the cell is gc.collect()'s return value)
gc.collect()

277

In [59]:
# write data to csv
# a bare file name is written to the current working directory (printed near the top of the notebook)
# as written, the file includes the row index and the helper column tpid_length,
# which is only dropped in a later cell
modified_csv_name = 'AP2 Regular & OT Hours for Adv by CCC_#505ap-modified.csv'
df.to_csv(modified_csv_name)

In [60]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 224524 entries, 0 to 224482
Data columns (total 12 columns):
User           224524 non-null object
Work Date      224524 non-null object
Description    224524 non-null object
CLIN           224524 non-null object
WBS            224524 non-null object
TPID           224524 non-null object
Straight       224524 non-null float64
Overtime       224524 non-null float64
Total          224524 non-null float64
SR             224524 non-null object
OptYr          224524 non-null object
tpid_length    224524 non-null int64
dtypes: float64(3), int64(1), object(8)
memory usage: 22.3+ MB


In [61]:
# data check
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,SR-0000000,OY.,20


In [62]:
# tip 1
# create dataframe with mixed data types (two float columns, one string, one datetime)
# built explicitly: pandas.util.testing.makeMixedDataFrame() is deprecated and
# no longer available in newer pandas releases
dftest = pandas.DataFrame(
    {
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pandas.bdate_range('2009-01-01', periods=5),
    },
    columns=['A', 'B', 'C', 'D'],
)
dftest

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [63]:
# tip 2
# insert a new column which is a function of another column
# the sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# is deprecated and no longer available in newer pandas releases
dftest = pandas.DataFrame(
    {
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pandas.bdate_range('2009-01-01', periods=5),
    },
    columns=['A', 'B', 'C', 'D'],
)
# A2 = 2 * A, placed at column position 3 (between C and D)
dftest.insert(3, 'A2', dftest['A'] * 2)
dftest

Unnamed: 0,A,B,C,A2,D
0,0.0,0.0,foo1,0.0,2009-01-01
1,1.0,1.0,foo2,2.0,2009-01-02
2,2.0,0.0,foo3,4.0,2009-01-05
3,3.0,1.0,foo4,6.0,2009-01-06
4,4.0,0.0,foo5,8.0,2009-01-07


In [64]:
# tip 3
# insert a new column with a constant value at column position 3 (i.e., as the fourth column)
# the sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# is deprecated and no longer available in newer pandas releases
dftest = pandas.DataFrame(
    {
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pandas.bdate_range('2009-01-01', periods=5),
    },
    columns=['A', 'B', 'C', 'D'],
)
dftest.insert(3, 'Country', 'USA')
dftest

Unnamed: 0,A,B,C,Country,D
0,0.0,0.0,foo1,USA,2009-01-01
1,1.0,1.0,foo2,USA,2009-01-02
2,2.0,0.0,foo3,USA,2009-01-05
3,3.0,1.0,foo4,USA,2009-01-06
4,4.0,0.0,foo5,USA,2009-01-07


In [65]:
# tip 4
# change column names
# the sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# is deprecated and no longer available in newer pandas releases
dftest = pandas.DataFrame(
    {
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pandas.bdate_range('2009-01-01', periods=5),
    },
    columns=['A', 'B', 'C', 'D'],
)
# rename returns a new frame; dftest itself keeps its original column names
dftest_renamed = dftest.rename(columns={'A':'ID', 'D':'Date'})
dftest_renamed

Unnamed: 0,ID,B,C,Date
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [66]:
# tip 5
# duplicate a column
# the sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# is deprecated and no longer available in newer pandas releases
dftest = pandas.DataFrame(
    {
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pandas.bdate_range('2009-01-01', periods=5),
    },
    columns=['A', 'B', 'C', 'D'],
)
# assigning to a new name appends the copy as the last column
dftest['E'] = dftest['C']
dftest

Unnamed: 0,A,B,C,D,E
0,0.0,0.0,foo1,2009-01-01,foo1
1,1.0,1.0,foo2,2009-01-02,foo2
2,2.0,0.0,foo3,2009-01-05,foo3
3,3.0,1.0,foo4,2009-01-06,foo4
4,4.0,0.0,foo5,2009-01-07,foo5


In [67]:
# tip 6
# determine dataframe datatypes
# the sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# is deprecated and no longer available in newer pandas releases
dftest = pandas.DataFrame(
    {
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pandas.bdate_range('2009-01-01', periods=5),
    },
    columns=['A', 'B', 'C', 'D'],
)
# .dtypes is a Series mapping each column name to its dtype
dataTypeSeries = dftest.dtypes
print(dataTypeSeries)

A           float64
B           float64
C            object
D    datetime64[ns]
dtype: object


In [68]:
# tip 7
# delete a dataframe column

# df = df.drop('column_name', axis=1)
# where axis is 0 for rows and 1 for columns; pass it by keyword, because the
# positional form drop('column_name', 1) is rejected by newer pandas releases

# To delete the column without having to reassign df you can do:
# df.drop('column_name', axis=1, inplace=True)

# To drop by column number instead of by column label, try this to delete, e.g. the 1st, 2nd and 4th columns:
# df = df.drop(df.columns[[0, 1, 3]], axis=1)
# df.columns is a zero-based pandas Index

# Working with "text" syntax for the columns:
# df.drop(['column_nameA', 'column_nameB'], axis=1, inplace=True)

# the sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# is deprecated and no longer available in newer pandas releases
dftest = pandas.DataFrame(
    {
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pandas.bdate_range('2009-01-01', periods=5),
    },
    columns=['A', 'B', 'C', 'D'],
)
print(dftest)
dftest = dftest.drop('C', axis=1)
print(dftest)

     A    B     C          D
0 0.00 0.00  foo1 2009-01-01
1 1.00 1.00  foo2 2009-01-02
2 2.00 0.00  foo3 2009-01-05
3 3.00 1.00  foo4 2009-01-06
4 4.00 0.00  foo5 2009-01-07
     A    B          D
0 0.00 0.00 2009-01-01
1 1.00 1.00 2009-01-02
2 2.00 0.00 2009-01-05
3 3.00 1.00 2009-01-06
4 4.00 0.00 2009-01-07


In [69]:
# tip 8
# reorder columns (swap the positions of column A and column D)
# the sample frame is built explicitly: pandas.util.testing.makeMixedDataFrame()
# is deprecated and no longer available in newer pandas releases
dftest = pandas.DataFrame(
    {
        'A': [0.0, 1.0, 2.0, 3.0, 4.0],
        'B': [0.0, 1.0, 0.0, 1.0, 0.0],
        'C': ['foo1', 'foo2', 'foo3', 'foo4', 'foo5'],
        'D': pandas.bdate_range('2009-01-01', periods=5),
    },
    columns=['A', 'B', 'C', 'D'],
)
print(dftest)
# selecting with a list of labels returns the columns in the listed order
dftest = dftest[['D','B','C','A']]
print(dftest)

     A    B     C          D
0 0.00 0.00  foo1 2009-01-01
1 1.00 1.00  foo2 2009-01-02
2 2.00 0.00  foo3 2009-01-05
3 3.00 1.00  foo4 2009-01-06
4 4.00 0.00  foo5 2009-01-07
           D    B     C    A
0 2009-01-01 0.00  foo1 0.00
1 2009-01-02 1.00  foo2 1.00
2 2009-01-05 0.00  foo3 2.00
3 2009-01-06 1.00  foo4 3.00
4 2009-01-07 0.00  foo5 4.00


In [70]:
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr,tpid_length
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.,20
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,SR-0000000,OY.,20


In [71]:
# delete tpid_length column (helper used only to split the rows by TPID length)
# axis is passed by keyword: the positional form drop(label, 1) is deprecated
# and rejected by newer pandas releases
df = df.drop('tpid_length', axis=1)
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,TPID,Straight,Overtime,Total,SR,OptYr
0,"SCHNEIDER, LAURA E",2020-07-14,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.
1,"SCHNEIDER, LAURA E",2020-07-17,Standby Only - COVID-19,111,999,13976.00.01.0111.999,0.0,0.0,0.0,SR-0000000,OY.
2,"SCHNEIDER, LAURA E",2020-07-20,Standby Only - COVID-19,111,999,13976.00.01.0111.999,8.0,0.0,8.0,SR-0000000,OY.


In [72]:
# write the finished frame to Excel, with a header row and without the row index
output_xlsx = r'U:\AP2 Regular & OT Hours for Adv by CCC_#505ap-modified.xlsx'
df.to_excel(output_xlsx, index=False, header=True)