In [24]:
# chavez & coombs 
# cognos_data_management_613976(v1).ipynb
# 2021-01-04
# this script makes cognos-output data compatible with chavez-created analytic processes

Steps:
1. Download the deltek data as an Excel file
2. Save 'AP2 Regular & OT Hours for Adv by CCC_#505ap.xlsx' as a CSV file
3. Run the code below

In [25]:
# to determine conda version or conda and python versions, from a cmd window at the ">" prompt:
# > conda -V
#   conda 4.2.9
# > python --version
#   Python 3.5.2 :: Anaconda 4.2.0 (64-bit)  

In [26]:
# import modules
import pandas
import os
import sys
import gc
# import numpy
# import matplotlib
# import pylab

In [27]:
# current working directory
print(os.getcwd())

\\ain2\dfsroot\userdata48\willitc0\Data


"read_csv() vs read_excel() in pandas: When to use which and why" by Ashwin A. Vardhan

Have you ever wondered if that excel file that you have, can be made to read faster instead of sitting idle for 10 minutes while your code scans through it? I’ll try to answer it here with a personal experience of mine.

So, I was performing some operations on an excel file using pandas, whose dimensions were 509579 x 240, and it had a size of 295 MB. However, reading that file took about 528 seconds (average over 10 iterations), whereas, on converting it to csv and reading it (using pandas) took just 13 seconds (again, average over 10 iterations), which is an improvement of about 40 times. As you can observe, the file was too huge, and read_excel is just slower in performance. This has been mentioned on stackoverflow too.

So, when will you actually need to convert an excel file to csv before processing it? Usually, if your excel file is small (~100,000 rows, ~50 columns), there won’t be much of a performance issue, unless, you need to run that process very frequently, because, that small delay may get compunded and bite you! But, if your excel file is massive, and you need to process it frequently (or not), it’s better to first convert the excel into a csv, and voila! see the magic happen.

In [28]:
# read data
# df = pd.read_csv (r'Path where the CSV file is stored\File name.csv')
data = pandas.read_csv (r'U:\613976 Regular & OT Hours for Adv by CCC_#505 including Project_Id 122920.csv', low_memory=False)

Pandas has the following data types:
    object
    int64
    float64
    bool
    datetime64
    timedelta[ns]
    category

In [29]:
# check metadata
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35924 entries, 0 to 35923
Data columns (total 23 columns):
User                         35924 non-null object
User ID                      35924 non-null object
Work Date                    35924 non-null object
Timesheet Period End Date    35924 non-null object
WBS                          0 non-null float64
Description                  35924 non-null object
Timesheet Class              35924 non-null object
Project ID                   35924 non-null object
Pay Type                     35924 non-null object
Vendor ID                    9490 non-null object
Corporation                  35924 non-null object
Org ID                       35924 non-null object
Company Name                 35924 non-null object
SUP_EMPL_GROUP_CD            0 non-null float64
SUP_EMPL_GROUP_DESC          0 non-null float64
MGR_EMPL_GROUP_CD            34660 non-null object
MGR_EMPL_GROUP_DESC          34660 non-null object
Status                       35924

In [30]:
# trim data
df = pandas.DataFrame(data, columns= ['User','Work Date', 'Description', 'Project ID', 'Straight', 'Overtime', 'Total'])
df.head(2)

Unnamed: 0,User,Work Date,Description,Project ID,Straight,Overtime,Total
0,"Schneider, Laura E",10-6-20,Software Services,613976.00.01.0111.509,10.0,0.0,10.0
1,"Schneider, Laura E",10-7-20,Software Services,613976.00.01.0111.509,10.0,0.0,10.0


In [31]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35924 entries, 0 to 35923
Data columns (total 7 columns):
User           35924 non-null object
Work Date      35924 non-null object
Description    35924 non-null object
Project ID     35924 non-null object
Straight       35924 non-null float64
Overtime       35924 non-null float64
Total          35924 non-null float64
dtypes: float64(3), object(4)
memory usage: 1.9+ MB


In [32]:
# check dataframe dimensions (rows x columns)
df.shape

(35924, 7)

In [33]:
# format data
# round floating numbers to two decimal places in python pandas 
pandas.options.display.float_format = '{:.2f}'.format

In [34]:
# check data types
df.dtypes

User            object
Work Date       object
Description     object
Project ID      object
Straight       float64
Overtime       float64
Total          float64
dtype: object

In [35]:
# add clin placeholder column
df.insert(3, 'CLIN', '')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,Project ID,Straight,Overtime,Total
0,"Schneider, Laura E",10-6-20,Software Services,,613976.00.01.0111.509,10.0,0.0,10.0
1,"Schneider, Laura E",10-7-20,Software Services,,613976.00.01.0111.509,10.0,0.0,10.0
2,"Schneider, Laura E",10-8-20,Software Services,,613976.00.01.0111.509,10.0,0.0,10.0


In [36]:
# add wbs placeholder column
df.insert(4, 'WBS', '')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,Project ID,Straight,Overtime,Total
0,"Schneider, Laura E",10-6-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0
1,"Schneider, Laura E",10-7-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0
2,"Schneider, Laura E",10-8-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0


In [37]:
# add SR placeholder column
df.insert(9, 'SR_temp', '')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,Project ID,Straight,Overtime,Total,SR_temp
0,"Schneider, Laura E",10-6-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,
1,"Schneider, Laura E",10-7-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,
2,"Schneider, Laura E",10-8-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,


In [38]:
# add option year column
df.insert(10, 'OptYr', 'OY..')
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,Project ID,Straight,Overtime,Total,SR_temp,OptYr
0,"Schneider, Laura E",10-6-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"Schneider, Laura E",10-7-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
2,"Schneider, Laura E",10-8-20,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [39]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35924 entries, 0 to 35923
Data columns (total 11 columns):
User           35924 non-null object
Work Date      35924 non-null object
Description    35924 non-null object
CLIN           35924 non-null object
WBS            35924 non-null object
Project ID     35924 non-null object
Straight       35924 non-null float64
Overtime       35924 non-null float64
Total          35924 non-null float64
SR_temp        35924 non-null object
OptYr          35924 non-null object
dtypes: float64(3), object(8)
memory usage: 3.0+ MB


In [40]:
# column a
df['User'].head(2)

0    Schneider, Laura E
1    Schneider, Laura E
Name: User, dtype: object

In [41]:
# column b
# modify workdate format
df['Work Date'] = pandas.to_datetime(df['Work Date']).dt.strftime('%Y-%m-%d')
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,Project ID,Straight,Overtime,Total,SR_temp,OptYr
0,"Schneider, Laura E",2020-10-06,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"Schneider, Laura E",2020-10-07,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [42]:
# column c
df['Description'].head(2)

0    Software Services
1    Software Services
Name: Description, dtype: object

In [43]:
# relabel Project ID to id
df = df.rename(columns={'Project ID': 'ProjectID'})

In [44]:
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr
0,"Schneider, Laura E",2020-10-06,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"Schneider, Laura E",2020-10-07,Software Services,,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [46]:
# column d
# get clin
df['CLIN'] = df.ProjectID.str.split(".", expand=True,)[3]
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr
0,"Schneider, Laura E",2020-10-06,Software Services,111,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"Schneider, Laura E",2020-10-07,Software Services,111,,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [47]:
# column e
# get wbs
df['WBS'] = df.ProjectID.str.split(".", expand=True,)[4]
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [48]:
# column f
# ProjectID
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..


In [49]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35924 entries, 0 to 35923
Data columns (total 11 columns):
User           35924 non-null object
Work Date      35924 non-null object
Description    35924 non-null object
CLIN           35924 non-null object
WBS            35924 non-null object
ProjectID      35924 non-null object
Straight       35924 non-null float64
Overtime       35924 non-null float64
Total          35924 non-null float64
SR_temp        35924 non-null object
OptYr          35924 non-null object
dtypes: float64(3), object(8)
memory usage: 3.0+ MB


In [50]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35924 entries, 0 to 35923
Data columns (total 23 columns):
User                         35924 non-null object
User ID                      35924 non-null object
Work Date                    35924 non-null object
Timesheet Period End Date    35924 non-null object
WBS                          0 non-null float64
Description                  35924 non-null object
Timesheet Class              35924 non-null object
Project ID                   35924 non-null object
Pay Type                     35924 non-null object
Vendor ID                    9490 non-null object
Corporation                  35924 non-null object
Org ID                       35924 non-null object
Company Name                 35924 non-null object
SUP_EMPL_GROUP_CD            0 non-null float64
SUP_EMPL_GROUP_DESC          0 non-null float64
MGR_EMPL_GROUP_CD            34660 non-null object
MGR_EMPL_GROUP_DESC          34660 non-null object
Status                       35924

In [53]:
# what size are some of the objects?
print('data', sys.getsizeof(data))
print('df', sys.getsizeof(df))

data 41848945
df 20067149


In [54]:
# delete original dataframe to free up memory
del[data]

In [55]:
gc.collect()

497

In [58]:
# add ProjectID character count column
df['id_length'] = df['ProjectID'].str.len()
df.id_length.head(2)

0    21
1    21
Name: id_length, dtype: int64

In [59]:
df.id_length.unique()

array([21, 29], dtype=int64)

In [60]:
# where id_length = 29
df['id_length'][df['id_length'] == 29].head(3)

83    29
84    29
85    29
Name: id_length, dtype: int64

In [61]:
df.head(2)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr,id_length
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21


In [62]:
# where tpid_length != 29
df[df['id_length'] != 29].head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr,id_length
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
2,"Schneider, Laura E",2020-10-08,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21


In [63]:
# filter on id_length
df21 = df[df['id_length'] != 29]
df21.shape

(29615, 12)

In [64]:
df21.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr,id_length
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21
2,"Schneider, Laura E",2020-10-08,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,OY..,21


In [65]:
# filter on id_length
df29 = df[df['id_length'] == 29]
df29.shape

(6309, 12)

In [66]:
# check size (should equal zero)
df.shape[0] - (df21.shape[0] + df29.shape[0])

0

In [67]:
# memory management
del[df]
gc.collect()

437

In [68]:
# SR column
df21.insert(10, 'SR', 'SR-0000000')
df21.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,SR,OptYr,id_length
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,SR-0000000,OY..,21
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,SR-0000000,OY..,21
2,"Schneider, Laura E",2020-10-08,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,,SR-0000000,OY..,21


In [69]:
# delete SR_temp column
df21 = df21.drop('SR_temp', 1)
df21.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR,OptYr,id_length
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
2,"Schneider, Laura E",2020-10-08,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21


In [70]:
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr,id_length
83,"Cherry, Christopher H",2020-10-19,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,OY..,29
84,"Cherry, Christopher H",2020-10-20,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,OY..,29
85,"Cherry, Christopher H",2020-10-21,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,OY..,29


In [71]:
# get sr from tpid
df29['SR_no'] = df29.ProjectID.str.split(".", expand=True,)[5]
df29.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,OptYr,id_length,SR_no
83,"Cherry, Christopher H",2020-10-19,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,OY..,29,5838381
84,"Cherry, Christopher H",2020-10-20,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,OY..,29,5838381
85,"Cherry, Christopher H",2020-10-21,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,OY..,29,5838381


In [72]:
df29.insert(10, 'SR_prefix', 'SR-')
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,SR_prefix,OptYr,id_length,SR_no
83,"Cherry, Christopher H",2020-10-19,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,SR-,OY..,29,5838381
84,"Cherry, Christopher H",2020-10-20,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,SR-,OY..,29,5838381
85,"Cherry, Christopher H",2020-10-21,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,SR-,OY..,29,5838381


In [73]:
df29['SR'] = df29['SR_prefix'] + df29['SR_no']
df29.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_temp,SR_prefix,OptYr,id_length,SR_no,SR
83,"Cherry, Christopher H",2020-10-19,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,SR-,OY..,29,5838381,SR-5838381
84,"Cherry, Christopher H",2020-10-20,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,SR-,OY..,29,5838381,SR-5838381
85,"Cherry, Christopher H",2020-10-21,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,,SR-,OY..,29,5838381,SR-5838381


In [74]:
# delete SR_temp column
df29 = df29.drop('SR_temp', 1)
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR_prefix,OptYr,id_length,SR_no,SR
83,"Cherry, Christopher H",2020-10-19,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,SR-,OY..,29,5838381,SR-5838381
84,"Cherry, Christopher H",2020-10-20,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,SR-,OY..,29,5838381,SR-5838381
85,"Cherry, Christopher H",2020-10-21,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,SR-,OY..,29,5838381,SR-5838381


In [75]:
# delete SR_prefix column
df29 = df29.drop('SR_prefix', 1)
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,OptYr,id_length,SR_no,SR
83,"Cherry, Christopher H",2020-10-19,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,OY..,29,5838381,SR-5838381
84,"Cherry, Christopher H",2020-10-20,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,OY..,29,5838381,SR-5838381
85,"Cherry, Christopher H",2020-10-21,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,OY..,29,5838381,SR-5838381


In [76]:
# delete SR_no column
df29 = df29.drop('SR_no', 1)
df29.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,OptYr,id_length,SR
83,"Cherry, Christopher H",2020-10-19,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,OY..,29,SR-5838381
84,"Cherry, Christopher H",2020-10-20,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,OY..,29,SR-5838381
85,"Cherry, Christopher H",2020-10-21,Installation-Fixed Price,114,0,613976.00.01.0114.000.5838381,8.0,0.0,8.0,OY..,29,SR-5838381


In [77]:
# standardize column order
df21 = df21[['User', 'Work Date', 'Description', 'CLIN', 'WBS', 'ProjectID', 'Straight', 'Overtime', 'Total', 'SR', 'OptYr', 'id_length']]
df29 = df29[['User', 'Work Date', 'Description', 'CLIN', 'WBS', 'ProjectID', 'Straight', 'Overtime', 'Total', 'SR', 'OptYr', 'id_length']]

In [78]:
# concatenate vertical dataframes
df = pandas.concat([df21, df29], axis=0)

In [79]:
df21.shape

(29615, 12)

In [80]:
df29.shape

(6309, 12)

In [81]:
# check columns
df.shape

(35924, 12)

In [82]:
# check size (should equal zero)
df.shape[0] - (df21.shape[0] + df29.shape[0])

0

In [83]:
# memory management
del[df21, df29]

In [84]:
gc.collect()

274

In [85]:
# write data to csv (default is U:\Documents)
# df.to_csv('AP2 Regular & OT Hours for Adv by CCC_#505ap-modified.csv')

In [86]:
# check metadata
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35924 entries, 0 to 35854
Data columns (total 12 columns):
User           35924 non-null object
Work Date      35924 non-null object
Description    35924 non-null object
CLIN           35924 non-null object
WBS            35924 non-null object
ProjectID      35924 non-null object
Straight       35924 non-null float64
Overtime       35924 non-null float64
Total          35924 non-null float64
SR             35924 non-null object
OptYr          35924 non-null object
id_length      35924 non-null int64
dtypes: float64(3), int64(1), object(8)
memory usage: 3.6+ MB


In [87]:
# data check
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR,OptYr,id_length
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
2,"Schneider, Laura E",2020-10-08,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21


In [88]:
# tip 1
# create dataframe with mixed data types
pandas.util.testing.makeMixedDataFrame()

Unnamed: 0,A,B,C,D
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [89]:
# tip 2
# insert a new column which is a function of another column
dftest = pandas.util.testing.makeMixedDataFrame()
#print(dftest)
dftest.insert(3, 'A2', dftest['A'] * 2)
dftest

Unnamed: 0,A,B,C,A2,D
0,0.0,0.0,foo1,0.0,2009-01-01
1,1.0,1.0,foo2,2.0,2009-01-02
2,2.0,0.0,foo3,4.0,2009-01-05
3,3.0,1.0,foo4,6.0,2009-01-06
4,4.0,0.0,foo5,8.0,2009-01-07


In [90]:
# tip 3
# insert a new column with a constant value in the column with index 3 (i.e., 4)
dftest = pandas.util.testing.makeMixedDataFrame()
dftest.insert(3, 'Country', 'USA')
dftest

Unnamed: 0,A,B,C,Country,D
0,0.0,0.0,foo1,USA,2009-01-01
1,1.0,1.0,foo2,USA,2009-01-02
2,2.0,0.0,foo3,USA,2009-01-05
3,3.0,1.0,foo4,USA,2009-01-06
4,4.0,0.0,foo5,USA,2009-01-07


In [91]:
# tip 4
# change column names
dftest = pandas.util.testing.makeMixedDataFrame()
dftest
dftest.rename(columns={'A':'ID', 'D':'Date'})

Unnamed: 0,ID,B,C,Date
0,0.0,0.0,foo1,2009-01-01
1,1.0,1.0,foo2,2009-01-02
2,2.0,0.0,foo3,2009-01-05
3,3.0,1.0,foo4,2009-01-06
4,4.0,0.0,foo5,2009-01-07


In [92]:
# tip 5
# duplicate a column
dftest = pandas.util.testing.makeMixedDataFrame()
dftest['E'] = dftest['C']
dftest

Unnamed: 0,A,B,C,D,E
0,0.0,0.0,foo1,2009-01-01,foo1
1,1.0,1.0,foo2,2009-01-02,foo2
2,2.0,0.0,foo3,2009-01-05,foo3
3,3.0,1.0,foo4,2009-01-06,foo4
4,4.0,0.0,foo5,2009-01-07,foo5


In [93]:
# tip 6
# determine dataframe datatypes
dftest = pandas.util.testing.makeMixedDataFrame()
dataTypeSeries = dftest.dtypes
print(dataTypeSeries)

A           float64
B           float64
C            object
D    datetime64[ns]
dtype: object


In [94]:
# tip 7
# delete a dataframe column

# df = df.drop('column_name', 1)
# where 1 is the axis number (0 for rows and 1 for columns.)

# To delete the column without having to reassign df you can do:
# df.drop('column_name', axis=1, inplace=True)

# To drop by column number instead of by column label, try this to delete, e.g. the 1st, 2nd and 4th columns:
# df = df.drop(df.columns[[0, 1, 3]], axis=1)  
# df.columns is zero-based pd.Index 

# Working with "text" syntax for the columns:
# df.drop(['column_nameA', 'column_nameB'], axis=1, inplace=True)

dftest = pandas.util.testing.makeMixedDataFrame()
print(dftest)
dftest = dftest.drop('C', 1)
print(dftest)

     A    B     C          D
0 0.00 0.00  foo1 2009-01-01
1 1.00 1.00  foo2 2009-01-02
2 2.00 0.00  foo3 2009-01-05
3 3.00 1.00  foo4 2009-01-06
4 4.00 0.00  foo5 2009-01-07
     A    B          D
0 0.00 0.00 2009-01-01
1 1.00 1.00 2009-01-02
2 2.00 0.00 2009-01-05
3 3.00 1.00 2009-01-06
4 4.00 0.00 2009-01-07


In [95]:
# tip 8
# reorder columns (flipflop column a and column d)
dftest = pandas.util.testing.makeMixedDataFrame()
print(dftest)
dftest = dftest[['D','B','C','A']]
print(dftest)

     A    B     C          D
0 0.00 0.00  foo1 2009-01-01
1 1.00 1.00  foo2 2009-01-02
2 2.00 0.00  foo3 2009-01-05
3 3.00 1.00  foo4 2009-01-06
4 4.00 0.00  foo5 2009-01-07
           D    B     C    A
0 2009-01-01 0.00  foo1 0.00
1 2009-01-02 1.00  foo2 1.00
2 2009-01-05 0.00  foo3 2.00
3 2009-01-06 1.00  foo4 3.00
4 2009-01-07 0.00  foo5 4.00


In [96]:
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR,OptYr,id_length
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21
2,"Schneider, Laura E",2020-10-08,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..,21


In [97]:
# delete tpid_length column
df = df.drop('id_length', 1)
df.head(3)

Unnamed: 0,User,Work Date,Description,CLIN,WBS,ProjectID,Straight,Overtime,Total,SR,OptYr
0,"Schneider, Laura E",2020-10-06,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..
1,"Schneider, Laura E",2020-10-07,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..
2,"Schneider, Laura E",2020-10-08,Software Services,111,509,613976.00.01.0111.509,10.0,0.0,10.0,SR-0000000,OY..


In [98]:
df.to_excel (r'U:\613976-AP2 Regular & OT Hours for Adv by CCC_#505ap-modified-20210104.xlsx', index = False, header=True)