In [1]:
# Dependencies
import pandas as pd

In [2]:
# Path of the input CSV file (relative to the notebook's working directory)
file = 'Resources/tas_employment.csv'

In [3]:
# Read the CSV into a DataFrame. No encoding is passed, so pandas falls back
# to its default (UTF-8); supply encoding=... here if the file is stored
# in a different encoding.
df = pd.read_csv(file)

In [4]:
# Preview the first five rows of the DataFrame.
# Note that OCCUP_INDEX appears to duplicate OCCUP.
df.head()

Unnamed: 0,NAME,AGE,OCCUP,YEAR,NAME_RANK,EMPLOY_DATE,EMPLOYER,OCCUP_INDEX,RECORD_URL
0,"Crowther, Edward Lodewyk",33.0,Doctor of Medicine,1878,Commanding Officer,23 Apr 1878,Southern Volunteer Artillery,Doctor of Medicine,https://stors.tas.gov.au/NI/1517087
1,"Crowther, William L",63.0,Doctor of Medicine,1878,Surgeon Major,10 May 1878,Southern Volunteer Artillery,Doctor of Medicine,https://stors.tas.gov.au/NI/1517088
2,"Roblin, Thomas",50.0,Curator of Museum,1878,Lieutenant,23 Apr 1878,Southern Volunteer Artillery,Curator of Museum,https://stors.tas.gov.au/NI/1517089
3,"Lewis, D",,Merchant,1878,Major Paymaster,23 Apr 1878,Southern Volunteer Artillery,Merchant,https://stors.tas.gov.au/NI/1517090
4,"Green, William Patrick",,Gentleman,1878,Quartermaster Captain,16 Aug 1878,Southern Volunteer Artillery,Gentleman,https://stors.tas.gov.au/NI/1517091


In [5]:
# Drop the OCCUP_INDEX column, which appears to duplicate OCCUP.
# DataFrame.drop returns a new frame, and errors='ignore' skips a column that
# is already gone, so this cell can be re-run safely (the previous
# `del df['OCCUP_INDEX']` raised a KeyError on a second run).
df = df.drop(columns=['OCCUP_INDEX'], errors='ignore')
df.head()

Unnamed: 0,NAME,AGE,OCCUP,YEAR,NAME_RANK,EMPLOY_DATE,EMPLOYER,RECORD_URL
0,"Crowther, Edward Lodewyk",33.0,Doctor of Medicine,1878,Commanding Officer,23 Apr 1878,Southern Volunteer Artillery,https://stors.tas.gov.au/NI/1517087
1,"Crowther, William L",63.0,Doctor of Medicine,1878,Surgeon Major,10 May 1878,Southern Volunteer Artillery,https://stors.tas.gov.au/NI/1517088
2,"Roblin, Thomas",50.0,Curator of Museum,1878,Lieutenant,23 Apr 1878,Southern Volunteer Artillery,https://stors.tas.gov.au/NI/1517089
3,"Lewis, D",,Merchant,1878,Major Paymaster,23 Apr 1878,Southern Volunteer Artillery,https://stors.tas.gov.au/NI/1517090
4,"Green, William Patrick",,Gentleman,1878,Quartermaster Captain,16 Aug 1878,Southern Volunteer Artillery,https://stors.tas.gov.au/NI/1517091


In [6]:
# Identify incomplete rows: count() reports the non-null values per column,
# so any column with a lower count than the others has missing entries.
df.count()

NAME           4325
AGE             811
OCCUP          3739
YEAR           4172
NAME_RANK       675
EMPLOY_DATE     852
EMPLOYER        877
RECORD_URL     4325
dtype: int64

In [7]:
# Keep only the rows in which every column has a value.
# dropna() removes a row as soon as any single column is missing
# (how='any' is its default behaviour).
df = df.dropna()

In [8]:
# Verify the drop: every column should now report the same non-null count
df.count()

NAME           613
AGE            613
OCCUP          613
YEAR           613
NAME_RANK      613
EMPLOY_DATE    613
EMPLOYER       613
RECORD_URL     613
dtype: int64

In [9]:
# Check the data type of each column.
# The YEAR column is the wrong data type. It should be an integer.
df.dtypes

NAME            object
AGE            float64
OCCUP           object
YEAR            object
NAME_RANK       object
EMPLOY_DATE     object
EMPLOYER        object
RECORD_URL      object
dtype: object

In [10]:
# Use df.astype() to convert the YEAR column to a 64-bit integer.
# The width is spelled out ("int64") rather than using the builtin `int`,
# because `int` maps to a platform-dependent size on older NumPy versions
# (int32 on Windows). astype() raises by default if a value cannot be
# converted, so bad YEAR entries are not silently kept.
df = df.astype({"YEAR": "int64"})

In [11]:
# Verify that the YEAR column now has an integer dtype
df['YEAR'].dtype

dtype('int64')

In [12]:
# Display an overview of the OCCUP column: how often each occupation occurs,
# most frequent first. Useful for spotting inconsistent spellings.
df['OCCUP'].value_counts()

Clerk                 85
Labourer              41
Bootmaker             40
Carpenter             30
Blacksmith            23
                      ..
Tin Carrier            1
Hatter                 1
Chemists Assistant     1
None at present        1
Solderer               1
Name: OCCUP, Length: 170, dtype: int64

In [13]:
# Clean up the OCCUP categories by folding variant spellings into a single
# label (variant -> label that is kept). replace() with a dict only matches
# whole cell values, so other occupations are left untouched.
df = df.assign(
    OCCUP=df['OCCUP'].replace({
        'Laborer': 'Labourer',
        'Stone Mason': 'Stonemason',
        'Boot Maker': 'Bootmaker',
        'Coachtrimmer': 'Coach Trimmer',
        'None': 'None at present',
    })
)

In [14]:
# Verify the clean-up: the variant spellings should no longer appear as
# separate categories in the counts.
df['OCCUP'].value_counts()

Clerk                   85
Labourer                50
Bootmaker               42
Carpenter               30
Blacksmith              23
                        ..
Tin Carrier              1
Hatter                   1
Government Messenger     1
Estate Agent             1
Drayman                  1
Name: OCCUP, Length: 165, dtype: int64

In [15]:
# Display a statistical overview of the numeric columns.
# The 'min' and 'max' rows give the range of values found in each column.
df.describe()

Unnamed: 0,AGE,YEAR
count,613.0,613.0
mean,22.168026,1884.690049
std,6.170202,6.250635
min,16.0,1843.0
25%,18.0,1879.0
50%,20.0,1883.0
75%,24.0,1889.0
max,71.0,1899.0


In [16]:
# Save the cleaned DataFrame to a new CSV file; index=False leaves the
# row index out of the file.
df.to_csv("Resources/tas_employment_clean.csv", index=False)