# Dependencies and read data

In [1]:
# Dependencies and Setup
import pandas as pd

# Relative path of the raw migration CSV
file = "Data/migration_data.csv"

# Load the CSV contents into a pandas DataFrame
migration_data = pd.read_csv(filepath_or_buffer=file)

# Preview the first five rows to confirm the file was read
migration_data.head(n=5)


Unnamed: 0,MEASURE:Migration type,SEX_ABS:Sex,AGE:Age,ASGS_2011:Region,FREQUENCY:Frequency,TIME:Time,Value,Flag Codes:Flags
0,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2004:2004,10080,
1,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2005:2005,10680,
2,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2006:2006,11840,
3,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2007:2007,13250,
4,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2008:2008,15470,


# Cleaning data - Drop column

In [2]:
# Drop the last column "Flag Codes:Flags" - not needed.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been removed.
migration_data = migration_data.drop(columns=['Flag Codes:Flags'], errors='ignore')

In [3]:
# Preview the frame to confirm 'Flag Codes:Flags' is no longer present
migration_data.head()

Unnamed: 0,MEASURE:Migration type,SEX_ABS:Sex,AGE:Age,ASGS_2011:Region,FREQUENCY:Frequency,TIME:Time,Value
0,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2004:2004,10080
1,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2005:2005,10680
2,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2006:2006,11840
3,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2007:2007,13250
4,1:NOM arrival,1:Males,A04:0 - 4,0:Australia,A:Annual,2008:2008,15470


# Cleaning data - Rename columns

In [4]:
# Swap the raw "CODE:Label" headers for short, readable column names.
# Columns that are not keys of the mapping keep their current name.
migration_data = migration_data.rename(
    {
        'MEASURE:Migration type': "Migration type",
        'SEX_ABS:Sex': "Sex",
        'AGE:Age': "Age",
        'ASGS_2011:Region': "Region",
        'FREQUENCY:Frequency': "Frequency",
        'TIME:Time': "Year",
    },
    axis="columns",
)

In [5]:
# List the column labels to check the result of the rename
migration_data.columns

Index(['Migration type', 'Sex', 'Age', 'Region', 'Frequency', 'Year', 'Value'], dtype='object')

# Cleaning data - Removing unwanted code prefixes (strings or numbers) from columns

In [6]:
# MIGRATION DATA
# Keep only the label after the first ':' (e.g. "1:NOM arrival" -> "NOM arrival").
# str.lstrip('123:') treats its argument as a set of characters rather than a
# prefix: it would also eat leading digits/colons that belong to the label and
# would miss any code outside 1-3. Splitting once on ':' removes exactly the
# code prefix; values without a ':' pass through unchanged, so re-running is safe.
migration_data['Migration type'] = migration_data['Migration type'].str.split(':', n=1).str[-1]
migration_data['Migration type']

0        NOM arrival
1        NOM arrival
2        NOM arrival
3        NOM arrival
4        NOM arrival
            ...     
19435            NOM
19436            NOM
19437            NOM
19438            NOM
19439            NOM
Name: Migration type, Length: 19440, dtype: object

In [7]:
# SEX
# Keep only the label after the first ':' (e.g. "1:Males" -> "Males").
# str.lstrip('123:') strips any leading run of those characters, not a "code:"
# prefix, so it silently depends on the codes being 1-3 and on no label starting
# with one of those characters. Splitting once on ':' removes exactly the prefix;
# values without a ':' pass through unchanged, so re-running is safe.
migration_data['Sex'] = migration_data['Sex'].str.split(':', n=1).str[-1]
migration_data['Sex']

0          Males
1          Males
2          Males
3          Males
4          Males
          ...   
19435    Persons
19436    Persons
19437    Persons
19438    Persons
19439    Persons
Name: Sex, Length: 19440, dtype: object

In [10]:
# AGE
# Keep only the label after the first ':' (e.g. "A04:0 - 4" -> "0 - 4").
# The vectorised .str[-1] replaces apply(lambda x: x[1]), which raised
# IndexError when the cell was re-run on already-cleaned values (no ':' left
# to split on). With n=1 and [-1], a value without ':' is returned unchanged
# and missing values stay missing.
migration_data['Age'] = migration_data['Age'].str.split(':', n=1).str[-1]


In [11]:
# Display the Age column to check the cleaned labels
migration_data['Age']

0              0 - 4
1              0 - 4
2              0 - 4
3              0 - 4
4              0 - 4
            ...     
19435    65 and over
19436    65 and over
19437    65 and over
19438    65 and over
19439    65 and over
Name: Age, Length: 19440, dtype: object

In [13]:
# REGION
# Keep only the label after the first ':' (e.g. "0:Australia" -> "Australia").
# str.lstrip('012345678:') strips any leading run of those characters rather
# than a "code:" prefix, so it would miss a code containing 9 and would eat
# leading digits of a label. Splitting once on ':' removes exactly the prefix;
# values without a ':' pass through unchanged, so re-running is safe.
migration_data['Region'] = migration_data['Region'].str.split(':', n=1).str[-1]
migration_data['Region']

0                           Australia
1                           Australia
2                           Australia
3                           Australia
4                           Australia
                     ...             
19435    Australian Capital Territory
19436    Australian Capital Territory
19437    Australian Capital Territory
19438    Australian Capital Territory
19439    Australian Capital Territory
Name: Region, Length: 19440, dtype: object

In [None]:
# TODO: Add the abbreviation for each region, e.g. WA, NSW, ACT, VIC, NT, TAS, QLD, AUS

In [14]:
# FREQUENCY
# Keep only the label after the first ':' (e.g. "A:Annual" -> "Annual").
# Replacing the literal 'A:Annual' only handled that single value; splitting
# once on ':' strips the code prefix from any frequency value and matches how
# the Age and Year columns are cleaned. Values without a ':' pass through
# unchanged, so re-running is safe.
migration_data['Frequency'] = migration_data['Frequency'].str.split(':', n=1).str[-1]
migration_data['Frequency']

0        Annual
1        Annual
2        Annual
3        Annual
4        Annual
          ...  
19435    Annual
19436    Annual
19437    Annual
19438    Annual
19439    Annual
Name: Frequency, Length: 19440, dtype: object

In [15]:
# TIME
# Keep only the part after the first ':' (e.g. "2004:2004" -> "2004").
# The vectorised .str[-1] replaces apply(lambda x: x[1]), which raised
# IndexError when the cell was re-run on already-cleaned values (no ':' left
# to split on). With n=1 and [-1], a value without ':' is returned unchanged.
# The column stays a string (dtype object), exactly as before.
migration_data['Year'] = migration_data['Year'].str.split(':', n=1).str[-1]
migration_data['Year']

0        2004
1        2005
2        2006
3        2007
4        2008
         ... 
19435    2015
19436    2016
19437    2017
19438    2018
19439    2019
Name: Year, Length: 19440, dtype: object

# Aggregating datasets 

In [16]:
# Preview the cleaned frame before aggregating
migration_data.head()

Unnamed: 0,Migration type,Sex,Age,Region,Frequency,Year,Value
0,NOM arrival,Males,0 - 4,Australia,Annual,2004,10080
1,NOM arrival,Males,0 - 4,Australia,Annual,2005,10680
2,NOM arrival,Males,0 - 4,Australia,Annual,2006,11840
3,NOM arrival,Males,0 - 4,Australia,Annual,2007,13250
4,NOM arrival,Males,0 - 4,Australia,Annual,2008,15470


In [22]:
# Total 'Value' for every (Migration type, Region, Year) combination.
# 'Value' is selected before .sum() so only that column is aggregated; calling
# .sum() on the whole grouped frame first also tries to sum the text columns
# (Sex, Age, Frequency), which is wasted work and behaves differently across
# pandas versions. The result is the same Series named 'Value'.
# NOTE(review): Sex holds 'Persons' rows alongside 'Males' rows. If 'Persons' is
# already the total across sexes, this sum counts people more than once - TODO
# confirm, and filter to one Sex (and check Age for an all-ages total) first.
net_migration = migration_data.groupby(['Migration type', 'Region', 'Year'])['Value'].sum()
net_migration

Migration type  Region             Year
NOM             Australia          2004     555230
                                   2005     628770
                                   2006     789950
                                   2007     976130
                                   2008    1262810
                                            ...   
NOM departure   Western Australia  2015     158920
                                   2016     154620
                                   2017     136150
                                   2018     120640
                                   2019     116270
Name: Value, Length: 432, dtype: int64