# Data Wrangling Template

## Gather

In [1]:
import zipfile
import pandas as pd

# Unpack every member of the archive into the current working directory
with zipfile.ZipFile(file='data/armenian-online-job-postings.zip', mode='r') as archive:
    archive.extractall()

In [2]:
# Load the extracted job-postings CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='online-job-postings.csv')

## Assess

In [6]:
# Visual assessment: preview the first ten rows of the raw data
df.head(n=10)

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
0,AMERIA Investment Consulting Company\r\nJOB TI...,"Jan 5, 2004",Chief Financial Officer,AMERIA Investment Consulting Company,,,,,,,...,,"To apply for this position, please submit a\r\...",,26 January 2004,,,,2004,1,False
1,International Research & Exchanges Board (IREX...,"Jan 7, 2004",Full-time Community Connections Intern (paid i...,International Research & Exchanges Board (IREX),,,,,,3 months,...,,Please submit a cover letter and resume to:\r\...,,12 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
2,Caucasus Environmental NGO Network (CENN)\r\nJ...,"Jan 7, 2004",Country Coordinator,Caucasus Environmental NGO Network (CENN),,,,,,Renewable annual contract\r\nPOSITION,...,,Please send resume or CV toursula.kazarian@......,,20 January 2004\r\nSTART DATE: February 2004,,The Caucasus Environmental NGO Network is a\r\...,,2004,1,False
3,Manoff Group\r\nJOB TITLE: BCC Specialist\r\n...,"Jan 7, 2004",BCC Specialist,Manoff Group,,,,,,,...,,Please send cover letter and resume to Amy\r\n...,,23 January 2004\r\nSTART DATE: Immediate,,,,2004,1,False
4,Yerevan Brandy Company\r\nJOB TITLE: Software...,"Jan 10, 2004",Software Developer,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"20 January 2004, 18:00",,,,2004,1,True
5,"Boutique ""Appollo""\r\nJOB TITLE: Saleswoman\r...","Jan 10, 2004",Saleswoman,"Boutique ""Appollo""",,,,,,,...,,"For further information, please contact Irina\...",,01 February 2004,,,,2004,1,False
6,OSI Assistance Foundation - Armenian Branch Of...,"Jan 11, 2004",Chief Accountant/ Finance Assistant,OSI Assistance Foundation - Armenian Branch Of...,,,,,,,...,,"For submission of applications/ CVs, please\r\...",,"16 January 2004, 6:00 pm.",,,,2004,1,False
7,International Research & Exchanges Board (IREX...,"Jan 13, 2004",Non-paid part or full time Programmatic Intern,International Research & Exchanges Board (IREX),,,,,,6 months,...,,"To apply, please download and submit the\r\nap...",,16 January 2004,,The International Research & Exchanges Board (...,,2004,1,False
8,Yerevan Brandy Company \r\nJOB TITLE: Assista...,"Jan 13, 2004",Assistant to Managing Director,Yerevan Brandy Company,,,,,,,...,,Successful candidates should submit\r\n- CV; \...,,"27 January 2004, 18:00",,,,2004,1,False
9,American Embassy Yerevan\r\nANNOUNCEMENT NUMBE...,"Jan 13, 2004","Program Assistant (INL), FSN-8; FP-6*",American Embassy Yerevan\r\nANNOUNCEMENT NUMBE...,,,,,,,...,,Interested candidates for this position should...,,26 January 2004 \r\nDrafted: GSargsyan\r\nC...,,,,2004,1,False


- Missing values (NaN)
- StartDate inconsistencies (many different spellings of "as soon as possible")

In [4]:
# Programmatic assessment: list each column with its non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
jobpost             19001 non-null object
date                19001 non-null object
Title               18973 non-null object
Company             18994 non-null object
AnnouncementCode    1208 non-null object
Term                7676 non-null object
Eligibility         4930 non-null object
Audience            640 non-null object
StartDate           9675 non-null object
Duration            10798 non-null object
Location            18969 non-null object
JobDescription      15109 non-null object
JobRequirment       16479 non-null object
RequiredQual        18517 non-null object
Salary              9622 non-null object
ApplicationP        18941 non-null object
OpeningDate         18295 non-null object
Deadline            18936 non-null object
Notes               2211 non-null object
AboutC              12470 non-null object
Attach              1559 non-null object
Year              

- Nondescriptive column headers (ApplicationP, AboutC, RequiredQual, JobRequirment)

## Clean

#### Define



#### Code

In [7]:
# Clean a deep copy so the raw frame `df` stays untouched for comparison
df_clean = df.copy(deep=True)

- Rename nondescriptive column headers

In [8]:
# Give the abbreviated / misspelled headers descriptive names.
# The result is assigned back rather than using inplace=True: the data flow
# stays explicit and the cell remains safe to re-run (keys that are already
# renamed are simply ignored by rename's default errors='ignore').
df_clean = df_clean.rename(columns={'ApplicationP': 'ApplicationProcedure',
                                    'AboutC': 'AboutCompany',
                                    'RequiredQual': 'RequiredQualifications',
                                    'JobRequirment': 'JobRequirement'})

- Replace every StartDate value that means "as soon as possible" with "ASAP"

In [10]:
# Free-text StartDate spellings that all mean "start as soon as possible".
# NOTE(review): three entries below are wrapped in literal double quotes
# ('"Immediate employment, ..."' etc.). If those quotes are an export artifact
# rather than part of the stored values, these entries never match -- confirm
# against df.StartDate.value_counts().
asap_list = ['Immediately', 'As soon as possible', 'Upon hiring',
             'Immediate', 'Immediate employment', 'As soon as possible.', 'Immediate job opportunity',
             '"Immediate employment, after passing the interview."',
             'ASAP preferred', 'Employment contract signature date',
             'Immediate employment opportunity', 'Immidiately', 'ASA',
             'Asap', '"The position is open immediately but has a flexible start date depending on the candidates earliest availability."',
             'Immediately upon agreement', '20 November 2014 or ASAP',
             'immediately', 'Immediatelly',
             '"Immediately upon selection or no later than November 15, 2009."',
             'Immediate job opening', 'Immediate hiring', 'Upon selection',
             'As soon as practical', 'Immadiate', 'As soon as posible',
             'Immediately with 2 months probation period',
             '12 November 2012 or ASAP', 'Immediate employment after passing the interview',
             'Immediately/ upon agreement', '01 September 2014 or ASAP',
             'Immediately or as per agreement', 'as soon as possible',
             'As soon as Possible', 'in the nearest future', 'immediate',
             '01 April 2014 or ASAP', 'Immidiatly', 'Urgent',
             'Immediate or earliest possible', 'Immediate hire',
             'Earliest  possible', 'ASAP with 3 months probation period.',
             'Immediate employment opportunity.', 'Immediate employment.',
             'Immidietly', 'Imminent', 'September 2014 or ASAP', 'Imediately']

# Replace every variant in a single vectorized call and assign the result back
# to the column. Calling `.replace(..., inplace=True)` on `df_clean.StartDate`
# is chained assignment: it mutates an intermediate Series, which pandas warns
# about and which no longer updates `df_clean` once Copy-on-Write is enabled.
df_clean['StartDate'] = df_clean['StartDate'].replace(asap_list, 'ASAP')

#### Test

In [15]:
# Visual check: the last ten rows should show the renamed headers and "ASAP" start dates
df_clean.tail(n=10)

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationProcedure,OpeningDate,Deadline,Notes,AboutCompany,Attach,Year,Month,IT
18991,NASDAQ OMX Armenia OJSC\r\n\r\n\r\nTITLE: C/ ...,"Dec 24, 2015",C/ C++ Developer,NASDAQ OMX Armenia OJSC,,Full-time,,,,,...,Competitive,Interested candidates can submit their CVs to:...,25 December 2015,24 January 2016,,"For more information, please visit: www.nasdaq...",,2015,12,True
18992,Macadamian AR CJSC\r\n\r\n\r\nTITLE: .NET Dev...,"Dec 25, 2015",.NET Developer,Macadamian AR CJSC,,,,,,,...,,"To apply for this position, please email your ...",25 December 2015,24 January 2016,,Macadamian AR is a global software development...,,2015,12,True
18993,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 25, 2015",Recruitment Specialist,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested candidates.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,25 December 2015,24 January 2016,,,,2015,12,False
18994,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 25, 2015",Capability Development Specialist,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,,All interested candidates.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,25 December 2015,24 January 2016,,,,2015,12,False
18995,"""Transport PIU"" State Institution of the RA Mi...","Dec 24, 2015",Deputy Director,"""Transport PIU"" State Institution of the RA Mi...",,,All interested and qualified candidates.,,ASAP,Long-term with a probation period of 3 months.,...,Commensurate with skills and experience.,Interested candidates are asked to submit the\...,25 December 2015,"24 January 2016, 17:00 p.m.",,"The ""Transport PIU"" SI (PIU) of the RA Ministr...",,2015,12,False
18996,Technolinguistics NGO\r\n\r\n\r\nTITLE: Senio...,"Dec 28, 2015",Senior Creative UX/ UI Designer,Technolinguistics NGO,,Full-time,,,,Long-term,...,Competitive,"To apply for this position, please send your\r...",29 December 2015,28 January 2016,,As a company Technolinguistics has a mandate t...,,2015,12,False
18997,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Category Development Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18998,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Operational Marketing Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18999,San Lazzaro LLC\r\n\r\n\r\nTITLE: Head of O...,"Dec 30, 2015",Head of Online Sales Department,San Lazzaro LLC,,,,,,Long-term,...,Highly competitive,Interested candidates can send their CVs to:\r...,30 December 2015,29 January 2016,,San Lazzaro LLC works with several internation...,,2015,12,False
19000,"""Kamurj"" UCO CJSC\r\n\r\n\r\nTITLE: Lawyer in...","Dec 30, 2015",Lawyer in Legal Department,"""Kamurj"" UCO CJSC",,Full-time,,,,Indefinite,...,,All qualified applicants are encouraged to\r\n...,30 December 2015,20 January 2016,,"""Kamurj"" UCO CJSC is providing micro and small...",,2015,12,False


In [16]:
# Confirm the renamed columns are present in the cleaned frame
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
jobpost                   19001 non-null object
date                      19001 non-null object
Title                     18973 non-null object
Company                   18994 non-null object
AnnouncementCode          1208 non-null object
Term                      7676 non-null object
Eligibility               4930 non-null object
Audience                  640 non-null object
StartDate                 9675 non-null object
Duration                  10798 non-null object
Location                  18969 non-null object
JobDescription            15109 non-null object
JobRequirement            16479 non-null object
RequiredQualifications    18517 non-null object
Salary                    9622 non-null object
ApplicationProcedure      18941 non-null object
OpeningDate               18295 non-null object
Deadline                  18936 non-null object
Notes                     2211 non

In [17]:
# Frequency of each remaining StartDate value; "ASAP" should now be a single entry
df_clean['StartDate'].value_counts()

ASAP                                                             6856
01 September 2012                                                  31
March 2006                                                         27
November 2006                                                      22
January 2010                                                       19
February 2014                                                      17
01 February 2005                                                   17
TBD                                                                16
September 2010                                                     16
February 2011                                                      16
March 2011                                                         15
September 2008                                                     15
01 February 2015                                                   14
01 July 2014                                                       14
February 2007       

In [18]:
# None of the original ASAP spellings may survive in the cleaned column
remaining_start_dates = df_clean.StartDate.values
assert all(variant not in remaining_start_dates for variant in asap_list)