# Data Wrangling Template

## Gather

In [12]:
import zipfile 
import pandas as pd

In [10]:
# Extract every member of the downloaded archive into the current working directory.
with zipfile.ZipFile('archive.zip', mode='r') as archive:
    archive.extractall()

In [13]:
# Load the job-postings CSV (extracted from archive.zip in the previous cell)
# into a pandas DataFrame; every later cell reads from `df`.
df = pd.read_csv('online-job-postings.csv')

## Assess

In [39]:
# Preview the last five rows to eyeball the columns and typical values.
# NOTE(review): the saved output shows an 'Unnamed: 0' column and this cell's
# execution count is out of order — it looks like it was run after the CSV had
# been re-saved with its index; re-run from a fresh kernel to confirm.
df.tail()

Unnamed: 0,jobpost,date,Title,Company,AnnouncementCode,Term,Eligibility,Audience,StartDate,Duration,...,Salary,ApplicationP,OpeningDate,Deadline,Notes,AboutC,Attach,Year,Month,IT
18996,Technolinguistics NGO\r\n\r\n\r\nTITLE: Senio...,"Dec 28, 2015",Senior Creative UX/ UI Designer,Technolinguistics NGO,,Full-time,,,,Long-term,...,Competitive,"To apply for this position, please send your\r...",29 December 2015,28 January 2016,,As a company Technolinguistics has a mandate t...,,2015,12,False
18997,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Category Development Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18998,"""Coca-Cola Hellenic Bottling Company Armenia"" ...","Dec 30, 2015",Operational Marketing Manager,"""Coca-Cola Hellenic Bottling Company Armenia"" ...",,Full-time,All interested professionals.,,ASAP,Long-term with a probation period of 3 months.,...,,All interested candidates are kindly requested...,30 December 2015,20 January 2016,,,,2015,12,False
18999,San Lazzaro LLC\r\n\r\n\r\nTITLE: Head of O...,"Dec 30, 2015",Head of Online Sales Department,San Lazzaro LLC,,,,,,Long-term,...,Highly competitive,Interested candidates can send their CVs to:\r...,30 December 2015,29 January 2016,,San Lazzaro LLC works with several internation...,,2015,12,False
19000,"""Kamurj"" UCO CJSC\r\n\r\n\r\nTITLE: Lawyer in...","Dec 30, 2015",Lawyer in Legal Department,"""Kamurj"" UCO CJSC",,Full-time,,,,Indefinite,...,,All qualified applicants are encouraged to\r\n...,30 December 2015,20 January 2016,,"""Kamurj"" UCO CJSC is providing micro and small...",,2015,12,False


In [15]:
# Summarise column names, non-null counts and dtypes to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19001 entries, 0 to 19000
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   jobpost           19001 non-null  object
 1   date              19001 non-null  object
 2   Title             18973 non-null  object
 3   Company           18994 non-null  object
 4   AnnouncementCode  1208 non-null   object
 5   Term              7676 non-null   object
 6   Eligibility       4930 non-null   object
 7   Audience          640 non-null    object
 8   StartDate         9675 non-null   object
 9   Duration          10798 non-null  object
 10  Location          18969 non-null  object
 11  JobDescription    15109 non-null  object
 12  JobRequirment     16479 non-null  object
 13  RequiredQual      18517 non-null  object
 14  Salary            9622 non-null   object
 15  ApplicationP      18941 non-null  object
 16  OpeningDate       18295 non-null  object
 17  Deadline    

In [16]:
# Number of distinct values per column (high counts in date-like columns hint at inconsistent formats).
df.nunique()

jobpost             18892
date                 4391
Title                8636
Company              4554
AnnouncementCode     1014
Term                  411
Eligibility           663
Audience              216
StartDate            1186
Duration             1515
Location              759
JobDescription      12861
JobRequirment       14182
RequiredQual        16688
Salary               2692
ApplicationP        14187
OpeningDate          3344
Deadline             5202
Notes                1031
AboutC               6016
Attach               1495
Year                   12
Month                  12
IT                      2
dtype: int64

In [17]:
# Data type of each column.
df.dtypes

jobpost             object
date                object
Title               object
Company             object
AnnouncementCode    object
Term                object
Eligibility         object
Audience            object
StartDate           object
Duration            object
Location            object
JobDescription      object
JobRequirment       object
RequiredQual        object
Salary              object
ApplicationP        object
OpeningDate         object
Deadline            object
Notes               object
AboutC              object
Attach              object
Year                 int64
Month                int64
IT                    bool
dtype: object

In [18]:
# List the column headers to check their naming.
df.columns

Index(['jobpost', 'date', 'Title', 'Company', 'AnnouncementCode', 'Term',
       'Eligibility', 'Audience', 'StartDate', 'Duration', 'Location',
       'JobDescription', 'JobRequirment', 'RequiredQual', 'Salary',
       'ApplicationP', 'OpeningDate', 'Deadline', 'Notes', 'AboutC', 'Attach',
       'Year', 'Month', 'IT'],
      dtype='object')

In [21]:
# Inspect the row index of the loaded frame.
df.index

RangeIndex(start=0, stop=19001, step=1)

In [22]:
# Count fully duplicated rows. Series.sum is vectorised, whereas the builtin
# sum() walks the boolean Series element by element in Python.
df.duplicated().sum()

39

In [25]:
# Number of job postings per year, most frequent year first.
df['Year'].value_counts()

2012    2149
2015    2009
2013    2009
2014    1983
2008    1785
2011    1697
2007    1538
2010    1511
2009    1191
2005    1138
2006    1116
2004     875
Name: Year, dtype: int64

### Quality

- Missing values (NaN) in many columns
- Inconsistent representations of the same value in the StartDate column: "ASAP" also appears as "As soon as possible", "Immediately", and many other similar variants


### Tidiness

- Nondescriptive column headers (ApplicationP, AboutC, RequiredQual, ...) and a misspelled one (JobRequirment)



## Clean

### Define

- Replace every StartDate variant that means "as soon as possible" with the single value "ASAP"
- Rename the nondescriptive and misspelled column headers to full, correctly spelled words

#### Issue 1

- Nondescriptive and misspelled column headers

#### Code


In [51]:
# Clean a copy so the loaded frame `df` is left as it was read.
clean_df = df.copy()

- Select all nondescriptive and misspelled column headers (ApplicationP, AboutC, RequiredQual, JobRequirment) and replace them with full words (ApplicationProcedure, AboutCompany, RequiredQualifications, JobRequirement)

In [52]:
# Swap abbreviated / misspelled headers for full, correctly spelled names.
clean_df = clean_df.rename(
    columns={
        'ApplicationP': 'ApplicationProcedure',
        'RequiredQual': 'RequiredQualifications',
        'AboutC': 'AboutCompany',
        'JobRequirment': 'JobRequirement',
    }
)

#### Test

In [53]:
# Verify that the four headers now carry their new names.
clean_df.columns

Index(['jobpost', 'date', 'Title', 'Company', 'AnnouncementCode', 'Term',
       'Eligibility', 'Audience', 'StartDate', 'Duration', 'Location',
       'JobDescription', 'JobRequirement', 'RequiredQualifications', 'Salary',
       'ApplicationProcedure', 'OpeningDate', 'Deadline', 'Notes',
       'AboutCompany', 'Attach', 'Year', 'Month', 'IT'],
      dtype='object')

#### Issue 2

- StartDate holds many different spellings of "ASAP"; collapse them into the single value "ASAP"

#### Code

In [59]:
# StartDate variants that should all be treated as "ASAP".
# NOTE(review): replacement is an exact string match, so the entries wrapped in
# literal double quotes only match cells that contain those quote characters —
# confirm against the data.
asap_list = ['Immediately', 'As soon as possible', 'Upon hiring',
             'Immediate', 'Immediate employment', 'As soon as possible.', 'Immediate job opportunity',
             '"Immediate employment, after passing the interview."',
             'ASAP preferred', 'Employment contract signature date',
             'Immediate employment opportunity', 'Immidiately', 'ASA',
             'Asap', '"The position is open immediately but has a flexible start date depending on the candidates earliest availability."',
             'Immediately upon agreement', '20 November 2014 or ASAP',
             'immediately', 'Immediatelly',
             '"Immediately upon selection or no later than November 15, 2009."',
             'Immediate job opening', 'Immediate hiring', 'Upon selection',
             'As soon as practical', 'Immadiate', 'As soon as posible',
             'Immediately with 2 months probation period',
             '12 November 2012 or ASAP', 'Immediate employment after passing the interview',
             'Immediately/ upon agreement', '01 September 2014 or ASAP',
             'Immediately or as per agreement', 'as soon as possible',
             'As soon as Possible', 'in the nearest future', 'immediate',
             '01 April 2014 or ASAP', 'Immidiatly', 'Urgent',
             'Immediate or earliest possible', 'Immediate hire',
             'Earliest  possible', 'ASAP with 3 months probation period.',
             'Immediate employment opportunity.', 'Immediate employment.',
             'Immidietly', 'Imminent', 'September 2014 or ASAP', 'Imediately']

# Series.replace returns a NEW Series; the earlier per-item loop discarded every
# result, so clean_df was never changed. Assign the result back, and pass the
# whole list so all variants are replaced in a single vectorised call.
clean_df['StartDate'] = clean_df['StartDate'].replace(asap_list, 'ASAP')

#### Test

In [64]:
# Check the result: 'ASAP' should now absorb all the variants listed in asap_list.
clean_df['StartDate'].value_counts()

ASAP                       6856
01 September 2012            31
March 2006                   27
November 2006                22
January 2010                 19
                           ... 
09 October 2006               1
13 April 2009                 1
03 March 2005                 1
The end of January 2005       1
18 June 2012                  1
Name: StartDate, Length: 1140, dtype: int64

In [66]:
# Save the cleaned data.
# index=False keeps the RangeIndex out of the file; without it, re-reading the
# CSV produces a spurious 'Unnamed: 0' column.
# NOTE(review): this overwrites the raw CSV extracted from archive.zip —
# consider writing the cleaned data to a separate file name.
clean_df.to_csv('online-job-postings.csv', index=False)