## Data anlaysis task

In [359]:
import pandas as pd

In [360]:
import re

In [361]:
df_admin = pd.read_excel('NZ_Admin_JOBS.xlsx', header= None)

### 1.1 Filling missing value

In [362]:
df_admin.isna().sum()

0     0
1     0
2    22
3     0
4     0
5     0
dtype: int64

In [363]:
df_admin.fillna('unknown', inplace = True)

### 1.2  drop duplicate rows

In [364]:
df_admin.drop_duplicates(inplace = True)

### 2.  split column 3 into location and area

In [365]:
df_admin[['location','area']] = df_admin[3].str.split('area:', expand = True)

In [366]:
df_admin.drop(3, axis = 1, inplace = True)

#### fillna for the area column

In [367]:
df_admin.isna().sum()

0             0
1             0
2             0
4             0
5             0
location      0
area        541
dtype: int64

In [368]:
df_admin['area'].fillna('unknown', inplace = True)

#### modify location column -- get rid of 'location:'

In [369]:
def apply_location(x):
    return x[9:]

In [370]:
df_admin['location'] = df_admin['location'].apply(apply_location)

#### modify location -- get rid of repeated text

In [371]:
def apply_repeated(x):
    middle = len(x)//2
    if x != 'unknown':
        return x[:middle + 1]
    else:
        return 'unknown'

In [372]:
df_admin['location'] = df_admin['location'].apply(apply_repeated)

#### clean the area column (get rid of unrelated information)

In [373]:
df_admin['area'] = df_admin['area'].str.split(',', expand = True)[0]

In [374]:
df_admin['area'] = df_admin['area'].apply(apply_repeated)

### 3. split column 5 into classification/Classification

In [375]:
df_admin[['classification','Classification']] = df_admin[5].str.split('Classification:', expand = True)

In [376]:
df_admin.drop(5, axis = 1, inplace = True)

#### modify the classification column to get rid of "classification:"

In [377]:
def apply_class(x, index):
    if '$' not in x:
        return x[index:]
    else:
        return x

In [378]:
#df_admin['classification'] = df_admin['classification'].apply(apply_class, args = (15,))

#### fillna for the column "Classification"

In [379]:
df_admin.isna().sum()

0                   0
1                   0
2                   0
4                   0
location            0
area                0
classification      0
Classification    603
dtype: int64

In [380]:
df_admin['Classification'].fillna('unknown', inplace = True)

#### modify the coloumn Classification - get rid of repeated text

In [381]:
df_admin['Classification'] = df_admin['Classification'].apply(apply_repeated)

### 4. split column 4 into posted time and featured at

In [382]:
df_admin[['posted time','featured at']] = df_admin[4].str.split(',', expand = True)[[0,2]]

In [383]:
df_admin.drop(4, axis = 1, inplace = True)

#### fillna for featured at

In [384]:
df_admin['featured at'].fillna('unknown', inplace = True)

### 5. create a new column "salary"

In [385]:
def apply_salary(x):
    if 'classification:' in x:
        return 'unknown'
    else:
        return x

In [386]:
df_admin['salary'] = df_admin['classification'].apply(apply_salary)

### 6. get rid of unrelated information in the classification content

In [387]:
def apply_classification(x):
    if 'classification:' in x:
        return x[15:]
    else:
        return 'unknown'

In [388]:
df_admin['classification'] = df_admin['classification'].apply(apply_classification)

In [389]:
df_admin.rename(columns={0:'Job Title', 1:'from website', 2:'company name'}, inplace = True)

## Classification: seperate the categries connected with '&'

In [390]:
df_admin[['classification1','classification2','classification3']] = df_admin['classification'].str.split('&', expand = True).fillna('unknown')

In [391]:
df_admin.drop('classification', axis = 1, inplace = True)

In [392]:
df_admin

Unnamed: 0,Job Title,from website,company name,location,area,Classification,posted time,featured at,salary,classification1,classification2,classification3
0,Administrator,https://www.seek.co.nz/job/50582301?type=promo...,unknown,Bay of Plenty,Tauranga,Office Management,Featured,Private Advertiser,unknown,Administration,Office SupportAdministration,Office Supportsub
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,Bay of Plenty,Tauranga,Receptionists,Featured,unknown,unknown,Administration,Office SupportAdministration,Office Supportsub
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,Auckland,unknown,Other,4d ago,unknown,unknown,Administration,Office SupportAdministration,Office Supportsub
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,Southland,Invercargill,Administrative Assistants,1h ago,unknown,unknown,Administration,Office SupportAdministration,Office Supportsub
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=stand...,unknown,Canterbury,Christchurch,Client & Sales Administration,4d ago,Private Advertiser,unknown,Administration,Office SupportAdministration,Office Supportsub
...,...,...,...,...,...,...,...,...,...,...,...,...
2703,Key Account Manager,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,Auckland,Auckland Central,Client & Sales Administration,27d ago,unknown,unknown,Administration,Office SupportAdministration,Office Supportsub
2704,Executive Assistant,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,Wellington,Wellington Central,unknown,27d ago,unknown,Competitive hourly rate,unknown,unknown,unknown
2705,Temporary Office Roles,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,Waikato,Hamilton,unknown,20d ago,unknown,Competitive hourly rates $$,unknown,unknown,unknown
2706,Temporary Office Roles,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,Waikato,Hamilton,unknown,28d ago,unknown,Competitive hourly rates $$,unknown,unknown,unknown


##  Salary: provide salary expectation, eg: unknown salary -> average salary rate

## 先把所有非数字的值置换成x + '-'

In [393]:
def apply_salary(x):
    if len(re.findall(r'\d+', x)) == 0:
        return 'unknown'
    else:
        return x

In [394]:
re.findall(r'\d+.*\d+','20')[0]

'20'

In [395]:
df_admin['salary'] = df_admin['salary'].apply(apply_salary)

In [396]:
def generate_max_salary(x):
    value = re.findall(r'\d+\.?\d+',x)
    if len(value) > 1:
        return value[1]
    else:
        return 'unknown'

In [397]:
df_admin['max_salary'] = df_admin['salary'].apply(generate_max_salary)

In [401]:
def apply_salary1(x):
    if len(re.findall(r'\d+\.?\d+', x)):
        value = re.findall(r'\d+\.?\d+',x)[0]
        if float(re.sub(r'[ \,]+','',value)) < 2000:
            return float(re.sub(r'[ \,]+','',value))*8*200
        return re.sub(r'[ \,]+','',value)
    else:
        return 0

In [402]:
df_admin['min_salary'] = df_admin['salary']

In [406]:
df_admin['min_salary'] = df_admin['min_salary'].apply(apply_salary1)

In [409]:
df_admin['max_salary'] = df_admin['max_salary'].apply(apply_salary1)

##### drop掉df_admin['salary']

In [410]:
df_admin.drop('salary', axis = 1, inplace = True)

##### 求出该列平均值

In [411]:
def compute_mean(x):
    index = 0
    sumvalue = 0
    for i in x:
        if float(i) == 0:
            continue
        else:
            sumvalue += float(i)
            index += 1
    return sumvalue/index

##### 工资这里有不同的单位 有时薪 有年薪

In [412]:
min_salary_mean = compute_mean(df_admin['min_salary'])

In [416]:
max_salary_mean = compute_mean(df_admin['max_salary'])

In [417]:
def fillvalue(x, mean):
    if x == 0:
        return mean
    else:
        return x

In [418]:
df_admin['min_salary'] = df_admin['min_salary'].apply(fillvalue, args = (min_salary_mean,))

In [419]:
df_admin['max_salary'] = df_admin['max_salary'].apply(fillvalue, args = (max_salary_mean,))

##  Convert posted_time to integer

In [420]:
#convert to day for every cell

In [421]:
df_admin['posted time']

0       Featured
1       Featured
2         4d ago
3         1h ago
4         4d ago
          ...   
2703     27d ago
2704     27d ago
2705     20d ago
2706     28d ago
2707     26d ago
Name: posted time, Length: 2708, dtype: object

In [422]:
for i in df_admin['posted time']:
    print(i)

Featured
Featured
4d ago
1h ago
4d ago
9m ago
5d ago
7m ago
2h ago
3d ago
7d ago
5d ago
8d ago
3d ago
3d ago
5d ago
4h ago
6d ago
3d ago
7d ago
2d ago
3d ago
Featured
Featured
3d ago
4d ago
2d ago
3d ago
3d ago
4h ago
6d ago
3d ago
4d ago
5d ago
3d ago
5d ago
4d ago
5h ago
4d ago
5d ago
3d ago
3d ago
4h ago
3d ago
Featured
Featured
4d ago
4h ago
3d ago
4d ago
3d ago
7d ago
6d ago
7d ago
6d ago
6d ago
3d ago
6d ago
5d ago
5d ago
11d ago
5d ago
3d ago
8d ago
5h ago
3d ago
Featured
Featured
5d ago
3d ago
6d ago
6d ago
7d ago
3d ago
4d ago
3d ago
3d ago
6d ago
6d ago
5d ago
5d ago
3d ago
3d ago
4d ago
6d ago
5d ago
4d ago
7d ago
Featured
Featured
7d ago
2h ago
5d ago
5d ago
6d ago
11d ago
4d ago
5d ago
3d ago
10d ago
3d ago
3d ago
3d ago
6d ago
7d ago
6d ago
10d ago
5d ago
6d ago
3d ago
Featured
Featured
8d ago
11d ago
4d ago
7d ago
7d ago
5d ago
4d ago
6d ago
12d ago
5d ago
5d ago
5d ago
3d ago
3d ago
7d ago
4d ago
5d ago
6d ago
7d ago
3d ago
Featured
Featured
6d ago
5d ago
7d ago
5d ago


28d ago
27d ago
28d ago
28d ago
28d ago
25d ago
26d ago
21d ago
26d ago
26d ago
14d ago
27d ago
27d ago
27d ago
20d ago
28d ago
26d ago
27d ago
25d ago
4d ago
27d ago
7d ago
30d ago
24d ago
28d ago
27d ago
27d ago
20d ago
28d ago
27d ago
28d ago
28d ago
28d ago
25d ago
26d ago
21d ago
26d ago
26d ago
14d ago
27d ago
27d ago
27d ago
20d ago
28d ago
26d ago
27d ago
25d ago
4d ago
27d ago
7d ago
30d ago
24d ago
28d ago
27d ago
27d ago
20d ago
28d ago
27d ago
28d ago
28d ago
28d ago
25d ago
26d ago
21d ago
26d ago
26d ago
14d ago
27d ago
27d ago
27d ago
20d ago
28d ago
26d ago
21d ago
26d ago
18d ago
17d ago
26d ago
28d ago
28d ago
20d ago
30d+ ago
11d ago
25d ago
26d ago
28d ago
25d ago
19d ago
26d ago
18d ago
25d ago
12d ago
24d ago
24d ago
27d ago
25d ago
4d ago
27d ago
7d ago
30d ago
24d ago
28d ago
27d ago
27d ago
20d ago
28d ago
27d ago
28d ago
28d ago
28d ago
25d ago
26d ago
21d ago
26d ago
26d ago
14d ago
27d ago
27d ago
27d ago
20d ago
28d ago
26d ago
24d ago
27d ago
25d ago
4d ag

In [423]:
re.findall(r'\d+', '7d ago')[0]

'7'

In [424]:
def apply_posted_time(x):
    if x == 'Featured':
        return 'unKnown'
    if 'm' in x:
        return float(re.findall(r'\d+', x)[0])*30
    elif 'h' in x:
        return float(re.findall(r'\d+', x)[0])/24
    elif 'd' in x:
        return float(re.findall(r'\d+', x)[0])

In [425]:
df_admin['posted time'] = df_admin['posted time'].apply(apply_posted_time)

In [426]:
def compute_mean(x):
    index = 0
    sumvalue = 0
    for i in x:
        if i != 'unKnown':
            sumvalue += float(i)
            index += 1  
    return sumvalue/index

In [427]:
compute_mean(df_admin['posted time'])

21.938058035714285

In [428]:
df_admin['posted time'] = df_admin['posted time'].replace('unKnown',compute_mean(df_admin['posted time']))

## Featured at: the column is not very meaningful, try to merge this column to company name column

In [429]:
df_admin['company name'] = df_admin['company name'].replace('unknown','Private Advertiser')

In [430]:
df_admin.drop('featured at', axis = 1, inplace = True)

## 6. Change dtypes from object to a proper data type (integer, string, double, etc.)

In [431]:
df_admin['Job Title'] = df_admin['Job Title'].astype('string')

In [432]:
df_admin['company name'] = df_admin['company name'].astype('string')

In [433]:
df_admin['location'] = df_admin['location'].astype('string')

In [434]:
df_admin['area'] = df_admin['area'].astype('string')

In [435]:
df_admin['Classification'] = df_admin['Classification'].astype('string')

In [436]:
df_admin['classification1'] = df_admin['classification1'].astype('string')

In [437]:
df_admin['classification2'] = df_admin['classification2'].astype('string')

In [438]:
df_admin['classification3'] = df_admin['classification3'].astype('string')

In [439]:
df_admin['posted time'] = df_admin['posted time'].astype('int')

In [440]:
df_admin['max_salary'] = df_admin['max_salary'].astype('float')

In [441]:
df_admin['min_salary'] = df_admin['min_salary'].astype('float')

In [442]:
df_admin.dtypes

Job Title           string
from website        object
company name        string
location            string
area                string
Classification      string
posted time          int64
classification1     string
classification2     string
classification3     string
max_salary         float64
min_salary         float64
dtype: object

In [443]:
df_admin

Unnamed: 0,Job Title,from website,company name,location,area,Classification,posted time,classification1,classification2,classification3,max_salary,min_salary
0,Administrator,https://www.seek.co.nz/job/50582301?type=promo...,Private Advertiser,Bay of Plenty,Tauranga,Office Management,21,Administration,Office SupportAdministration,Office Supportsub,103288.156863,57564.210526
1,Receptionist,https://www.seek.co.nz/job/50620889?type=promo...,Avenues Orthodontics,Bay of Plenty,Tauranga,Receptionists,21,Administration,Office SupportAdministration,Office Supportsub,103288.156863,57564.210526
2,Prosecutions Support Officer,https://www.seek.co.nz/job/50622169?type=stand...,New Zealand Police,Auckland,unknown,Other,4,Administration,Office SupportAdministration,Office Supportsub,103288.156863,57564.210526
3,Early Childhood Centre Administrator,https://www.seek.co.nz/job/50639620?type=stand...,Kew Pacific Island Early Learning Centre,Southland,Invercargill,Administrative Assistants,0,Administration,Office SupportAdministration,Office Supportsub,103288.156863,57564.210526
4,Business Support Administrator,https://www.seek.co.nz/job/50622432?type=stand...,Private Advertiser,Canterbury,Christchurch,Client & Sales Administration,4,Administration,Office SupportAdministration,Office Supportsub,103288.156863,57564.210526
...,...,...,...,...,...,...,...,...,...,...,...,...
2703,Key Account Manager,https://www.seek.co.nz/job/50490062?type=stand...,Hays Talent Solutions,Auckland,Auckland Central,Client & Sales Administration,27,Administration,Office SupportAdministration,Office Supportsub,103288.156863,57564.210526
2704,Executive Assistant,https://www.seek.co.nz/job/50488000?type=stand...,one eighty recruitment,Wellington,Wellington Central,unknown,27,unknown,unknown,unknown,103288.156863,57564.210526
2705,Temporary Office Roles,https://www.seek.co.nz/job/50524865?type=stand...,Asset Recruitment Ltd,Waikato,Hamilton,unknown,20,unknown,unknown,unknown,103288.156863,57564.210526
2706,Temporary Office Roles,https://www.seek.co.nz/job/50477118?type=stand...,Asset Recruitment Ltd,Waikato,Hamilton,unknown,28,unknown,unknown,unknown,103288.156863,57564.210526
