## Import packages and load the CSV file

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Job listings dataset (listings2019_2022.csv) from the Jobseeker_Baymax GitHub repository.
LISTINGS_CSV_URL = "https://raw.githubusercontent.com/wongwara/Jobseeker_Baymax/main/dataset/listings2019_2022.csv"
df = pd.read_csv(LISTINGS_CSV_URL)

## Basic understanding of the data set

In [2]:
# Per-column summary (non-null count and dtype); shows which fields are only partly populated.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3902 entries, 0 to 3901
Data columns (total 52 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   jobId                  3902 non-null   int64  
 1   jobTitle               3902 non-null   object 
 2   jobClassification      3902 non-null   object 
 3   jobSubClassification   3902 non-null   object 
 4   advertiserName         3902 non-null   object 
 5   advertiserId           3902 non-null   int64  
 6   companyId              1067 non-null   float64
 7   companyName            1476 non-null   object 
 8   companyRating          1067 non-null   float64
 9   listingDate            3902 non-null   object 
 10  expiryDate             3902 non-null   object 
 11  teaser                 3374 non-null   object 
 12  nation                 3902 non-null   object 
 13  state                  3902 non-null   object 
 14  city                   3902 non-null   object 
 15  area

In [3]:
# (number of rows, number of columns) of the loaded listings table.
df.shape

(3902, 52)

In [4]:
# Distinct values of the 'state' column.
# NOTE(review): the output lists both 'Northern Territories' and 'Northern Territory',
# and also non-state values ('UK & Ireland', 'Overseas') -- presumably these need
# normalising before any per-state analysis; confirm.
df['state'].unique()

array(['New South Wales', 'Australian Capital Territory', 'Victoria',
       'Western Australia', 'Queensland', 'Northern Territories',
       'South Australia', 'Tasmania', 'UK & Ireland', 'Overseas',
       'Northern Territory'], dtype=object)

In [None]:
# Full list of column labels; the columns named after languages/tools
# ('R' through 'Fortran') sit at positions 24-48.
df.columns

Index(['jobId', 'jobTitle', 'jobClassification', 'jobSubClassification',
       'advertiserName', 'advertiserId', 'companyId', 'companyName',
       'companyRating', 'listingDate', 'expiryDate', 'teaser', 'nation',
       'state', 'city', 'area', 'suburb', 'workType', 'salary_string',
       'isRightToWorkRequired', 'desktopAdTemplate', 'mobileAdTemplate',
       'companyProfileUrl', 'seekJobListingUrl', 'R', 'Python', 'Matlab',
       'SQL', 'Stata', 'Minitab', 'SPSS', 'Ruby', 'C', 'Scala', 'Tableau',
       'Java', 'Hadoop', 'SAS', 'Julia', 'Knime', 'D3', 'Clojure', 'Haskell',
       'Lisp', 'Golang', 'Spark', 'Javascript', 'F.', 'Fortran', 'first_seen',
       'last_seen', 'recruiter'],
      dtype='object')

In [None]:
# Preview the first rows (head() shows five by default) to eyeball the raw values.
df.head()

## Jobs released per week from January 2019 to January 2022

In [None]:
# Count listings per calendar week.
# The dates are parsed into a temporary one-column frame so that df keeps its
# original index: assigning a 'listingDate' index onto df would make that name
# both an index level and a column label for every later cell.
listing_dates = pd.to_datetime(df['listingDate']).to_frame(name='listingDate')
weekly_count = (
    listing_dates
    .resample('W', on='listingDate')  # 'W' bins are weeks ending on Sunday
    .size()
    .reset_index(name='count')
)

fig, ax = plt.subplots()
# Bars are 6 days wide so neighbouring weekly bins stay visually separated.
ax.bar(weekly_count['listingDate'], weekly_count['count'], width=6, color="#275e8e")
ax.set_title('Job release amount per week')
ax.set_xlabel('Week ending')
ax.set_ylabel('Number of job listings')
ax.tick_params(axis='x', labelrotation=45)
plt.show()

## Number of job vacancies released by each company

In [None]:
# Listings per company, restricted to companies with at least MIN_VACANCIES listings.
# value_counts() skips rows whose companyName is missing, so only listings that
# carry a company name are counted.
MIN_VACANCIES = 10
job_vacancy = df['companyName'].value_counts()
job_vacancy_cleaned = job_vacancy[job_vacancy >= MIN_VACANCIES]

fig, ax = plt.subplots()
ax.bar(job_vacancy_cleaned.index, job_vacancy_cleaned, color="#275e8e")
ax.tick_params(axis='x', labelrotation=90)
# Leave room under the axes for the vertical company-name labels.
fig.subplots_adjust(bottom=0.3)
# The filter is inclusive (>=), so the title says "or more" rather than "more than".
ax.set_title(f'Companies releasing {MIN_VACANCIES} or more vacancies in four years')
ax.set_ylabel('Number of job listings')
plt.show()

## Visualise Most Popular Programming Languages

In [None]:
# Column totals for every language/tool column, keeping only those with a total of at least 50.
# The columns are selected by label ('R' through 'Fortran') rather than by
# position so the slice does not silently shift if columns are added or reordered.
sum_programming = df.loc[:, 'R':'Fortran'].sum()
sum_programming_cleaned = sum_programming[sum_programming >= 50].sort_values()

# The figure size has to be set when the figure is created; calling
# plt.figure(figsize=...) after drawing opens a second, empty figure instead.
fig, ax = plt.subplots(figsize=(12, 12))
ax.barh(
    sum_programming_cleaned.index,
    sum_programming_cleaned,
    # Five-colour palette; matplotlib cycles through it when there are more bars.
    color=["#7bc0ce", "#387a87", "#16525e", "#a4b3b6", "#09353e"],
)
# Write each total at the end of its bar.
for i, v in enumerate(sum_programming_cleaned):
    ax.text(v, i, str(v), va='center')
ax.set_title('Most Common Programming Language for Data Scientist')
ax.set_xlabel('Total across all listings')
plt.show()

## Programming language popularity by year

In [None]:
from datetime import datetime
# Parse the listing timestamp and derive the listing year as a four-digit string.
df['listingDate'] = pd.to_datetime(df['listingDate'], format='%Y-%m-%d %H:%M:%S')
df['listingYear'] = df['listingDate'].dt.strftime('%Y')

# Language/tool columns, selected by label ('R' through 'Fortran').
df_lang = df.loc[:, 'R':'Fortran']
sum_languages = df_lang.sum().sort_values()
# .loc aligns the boolean mask on column labels, so the sorted order of
# sum_languages does not affect which columns are kept (overall total >= 50).
df_lang_cleaned = df_lang.loc[:, sum_languages >= 50]

# One row per listing year, one column per retained language/tool.
grouped = df_lang_cleaned.groupby(df['listingYear']).sum()
years = grouped.index

# One horizontal bar chart per year, bars ordered from smallest to largest total.
for year in years:
    values = grouped.loc[year].sort_values(kind='stable')

    fig, ax = plt.subplots()
    ax.barh(values.index, values.values, color="#275e8e")
    ax.set_title(str(year) + " frequent programming language")
    ax.set_xlabel('Total across listings in that year')
    plt.show()
    # Release the figure so figures do not pile up in memory across iterations.
    plt.close(fig)
