`!pip install datasets`

The datasets (HuggingFace community-driven open-source library of datasets) link:
- https://pypi.org/project/datasets/
- https://huggingface.co/datasets

In [1]:
# pip install datasets

In [2]:
import pandas as pd
from datasets import load_dataset
import matplotlib.pyplot as plt

## Data Loading

In [3]:
# Load the data
dataset = load_dataset('lukebarousse/data_jobs')
dataset

DatasetDict({
    train: Dataset({
        features: ['job_title_short', 'job_title', 'job_location', 'job_via', 'job_schedule_type', 'job_work_from_home', 'search_location', 'job_posted_date', 'job_no_degree_mention', 'job_health_insurance', 'job_country', 'salary_rate', 'salary_year_avg', 'salary_hour_avg', 'company_name', 'job_skills', 'job_type_skills'],
        num_rows: 785741
    })
})

In [4]:
df = dataset['train'].to_pandas()

## Accesing Data

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   job_title_short        785741 non-null  object 
 1   job_title              785740 non-null  object 
 2   job_location           784696 non-null  object 
 3   job_via                785733 non-null  object 
 4   job_schedule_type      773074 non-null  object 
 5   job_work_from_home     785741 non-null  bool   
 6   search_location        785741 non-null  object 
 7   job_posted_date        785741 non-null  object 
 8   job_no_degree_mention  785741 non-null  bool   
 9   job_health_insurance   785741 non-null  bool   
 10  job_country            785692 non-null  object 
 11  salary_rate            33067 non-null   object 
 12  salary_year_avg        22003 non-null   float64
 13  salary_hour_avg        10662 non-null   float64
 14  company_name           785723 non-nu

In [6]:
df.head()

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,United States,,,,Southwest Research Institute,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,Sudan,,,,Kristina Daniel,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."


In [44]:
df.tail()

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
785736,Software Engineer,DevOps Engineer,Singapura,melalui Trabajo.org,Pekerjaan tetap,False,Singapore,2023-03-13 06:16:16,False,False,Singapore,,,,CAREERSTAR INTERNATIONAL PTE. LTD.,"['bash', 'python', 'perl', 'linux', 'unix', 'k...","{'os': ['linux', 'unix'], 'other': ['kubernete..."
785737,Data Analyst,CRM Data Analyst,"Bad Rodach, Jerman",melalui BeBee Deutschland,Pekerjaan tetap,False,Germany,2023-03-12 06:18:18,False,False,Germany,,,,HABA FAMILYGROUP,"['sas', 'sas', 'sql', 'excel']","{'analyst_tools': ['sas', 'excel'], 'programmi..."
785738,Business Analyst,Commercial Analyst - Start Now,Malaysia,melalui Ricebowl,Pekerjaan tetap,False,Malaysia,2023-03-12 06:32:36,False,False,Malaysia,,,,Lendlease Corporation,"['powerpoint', 'excel']","{'analyst_tools': ['powerpoint', 'excel']}"
785739,Data Engineer,"Principal Associate, Data Engineer (Remote-Eli...","Newark, New Jersey, Amerika Serikat",melalui Recruit.net,Pekerjaan tetap,False,Sudan,2023-03-12 06:32:15,False,False,Sudan,,,,Capital One,"['python', 'go', 'nosql', 'sql', 'mongo', 'she...","{'cloud': ['aws', 'snowflake', 'azure', 'redsh..."
785740,Software Engineer,AWS System Analyst,India,melalui Trigyn,Pekerjaan tetap,False,India,2023-03-13 06:16:31,False,False,India,,,,Trigyn,"['aws', 'flow']","{'cloud': ['aws'], 'other': ['flow']}"


In [7]:
# Data Cleanup : convert 'job_posted_date' column to pandas datetime
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])

In [8]:
"""
'<M8[ns]' is the internal representation of the datetime64 type in pandas.
M8 stands for "datetime" (where M indicates a "date/time" type, and 8 is the number of bytes used for the dtype)
[ns] indicates that the time resolution is in nanoseconds
"""

df['job_posted_date'].dtype

dtype('<M8[ns]')

### iloc

`df.iloc[]`: Select rows and columns by position.

In [9]:
# Get 1st row
df.iloc[0]

job_title_short                                       Senior Data Engineer
job_title                Senior Clinical Data Engineer / Principal Clin...
job_location                                                 Watertown, CT
job_via                                                    via Work Nearby
job_schedule_type                                                Full-time
job_work_from_home                                                   False
search_location                                       Texas, United States
job_posted_date                                        2023-06-16 13:44:15
job_no_degree_mention                                                False
job_health_insurance                                                 False
job_country                                                  United States
salary_rate                                                           None
salary_year_avg                                                        NaN
salary_hour_avg          

In [10]:
# Get 1 - 7 rows
df.iloc[0:7]

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,United States,,,,Southwest Research Institute,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,Sudan,,,,Kristina Daniel,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."
5,Data Engineer,GCP Data Engineer,Anywhere,via ZipRecruiter,Contractor and Temp work,True,Georgia,2023-11-07 14:01:59,False,False,United States,,,,smart folks inc,"['python', 'sql', 'gcp']","{'cloud': ['gcp'], 'programming': ['python', '..."
6,Senior Data Engineer,Senior Data Engineer - GCP Cloud,"Dearborn, MI",via LinkedIn,Full-time,False,"Florida, United States",2023-03-27 13:18:18,False,False,United States,,,,"Miracle Software Systems, Inc","['sql', 'python', 'java', 'sql server', 'gcp',...","{'cloud': ['gcp', 'bigquery'], 'databases': ['..."


In [11]:
# Get 1 - 6 rows with other method
df.iloc[:6]

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby,Full-time,False,"Texas, United States",2023-06-16 13:44:15,False,False,United States,,,,Boehringer Ingelheim,,
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México,Full-time,False,Mexico,2023-01-14 13:18:07,False,False,Mexico,,,,Hewlett Packard Enterprise,"['r', 'python', 'sql', 'nosql', 'power bi', 't...","{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn,Full-time,False,Germany,2023-10-10 13:14:55,False,False,Germany,,,,ALPHA Augmented Services,"['python', 'sql', 'c#', 'azure', 'airflow', 'd...","{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com,Full-time,False,"Texas, United States",2023-07-04 13:01:41,True,False,United States,,,,Southwest Research Institute,"['python', 'c++', 'java', 'matlab', 'aws', 'te...","{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs,Full-time,False,Sudan,2023-08-07 14:29:36,False,False,Sudan,,,,Kristina Daniel,"['bash', 'python', 'oracle', 'aws', 'ansible',...","{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."
5,Data Engineer,GCP Data Engineer,Anywhere,via ZipRecruiter,Contractor and Temp work,True,Georgia,2023-11-07 14:01:59,False,False,United States,,,,smart folks inc,"['python', 'sql', 'gcp']","{'cloud': ['gcp'], 'programming': ['python', '..."


In [12]:
# This simply get value from rows 0 and column 1
df.iloc[0,1]

'Senior Clinical Data Engineer / Principal Clinical Data Engineer ...'

In [13]:
# Get rows 2 and 3 with column 0 and 6
df.iloc[[2,3],[0,6]]

Unnamed: 0,job_title_short,search_location
2,Data Engineer,Germany
3,Data Engineer,"Texas, United States"


In [14]:
# Get all rows, but only column 0 to colum 3
df.iloc[:, :4]

Unnamed: 0,job_title_short,job_title,job_location,job_via
0,Senior Data Engineer,Senior Clinical Data Engineer / Principal Clin...,"Watertown, CT",via Work Nearby
1,Data Analyst,Data Analyst,"Guadalajara, Jalisco, Mexico",via BeBee México
2,Data Engineer,"Data Engineer/Scientist/Analyst, Mid or Senior...","Berlin, Germany",via LinkedIn
3,Data Engineer,LEAD ENGINEER - PRINCIPAL ANALYST - PRINCIPAL ...,"San Antonio, TX",via Diversity.com
4,Data Engineer,Data Engineer- Sr Jobs,"Washington, DC",via Clearance Jobs
...,...,...,...,...
785736,Software Engineer,DevOps Engineer,Singapura,melalui Trabajo.org
785737,Data Analyst,CRM Data Analyst,"Bad Rodach, Jerman",melalui BeBee Deutschland
785738,Business Analyst,Commercial Analyst - Start Now,Malaysia,melalui Ricebowl
785739,Data Engineer,"Principal Associate, Data Engineer (Remote-Eli...","Newark, New Jersey, Amerika Serikat",melalui Recruit.net


### Loc

`df.loc[]`: Select rows and columns by position or label.

In [15]:
# If we not feed any label, it similar to df.loc[0] to get rows 0
df.iloc[0]

job_title_short                                       Senior Data Engineer
job_title                Senior Clinical Data Engineer / Principal Clin...
job_location                                                 Watertown, CT
job_via                                                    via Work Nearby
job_schedule_type                                                Full-time
job_work_from_home                                                   False
search_location                                       Texas, United States
job_posted_date                                        2023-06-16 13:44:15
job_no_degree_mention                                                False
job_health_insurance                                                 False
job_country                                                  United States
salary_rate                                                           None
salary_year_avg                                                        NaN
salary_hour_avg          

In [16]:
# get first 7 rows of job_title_short and job_type_skills
df.loc[:7,['job_title_short','job_type_skills']]

Unnamed: 0,job_title_short,job_type_skills
0,Senior Data Engineer,
1,Data Analyst,"{'analyst_tools': ['power bi', 'tableau'], 'pr..."
2,Data Engineer,"{'analyst_tools': ['dax'], 'cloud': ['azure'],..."
3,Data Engineer,"{'cloud': ['aws'], 'libraries': ['tensorflow',..."
4,Data Engineer,"{'cloud': ['oracle', 'aws'], 'other': ['ansibl..."
5,Data Engineer,"{'cloud': ['gcp'], 'programming': ['python', '..."
6,Senior Data Engineer,"{'cloud': ['gcp', 'bigquery'], 'databases': ['..."
7,Data Engineer,"{'cloud': ['gcp', 'azure', 'aws', 'bigquery', ..."


In [17]:
# get rows 40 to 50 of columns job_title_short to job_work_from_home
df.loc[40:55,'job_title_short':'job_work_from_home']

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home
40,Data Scientist,"Data Scientist, Data Strategy and Analytics","Alameda, CA",via Trabajo.org,Full-time,False
41,Data Scientist,Data Scientist I,"Paris, KY",via Jobs By Care,Full-time,False
42,Senior Data Engineer,"Senior Data Engineer(Python)-1563,1637&1633","Chennai, Tamil Nadu, India",via HR Software For Growing Businesses | Fresh...,Full-time,False
43,Senior Data Engineer,Sr SQL Database Engineer with Data Warehouse /...,,via LinkedIn,Contractor,False
44,Data Scientist,Binance Accelerator Program - Data Scientist (...,Singapore,via LinkedIn,Contractor,False
45,Senior Data Analyst,Senior Data Analyst,"Leeds, UK",via My Stateline Jobs,Full-time,False
46,Data Analyst,"Data Analyst (Bangkok Based, relocation provided)","Rome, Metropolitan City of Rome Capital, Italy",via LinkedIn,Full-time,False
47,Senior Data Engineer,Senior Data Engineering,"Kuala Lumpur, Federal Territory of Kuala Lumpu...",via Trabajo.org,Full-time,False
48,Cloud Engineer,Lead Qa Engineer,"Ciudad Nezahualcóyotl, State of Mexico, Mexico",via BeBee,Full-time,False
49,Senior Data Scientist,Data Senior H/F,"Neuilly-Plaisance, France",via BeBee,Full-time,False


## Data Management

In [18]:
# Before cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        22003 non-null   floa

In [19]:
# Get count of NaN Value
df.isna().sum()

job_title_short               0
job_title                     1
job_location               1045
job_via                       8
job_schedule_type         12667
job_work_from_home            0
search_location               0
job_posted_date               0
job_no_degree_mention         0
job_health_insurance          0
job_country                  49
salary_rate              752674
salary_year_avg          763738
salary_hour_avg          775079
company_name                 18
job_skills               117037
job_type_skills          117037
dtype: int64

In [20]:
"""
1. The dropna() method returns a new DataFrame with the missing values removed,
2. how='all': This specifies that only rows where all values are missing (NaN or None) will be dropped. 
If a row contains even one non-missing value, it will be retained.
3. how='any' (default): Any row that contains at least one missing value (NaN or None) will be dropped.
"""
df_cleaned = df.dropna(how='all')
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785740 non-null  object        
 2   job_location           784696 non-null  object        
 3   job_via                785733 non-null  object        
 4   job_schedule_type      773074 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785692 non-null  object        
 11  salary_rate            33067 non-null   object        
 12  salary_year_avg        22003 non-null   floa

In [21]:
# Get count of Nan Value
# Since the dataset not contain full Nan values, the results will be same as before
df_cleaned.isna().sum()

job_title_short               0
job_title                     1
job_location               1045
job_via                       8
job_schedule_type         12667
job_work_from_home            0
search_location               0
job_posted_date               0
job_no_degree_mention         0
job_health_insurance          0
job_country                  49
salary_rate              752674
salary_year_avg          763738
salary_hour_avg          775079
company_name                 18
job_skills               117037
job_type_skills          117037
dtype: int64

### Fill Missing Values

In [22]:
# fillna(): Fill missing values
# fillna(0) : fill missing values with 0
df_filled = df_cleaned.fillna(0)
df_filled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 785741 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785741 non-null  object        
 1   job_title              785741 non-null  object        
 2   job_location           785741 non-null  object        
 3   job_via                785741 non-null  object        
 4   job_schedule_type      785741 non-null  object        
 5   job_work_from_home     785741 non-null  bool          
 6   search_location        785741 non-null  object        
 7   job_posted_date        785741 non-null  datetime64[ns]
 8   job_no_degree_mention  785741 non-null  bool          
 9   job_health_insurance   785741 non-null  bool          
 10  job_country            785741 non-null  object        
 11  salary_rate            785741 non-null  object        
 12  salary_year_avg        785741 non-null  floa

### Drop Duplicate

In [23]:
df_unique = df_filled.drop_duplicates()
df_unique.info()

<class 'pandas.core.frame.DataFrame'>
Index: 785640 entries, 0 to 785740
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   job_title_short        785640 non-null  object        
 1   job_title              785640 non-null  object        
 2   job_location           785640 non-null  object        
 3   job_via                785640 non-null  object        
 4   job_schedule_type      785640 non-null  object        
 5   job_work_from_home     785640 non-null  bool          
 6   search_location        785640 non-null  object        
 7   job_posted_date        785640 non-null  datetime64[ns]
 8   job_no_degree_mention  785640 non-null  bool          
 9   job_health_insurance   785640 non-null  bool          
 10  job_country            785640 non-null  object        
 11  salary_rate            785640 non-null  object        
 12  salary_year_avg        785640 non-null  float64  

In [24]:
print(f'Original data: {len(df)}, Unique data: {len(df_unique)}')
print(f'{len(df) - len(df_unique)} rows removed')

Original data: 785741, Unique data: 785640
101 rows removed


### Fill with Median

In [43]:
df_altered = df.copy()

df_altered.loc[:15,'salary_year_avg']

0    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
11   NaN
12   NaN
13   NaN
14   NaN
15   NaN
Name: salary_year_avg, dtype: float64

In [28]:
# is df and df_altered referenced to the same dataframe?
print(f'Id of df: {id(df)}')
print(f'Id of df_altered: {id(df_altered)}')
print(f'Are df and df_altered same dataframe: {id(df_altered) == id(df)}')

Id of df: 2220628876528
Id of df_altered: 2221531354720
Are df and df_altered same dataframe: False


---
**STEP TO FIND MEDIAN MATHEMATICALLY**

1. Count length of data

2. Sort ascending the data

3.  **For Odd Data:**

   
   - The median is the value at the $\frac{(n + 1)}{2}$th position, where $n$ is the number of data points.

4. **For Even Data:**
   - The median is the average of the values at the $\frac{n}{2} + \frac{(n + 1)}{2} \over{2}$th  positions.

---

**Example:**

Given the data: `6, 7, 8, 3, 5`

1. **Sort the Data in Ascending Order:**
- 3, 5, 6, 7, 8
2. **Count the Number of Data Points:**
- The length of the data is `5`, which is odd.

3. **Find the Median:**
- Since the data is odd, the median is located at the \(\frac{(5 + 1)}{2} = 3\)-rd position.
- The value at the 3rd position is `6`.

**Conclusion:**
- The median of the data `6, 7, 8, 3, 5` is **6**.

In [30]:
# Filling the missing values with the median salary
df_altered['salary_year_avg'] = df_altered['salary_year_avg'].fillna(df_altered['salary_year_avg'].median())
df_altered.loc[:5,'salary_year_avg']

0    115000.0
1    115000.0
2    115000.0
3    115000.0
4    115000.0
5    115000.0
Name: salary_year_avg, dtype: float64

In [41]:
df_altered.tail()

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
785736,Software Engineer,DevOps Engineer,Singapura,melalui Trabajo.org,Pekerjaan tetap,False,Singapore,2023-03-13 06:16:16,False,False,Singapore,,115000.0,,CAREERSTAR INTERNATIONAL PTE. LTD.,"['bash', 'python', 'perl', 'linux', 'unix', 'k...","{'os': ['linux', 'unix'], 'other': ['kubernete..."
785737,Data Analyst,CRM Data Analyst,"Bad Rodach, Jerman",melalui BeBee Deutschland,Pekerjaan tetap,False,Germany,2023-03-12 06:18:18,False,False,Germany,,115000.0,,HABA FAMILYGROUP,"['sas', 'sas', 'sql', 'excel']","{'analyst_tools': ['sas', 'excel'], 'programmi..."
785738,Business Analyst,Commercial Analyst - Start Now,Malaysia,melalui Ricebowl,Pekerjaan tetap,False,Malaysia,2023-03-12 06:32:36,False,False,Malaysia,,115000.0,,Lendlease Corporation,"['powerpoint', 'excel']","{'analyst_tools': ['powerpoint', 'excel']}"
785739,Data Engineer,"Principal Associate, Data Engineer (Remote-Eli...","Newark, New Jersey, Amerika Serikat",melalui Recruit.net,Pekerjaan tetap,False,Sudan,2023-03-12 06:32:15,False,False,Sudan,,115000.0,,Capital One,"['python', 'go', 'nosql', 'sql', 'mongo', 'she...","{'cloud': ['aws', 'snowflake', 'azure', 'redsh..."
785740,Software Engineer,AWS System Analyst,India,melalui Trigyn,Pekerjaan tetap,False,India,2023-03-13 06:16:31,False,False,India,,115000.0,,Trigyn,"['aws', 'flow']","{'cloud': ['aws'], 'other': ['flow']}"


### Random sample

In [31]:
# Get 5 random rows
df_altered.sample(n=5)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
531356,Data Engineer,Data Engineer,"L'Hospitalet de Llobregat, Spain",via BeBee,Full-time,False,Spain,2023-03-10 19:26:42,True,False,Spain,,115000.0,,Setesca,"['jupyter', 'looker']","{'analyst_tools': ['looker'], 'libraries': ['j..."
782392,Data Analyst,Data Analytics,"Santiago, Chile",via BeBee,Full-time,False,Chile,2023-12-12 07:01:36,True,False,Chile,,115000.0,,Santander,,
102571,Data Analyst,IT Architekt​/Business Data Analyst,"Frankfurt, Germany",via Learn4Good.com,Full-time,False,Germany,2023-09-13 00:17:32,True,False,Germany,,115000.0,,Employer details provided on application.,"['python', 'sql', 'azure', 'powerbi']","{'analyst_tools': ['powerbi'], 'cloud': ['azur..."
42619,Data Engineer,Data Engineer,"Bayport, MN",via SonicJobs,Full-time,False,"Texas, United States",2023-02-18 06:08:17,True,False,United States,,115000.0,,Robert Half,,
387023,Business Analyst,Reporting Analyst - Now Hiring,"Evansville, IN",via Snagajob,Full-time,False,"Illinois, United States",2023-07-17 18:03:22,False,False,United States,,115000.0,,Reckitt,"['sap', 'power bi', 'word', 'excel', 'powerpoi...","{'analyst_tools': ['sap', 'power bi', 'word', ..."


In [34]:
# Get 10% random rows
df_altered.sample(frac=0.1, replace=False)

Unnamed: 0,job_title_short,job_title,job_location,job_via,job_schedule_type,job_work_from_home,search_location,job_posted_date,job_no_degree_mention,job_health_insurance,job_country,salary_rate,salary_year_avg,salary_hour_avg,company_name,job_skills,job_type_skills
460514,Data Analyst,Data Analyst Python,"Porto, Portugal",via Empregos Trabajo.org,Full-time,False,Portugal,2023-09-30 14:38:05,True,False,Portugal,,115000.0,,Irium Portugal,"['python', 'sql', 'go', 'power bi']","{'analyst_tools': ['power bi'], 'programming':..."
526171,Data Scientist,"Principal Scientist, Data Science","Rahway, NJ",via The Muse,Full-time,False,"New York, United States",2023-02-07 11:02:27,False,False,United States,,115000.0,,Merck,['go'],{'programming': ['go']}
635212,Software Engineer,PowerBI SME Engineer,"Medellín, Medellin, Antioquia, Colombia",via BeBee,Full-time,False,Colombia,2023-09-06 12:41:53,False,False,Colombia,,115000.0,,Turing,"['gcp', 'aws']","{'cloud': ['gcp', 'aws']}"
49591,Data Scientist,Data Scientist,Chile,via BeBee,Full-time,False,Chile,2023-08-19 06:30:04,False,False,Chile,,115000.0,,MyDNA,,
5109,Data Engineer,Data Engineer,Canada,via BeBee Canada,Full-time,False,Canada,2023-02-08 21:19:50,False,False,Canada,,115000.0,,AbeBooks,"['python', 'scala', 'aws', 'redshift', 'hadoop...","{'analyst_tools': ['word'], 'cloud': ['aws', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302302,Senior Data Scientist,SENIOR DATA SCIENTIST – R&D TIRES (M/F),"Braga, Portugal",via Indeed,Full-time,False,Portugal,2023-11-22 17:24:30,False,False,Portugal,,115000.0,,akapeople,"['python', 'r']","{'programming': ['python', 'r']}"
308492,Data Scientist,Research Assistant Professor in Logistics and ...,Hong Kong,via BeBee 香港,Full-time,False,Hong Kong,2023-02-14 17:39:52,False,False,Hong Kong,,115000.0,,THE HONG KONG POLYTECHNIC UNIVERSITY,['c'],{'programming': ['c']}
230118,Data Engineer,Data Engineering Leader,"São Paulo, State of São Paulo, Brazil",via BeBee,Full-time,False,Brazil,2023-02-14 07:32:33,True,False,Brazil,,115000.0,,GAVB,"['scala', 'sql', 't-sql', 'azure', 'gcp', 'aws...","{'cloud': ['azure', 'gcp', 'aws', 'oracle'], '..."
136794,Senior Data Analyst,Senior Analyst Data Analytics,New Zealand,via BeBee,Full-time,False,New Zealand,2023-11-02 08:34:31,True,False,New Zealand,,115000.0,,ASB Bank,"['python', 'r', 'sql', 'sas', 'sas', 'tableau'...","{'analyst_tools': ['sas', 'tableau', 'power bi..."


## Pivot Table

Syntax: `pivot_table(values='column_to_aggregate', index='row_index', columns='column_index', aggfunc='mean')`

In [35]:
df_altered.groupby('job_title_short').size()

job_title_short
Business Analyst              49063
Cloud Engineer                12331
Data Analyst                 196075
Data Engineer                186241
Data Scientist               172286
Machine Learning Engineer     14080
Senior Data Analyst           29216
Senior Data Engineer          44563
Senior Data Scientist         36957
Software Engineer             44929
dtype: int64

In [36]:
# We can get same result using:
df_altered.pivot_table(index='job_title_short', aggfunc='size')

job_title_short
Business Analyst              49063
Cloud Engineer                12331
Data Analyst                 196075
Data Engineer                186241
Data Scientist               172286
Machine Learning Engineer     14080
Senior Data Analyst           29216
Senior Data Engineer          44563
Senior Data Scientist         36957
Software Engineer             44929
dtype: int64

In [40]:
# Get median salary
df_altered.groupby('job_title_short')['salary_year_avg'].median()

job_title_short
Business Analyst             115000.0
Cloud Engineer               115000.0
Data Analyst                 115000.0
Data Engineer                115000.0
Data Scientist               115000.0
Machine Learning Engineer    115000.0
Senior Data Analyst          115000.0
Senior Data Engineer         115000.0
Senior Data Scientist        115000.0
Software Engineer            115000.0
Name: salary_year_avg, dtype: float64