In [1]:
!pip3 install geopandas
!pip3 install geopy
!pip3 install CurrencyConverter 



In [2]:
# Imports

# Geo Tools
import geopandas as gpd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

# NLP
import re
from functools import partial
from transformers import pipeline
from currency_converter import CurrencyConverter

# Data Manipulation & Visualisation
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statistics
import os

# Own modules
from dataExtraction import extract

## Import Data

In [3]:
df = pd.read_csv('data.csv')
print(df.shape)
df.head(10)

(11311, 8)


Unnamed: 0,title,company_name,location,via,description,job_highlights,detected_extensions,job_id
0,Junior Data Scientist,ING,Amsterdam,ING Careers,As the data driven mindset is more and more em...,['As the data driven mindset is more and more ...,"{'posted_at': '6 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJKdW5pb3IgRGF0YSBTY2llbnRpc3...
1,"JUNIOR DATA SCIENTIST - Dubai, UAE",Cobblestone Energy,Utrecht,LinkedIn,"Location: Dubai, UAE (We provide visa sponsors...","[""Location: Dubai, UAE (We provide visa sponso...","{'posted_at': '4 hours ago', 'schedule_type': ...",eyJqb2JfdGl0bGUiOiJKVU5JT1IgREFUQSBTQ0lFTlRJU1...
2,Data Scientist Mobiliteit,TNO,The Hague,TNO,Halen we in Nederland de klimaatdoelen op het ...,['Halen we in Nederland de klimaatdoelen op he...,"{'posted_at': '5 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBNb2JpbG...
3,Data Scientist Real Estate for Catella Investm...,Catella Investment Management Benelux,Maastricht,Limburgvac,As a Data Scientist in the Research & Investme...,['As a Data Scientist in the Research & Invest...,"{'posted_at': '20 hours ago', 'schedule_type':...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBSZWFsIE...
4,Data Scientist,Effectory,Amsterdam,Effectory Jobs,Improving the working lives of millions of peo...,['Improving the working lives of millions of p...,{'schedule_type': 'Full–time'},eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCIsImh0aW...
5,Data Scientist,Adyen,Amsterdam,Nationale Vacaturebank,Functieomschrijving Data Analytics Amsterdam T...,"[""Functieomschrijving\n\nData Analytics Amster...","{'posted_at': '17 hours ago', 'schedule_type':...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCIsImh0aW...
6,Data Scientist bij Transavia,Transavia,Schiphol,Vacatures - Transa,Wij zoeken jou als Data Scientist Voor ons Str...,['Wij zoeken jou als Data Scientist\n\nVoor on...,{'schedule_type': 'Full–time'},eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBiaWogVH...
7,Data Science Lead - Amsterdam,Bynder,Amsterdam,Careers At Bynder,Bynder goes far beyond managing digital assets...,['Bynder goes far beyond managing digital asse...,"{'posted_at': '2 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVuY2UgTGVhZCAtIE...
8,"LEAD DATA SCIENTIST - Dubai, UAE",Cobblestone Energy,Rotterdam,LinkedIn,Employment type: Full-time & Permanent Reports...,"[""Employment type: Full-time & Permanent\n\nRe...","{'posted_at': '1 day ago', 'schedule_type': 'F...",eyJqb2JfdGl0bGUiOiJMRUFEIERBVEEgU0NJRU5USVNUIC...
9,Data Science and Artificial Intelligence Fello...,Wageningen University & Research,Wageningen,AcademicTransfer,Are you a computer scientist with a PhD degree...,['Are you a computer scientist with a PhD degr...,{'posted_at': '2 days ago'},eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVuY2UgYW5kIEFydG...


Basic string clean-up, by removing new line characters, unneccessary quotation marks, and brackets that may effect further processing.

In [4]:
df['job_highlights'] = df['job_highlights'].replace(r'\n\n|\n•|\n|\\n|\\n•|•', '', regex=True)  # Remove new line char
df['job_highlights'] = df['job_highlights'].str.slice(2, -2)  # Remove [] and additional "" marks

## Salary Extraction

The resulting dataset, does not have a specified field for salary information, as this is not legally required information globally.

Salary information, can be found spread amongst three columns: *title*, *description*, and *highlights*. 

There exist three possibilities for each listing, based on the salary informations search space:

1. The job listing will contain no salary information.
2. The job listing will contain an exact salary specification.
3. The job listing will contain a salary range. 

**Step 1: NLP salary extraction** \
Due to the unstructured data format, Natural Language Processing is the optimal tool to perform salary extraction from the defined text search space. 

[BERT](https://huggingface.co/docs/transformers/model_doc/bert) (Bidirectional Encoder Representations from Transformers) is a pre-trained Natural Language model trained on Masked Lanuage Modelling (predict missing words based on context of the surrounding words) and Next Sentence Prediction (NSP). BERT offers a [Question-Answer model](https://huggingface.co/distilbert-base-cased-distilled-squad#technical-specifications), whereby an answer to a posed question is extracted from provided text. This was utilized for the task of salary extraction, due to its contextual capabilities.

The Question posed is "What is the salary or salary-range for the job". The provided text from which to extrapolate the data was the combined three columns. The returned answer is accepted if a score of 70% accuracy is provided. If no salary is determined, the salary is determined "Not available".

**Note: The below cell takes approximately 2 hours to run to completion. Instead read in the already extracted salary results (cell as the start of *Country Extraction*)**

In [7]:
def extract_salary():
    qa_model = pipeline("question-answering", model='distilbert-base-cased-distilled-squad') # Define the model 
    question = "What is the salary or salary range for the job?"  # Define the quesiton to be answered
    # Map question-answer model accross dataframe
    df['salary'] = df.apply(lambda x: "Not available" if qa_model(question=question, 
                                                                           context=(x['job_highlights'] + 
                                                                                    x['title'] + 
                                                                                    x['description']))['score'] < 0.3
                                                      else qa_model(question=question, 
                                                                              context=(x['job_highlights'] + 
                                                                                       x['title'] + 
                                                                                       x['description']))['answer'], axis=1)
    return df
    
df = extract_salary()

Remove any dots or commas from the extracted salary string. 

**Note:** The returned salary, is still within an unstructured string format.

In [9]:
df['salary'] = df['salary'].str.replace(',', '', regex=True)  # Replace comma point in numerical values
df['salary'] = df['salary'].str.replace('.', '', regex=True)  # Replace dot point in numerical values

**Step 2: Isolate salary or generate from range**
For each extracted salary from the listing, the previously discussed three possibilities are handled. 

1. No salary information is provided. No further steps for this listing.
2. The exactly numerical value is detected (single numerical value) through regex matching, and extracted into the *salary* column.
3. A salary range is detected (two numerical values) through regex matching. The salary range is extracted into the *salary_range* column, whereas the median of the upper and lower bound is calculated and placed  in the *salary* column as an approximation.

In [10]:
def find_specified_salary(x):    
    values = re.findall(r'\d+', str(x))  # Identify numerical values
    salary = 'Not available'
    salary_range = 'Not available'
    if len(values) == 1:  # Single numerical value, not a range
        salary = int(values[0]) 
        
    elif len(values) == 2:  # Two numerical values indicate a range
        min_salary = int(values[0])  # Min of salary range
        max_salary = int(values[1])  # Max of salary range
        salary = statistics.median([min_salary, max_salary])  # Calculate range median
        salary_range = str(min_salary) + "-" + str(max_salary)  # Format range
        
    return pd.Series([salary, salary_range])


df[['salary', 'salary_range']] = df['salary'].apply(find_specified_salary)

In [11]:
df.to_csv('data_salaries.csv')

In [12]:
test = df[df.salary != 'Not available']
print("Number of salaries extracted: ", len(test))

Number of salaries extracted:  2157


## Country Extraction
Each job listing is accompanied by a specified location, in the form of the city in which the company is listed.

Geographic information is an essential data feature, allowing for the undertsnading of the global distribution of scraped jobs, a stable indication of currency, and coordinates.  

A geo-location library 'GeoPy' was utilized in order to generate API calls to Nominatim (Backend of www.openstreetmap.com), in order to geolocate the provided location, returning country, city, and coordinates. 

Nominatim enforces a API limit, of approximately 2500 requests per day. The accumulated datset size is approximately 8000 job listings.

In [5]:
df = pd.read_csv('data_salaries.csv')

In [6]:
df['location'].value_counts()

United States             622
New York, NY, USA         561
San Francisco, CA, USA    317
Anywhere                  249
Chicago, IL, USA          224
                         ... 
Würzburg                    1
Hagen                       1
Jüchen                      1
Künzelsau                   1
Titusville, PA, USA         1
Name: location, Length: 928, dtype: int64

There exist approximately 920 unique locations. 

Due the high quantity of location repetition, caching can be utilized to store a location's features once collected, allowing for re-use. This reduces the API requests to the number of unique locations, a substantial reduction. 

### Location Cache
The below code generates a cache based on the number of unique locations within the scraped job listings. 

**Note: This should only be run of additional jobs are scraped. The GeoPy API has a daily rate limit of 2500 requests per day.**

In [25]:
locations = df.location.unique()  # Determine unique locations
locations_df = pd.DataFrame(data=locations, columns=['location'])  # Unique locations dataframe

geolocator = Nominatim(user_agent="data_madness")  # Initialize the geolocator to use Nominatim
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)  # Enforce a request delay to respect API

In [26]:
def locator(x):
    if x['location'] == 'Anywhere':   # Job location listed as "Anywhere"
        x['address'] = "Not available"
        x['latitude'] = "Not available"
        x['longitude'] = "Not available"
        return x
        
    location = geolocator.geocode(x['location'], # Perform Geolocation search
                                  language='en', 
                                  exactly_one=True)  
    if location is not None:  # A location was found.
        x['address'] = location.address
        x['latitude'] = location.latitude
        x['longitude'] = location.longitude
    else:  # No location was found
        x['address'] = "Not available"
        x['latitude'] = "Not available"
        x['longitude'] = "Not available"
    return x

locations_df = locations_df.apply(locator, axis=1)
locations_df.head(10)

Unnamed: 0,location,address,latitude,longitude
0,Amsterdam,"Amsterdam, North Holland, Netherlands",52.37308,4.892453
1,Utrecht,"Utrecht, Netherlands",52.080986,5.127684
2,The Hague,"The Hague, South Holland, Netherlands",52.079984,4.311346
3,Maastricht,"Maastricht, Limburg, Netherlands",50.857985,5.696988
4,Schiphol,"Schiphol, Haarlemmermeer, North Holland, Nethe...",52.308039,4.762198
5,Rotterdam,"Rotterdam, South Holland, Netherlands",51.924442,4.47775
6,Wageningen,"Wageningen, Gelderland, Netherlands",51.968582,5.668298
7,Groningen,"Groningen, Netherlands",53.219065,6.568008
8,Netherlands,Netherlands,52.243498,5.634323
9,Dordrecht,"Dordrecht, South Holland, Netherlands",51.795881,4.677935


In [27]:
locations_df.to_csv('locations_cache.csv', index=False)

### Country Extraction
Extract the country, latitude, and longitude from the locations cache, and apply it to the dataframe in order to generate the required information for each listing.

In [7]:
locations_df = pd.read_csv('locations_cache.csv')  # Read in location cache from file
locations_df.set_index('location', inplace=True)  # Set location as index

In [8]:
countries = ['United Kingdom', 'France', 'Italy', 'Spain', 'Belgium',
            'Netherlands', 'Germany', 'Denmark', 'Switzerland', 'Austria',
            'Czech Republic', 'Canada', 'United States', 'South Africa',
            'Singapore', 'United Arab Emirates', 'Brazil', 'India', 
            'Portugal', 'Kenya', 'Argentina', 'Egypt', 'Suriname',
             'Mali', 'Malawi', 'Afghanistan', 'Philippines', 
             'Serbia', 'Not available']  # Country search space based on scraping locations


def find_country(address):  # The geopy API call returns a location string. Extract country from string
    regex_pattern = '|'.join([f'{country}' for country in countries])  # Create regex pattern of countries
    country = re.search(regex_pattern, address)  # Identify country in string
    if country == None: # New country determined. Print result so it can be added to countries
        print(address)
    
    return country.group()  # Retrun country name

def identify_country(x):  # Method extracts all geo-information from cache
    location = x['location']
    if location == 'Anywhere': # Take into account remote work
        x['latitude'] = 'Not available'
        x['longitude'] = 'Not available'
        x['country'] = 'Anywhere'
        return x
    
    x['latitude'] = locations_df.loc[location, 'latitude']
    x['longitude'] = locations_df.loc[location, 'longitude']
    x['country'] = find_country(locations_df.loc[location, 'address'])
    return x

df = df.apply(identify_country, axis=1)

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,company_name,location,via,description,job_highlights,detected_extensions,job_id,salary,salary_range,latitude,longitude,country
0,0,Junior Data Scientist,ING,Amsterdam,ING Careers,As the data driven mindset is more and more em...,As the data driven mindset is more and more em...,"{'posted_at': '6 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJKdW5pb3IgRGF0YSBTY2llbnRpc3...,Not available,Not available,52.3730796,4.8924534,Netherlands
1,1,"JUNIOR DATA SCIENTIST - Dubai, UAE",Cobblestone Energy,Utrecht,LinkedIn,"Location: Dubai, UAE (We provide visa sponsors...","Location: Dubai, UAE (We provide visa sponsors...","{'posted_at': '4 hours ago', 'schedule_type': ...",eyJqb2JfdGl0bGUiOiJKVU5JT1IgREFUQSBTQ0lFTlRJU1...,60000,Not available,52.080985600000005,5.12768396945229,Netherlands
2,2,Data Scientist Mobiliteit,TNO,The Hague,TNO,Halen we in Nederland de klimaatdoelen op het ...,Halen we in Nederland de klimaatdoelen op het ...,"{'posted_at': '5 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBNb2JpbG...,Not available,Not available,52.0799838,4.3113461,Netherlands
3,3,Data Scientist Real Estate for Catella Investm...,Catella Investment Management Benelux,Maastricht,Limburgvac,As a Data Scientist in the Research & Investme...,As a Data Scientist in the Research & Investme...,"{'posted_at': '20 hours ago', 'schedule_type':...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBSZWFsIE...,Not available,Not available,50.85798545,5.6969881818221095,Netherlands
4,4,Data Scientist,Effectory,Amsterdam,Effectory Jobs,Improving the working lives of millions of peo...,Improving the working lives of millions of peo...,{'schedule_type': 'Full–time'},eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCIsImh0aW...,68750.0,47500-90000,52.3730796,4.8924534,Netherlands


In [10]:
df['country'].value_counts()

United States     8536
Netherlands        437
Germany            302
Italy              277
France             254
Spain              250
Anywhere           249
United Kingdom     230
Belgium            198
Switzerland        151
Austria            150
Not available      117
Denmark            109
South Africa        19
India               10
Philippines          7
Afghanistan          5
Suriname             3
Argentina            3
Serbia               2
Mali                 1
Portugal             1
Name: country, dtype: int64

There exists a total of 22 countries in which job postings are positioned, with the majority occuring in the United States. 

## Currency Extraction
Extract the currency based on the extracted country, based on the below dictionary of ISO currencies. Libraries did not offer an expediant solution, hence the hard-coding of values below. Note the ISO format is chosen to enable currency conversion at a later date.

In [11]:
country_currency_mapping = {  # Mapping of country to currency (ISO format)
    'United Kingdom': 'GBP', 
    'France': 'EUR',
    'Italy': 'EUR', 
    'Spain': 'EUR', 
    'Belgium': 'EUR',
    'Netherlands': 'EUR', 
    'Germany': 'EUR', 
    'Denmark': 'DKK', 
    'Switzerland': 'CHF', 
    'Austria': 'EUR',
    'Czech Republic': 'CZK', 
    'Canada': 'CAD', 
    'United States': 'USD', 
    'South Africa': 'ZAR',
    'Singapore': 'SGD', 
    'United Arab Emirates': 'AED', 
    'Brazil': 'BRL', 
    'India': 'INR', 
    'Portugal': 'EUR', 
    'Kenya': 'KES', 
    'Argentina': 'ARS', 
    'Egypt': 'EGP', 
    'Suriname': 'SRD',
    'Mali': 'XOF', 
    'Malawi': 'MWK', 
    'Afghanistan': 'AFN', 
    'Philippines': 'PHP', 
    'Serbia': 'RSD', 
    'Anywhere': 'USD', # Assumed USD due to the high quantity of USA listings
    'Not available': 'Not available'  # No location was determined for job listing
}

def extract_currency(x):
    x['currency'] = country_currency_mapping[x['country']]  # Populate each listings location currency
    return x
    
    
df = df.apply(extract_currency, axis=1)

In [12]:
df.currency.value_counts()

USD              8785
EUR              1869
GBP               230
CHF               151
Not available     117
DKK               109
ZAR                19
INR                10
PHP                 7
AFN                 5
SRD                 3
ARS                 3
RSD                 2
XOF                 1
Name: currency, dtype: int64

There is expected to be a degree of error, such that Jobs may be posted within a country, but offered from international origins, making the country of origin to currency mapping incorrect. However this occurance is a minority, and if extreme outlier occur due to this, the values can be transformed or removed.

In [13]:
df.to_csv('data_countries.csv')

## Currency Conversion
Convert from a multiplitude of currencies into **Euros** to enable fair comparable values, when using salary as a feature during modelling or other processes.

A library CurrencyConverter, enables currency conversion. It requires currency type to be specified in ISO format.

In [14]:
def convert_currencies(x):
    if x['salary'] == 'Not available' or x['currency'] == 'Not available':
        x['euro_value'] = 'Not available' 
        return x
    
    conv = CurrencyConverter()
    x['euro_value'] = conv.convert(x['salary'], x['currency'], 'EUR')  # Perform currency conversion
    return x


df = df.apply(convert_currencies, axis=1)

In [15]:
df.tail()

Unnamed: 0.1,Unnamed: 0,title,company_name,location,via,description,job_highlights,detected_extensions,job_id,salary,salary_range,latitude,longitude,country,currency,euro_value
11306,11306,Data Scientist with Security Clearance,ClearanceJobs,"Bethesda, MD, USA",LinkedIn,Data Scientist Job Category: Science Time Type...,"Must be a U.S. citizen', 'Must currently hold ...","{'posted_at': '26 days ago', 'schedule_type': ...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCB3aXRoIF...,Not available,Not available,38.98127255,-77.12335871396549,United States,USD,Not available
11307,11307,"Data Scientist - Tiktok Ads, Ads Measurement",TikTok,"Mountain View, CA, USA",LinkedIn,Responsibilities TikTok is the leading destina...,3+ years industry experience and advanced degr...,"{'posted_at': '7 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCAtIFRpa3...,224500.0,153000-296000,37.3893889,-122.0832101,United States,USD,207601.257629
11308,11308,RWE Clinical Data Scientist - Sales Solutions,"Tempus Labs, Inc.","New York, NY, USA",LinkedIn,Passionate about precision medicine and advanc...,Education to Masters or PhD level combining qu...,"{'posted_at': '9 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJSV0UgQ2xpbmljYWwgRGF0YSBTY2...,Not available,Not available,40.7127281,-74.0060152,United States,USD,Not available
11309,11309,Healthcare Data Scientist,IQuest Solutions Corporation,"Irving, TX, USA",LinkedIn,Responsibilities Provide analysis to identify...,"5 or more years of relevant work experience', ...",{'schedule_type': 'Full–time'},eyJqb2JfdGl0bGUiOiJIZWFsdGhjYXJlIERhdGEgU2NpZW...,Not available,Not available,32.8295183,-96.9442177,United States,USD,Not available
11310,11310,Geospatial Data Scientist,Climate,"Chicago, IL, USA",LinkedIn,"At Climate, our mission is to use technologies...","MS in Statistics, Data Science, Math, Physics,...","{'posted_at': '11 days ago', 'schedule_type': ...",eyJqb2JfdGl0bGUiOiJHZW9zcGF0aWFsIERhdGEgU2NpZW...,Not available,Not available,41.8755616,-87.6244212,United States,USD,Not available


## Skills Extraction

Identify skills, and tools listed within the job listings in order to determine in-demand skillsets or tools within the Data Science profession. 

A collection of skills (skills_list.txt) associated with Data Science was generated through research, to serve as a checklist, against which job listings were analyzed. 

The result is a One-Hot-Encoded checklist per listing, indicating the skill or tool required by the job. 

In [17]:
# Add empty columns
skills_list = []
with open("notes_and_prototypes/skills_list.txt") as file:
    while (line := file.readline().rstrip()):
        if line == "R":
            skills_list.append(" R ")
        else:
            skills_list.append(line)


# Replace blank spaces with underscores for dataframe columns
columns_list = []
for element in skills_list:
    columns_list.append(element.replace(" ", "_"))

for element in columns_list:
    df[element] = 0

In [18]:
# Find keywords and set columns accordingly

for i, row in df.iterrows():
    for keyword in skills_list:
        if keyword.lower() in df.at[i, "description"].lower():
            col = keyword.replace(" ", "_")
            df.at[i, col] = 1
# Print first results to check
df.head(10)

Unnamed: 0.1,Unnamed: 0,title,company_name,location,via,description,job_highlights,detected_extensions,job_id,salary,...,Tensorflow,Teamwork,Time_series_analysis,Unsupervised_learning,UX_design,Visualization,Web_development,XML,YAML,YARN
0,0,Junior Data Scientist,ING,Amsterdam,ING Careers,As the data driven mindset is more and more em...,As the data driven mindset is more and more em...,"{'posted_at': '6 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJKdW5pb3IgRGF0YSBTY2llbnRpc3...,Not available,...,0,0,0,0,0,1,0,0,0,0
1,1,"JUNIOR DATA SCIENTIST - Dubai, UAE",Cobblestone Energy,Utrecht,LinkedIn,"Location: Dubai, UAE (We provide visa sponsors...","Location: Dubai, UAE (We provide visa sponsors...","{'posted_at': '4 hours ago', 'schedule_type': ...",eyJqb2JfdGl0bGUiOiJKVU5JT1IgREFUQSBTQ0lFTlRJU1...,60000,...,0,0,0,0,0,1,0,0,0,0
2,2,Data Scientist Mobiliteit,TNO,The Hague,TNO,Halen we in Nederland de klimaatdoelen op het ...,Halen we in Nederland de klimaatdoelen op het ...,"{'posted_at': '5 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBNb2JpbG...,Not available,...,0,0,0,0,0,0,0,0,0,0
3,3,Data Scientist Real Estate for Catella Investm...,Catella Investment Management Benelux,Maastricht,Limburgvac,As a Data Scientist in the Research & Investme...,As a Data Scientist in the Research & Investme...,"{'posted_at': '20 hours ago', 'schedule_type':...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBSZWFsIE...,Not available,...,0,0,0,0,0,0,0,0,0,0
4,4,Data Scientist,Effectory,Amsterdam,Effectory Jobs,Improving the working lives of millions of peo...,Improving the working lives of millions of peo...,{'schedule_type': 'Full–time'},eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCIsImh0aW...,68750.0,...,0,0,0,0,0,1,0,0,0,0
5,5,Data Scientist,Adyen,Amsterdam,Nationale Vacaturebank,Functieomschrijving Data Analytics Amsterdam T...,FunctieomschrijvingData Analytics AmsterdamThi...,"{'posted_at': '17 hours ago', 'schedule_type':...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCIsImh0aW...,Not available,...,1,0,0,0,0,0,0,0,0,0
6,6,Data Scientist bij Transavia,Transavia,Schiphol,Vacatures - Transa,Wij zoeken jou als Data Scientist Voor ons Str...,Wij zoeken jou als Data ScientistVoor ons Stra...,{'schedule_type': 'Full–time'},eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVudGlzdCBiaWogVH...,Not available,...,1,0,0,0,0,1,0,0,0,0
7,7,Data Science Lead - Amsterdam,Bynder,Amsterdam,Careers At Bynder,Bynder goes far beyond managing digital assets...,Bynder goes far beyond managing digital assets...,"{'posted_at': '2 days ago', 'schedule_type': '...",eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVuY2UgTGVhZCAtIE...,Not available,...,0,0,0,0,0,1,0,0,0,0
8,8,"LEAD DATA SCIENTIST - Dubai, UAE",Cobblestone Energy,Rotterdam,LinkedIn,Employment type: Full-time & Permanent Reports...,Employment type: Full-time & PermanentReports ...,"{'posted_at': '1 day ago', 'schedule_type': 'F...",eyJqb2JfdGl0bGUiOiJMRUFEIERBVEEgU0NJRU5USVNUIC...,150000,...,0,0,0,0,0,1,0,0,0,0
9,9,Data Science and Artificial Intelligence Fello...,Wageningen University & Research,Wageningen,AcademicTransfer,Are you a computer scientist with a PhD degree...,Are you a computer scientist with a PhD degree...,{'posted_at': '2 days ago'},eyJqb2JfdGl0bGUiOiJEYXRhIFNjaWVuY2UgYW5kIEFydG...,Not available,...,0,0,0,0,0,0,0,0,0,0


## Export Data Frame to new CSV
With the new csv, all necessary information is extracted and will be the new basis to work with.

In [19]:
df.to_csv('data_clean.csv')