<h1 align="center"> Analyse des opportunités d'emploi </h1>

<h2 align="center"> Partie 1 - Nettoyage </h2>

## Importation

In [1]:
import pandas as pd
import numpy as np

## Read data

In [2]:
df=pd.read_csv('job.csv', encoding='latin1')
df.head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,"
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,"
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,"
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,"
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi..."


## Info about my data

In [3]:
df.tail()

In [4]:
df.sample(10)

In [5]:
df.isnull().sum()

Company                         1
Job Title                       1
Location                        1
Job Type                        1
Experience level              236
Salary                        189
Requirment of the company       0
Facilities                      0
dtype: int64

In [6]:
# where the company name is null
df[df['Company'].isnull()]

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
797,,,,,,,",,,,,",",,,,"


## Drop the row where the company name is null

In [7]:
df.dropna(subset=['Company'], inplace=True)
df.isnull().sum()

Company                         0
Job Title                       0
Location                        0
Job Type                        0
Experience level              235
Salary                        188
Requirment of the company       0
Facilities                      0
dtype: int64

## Column-wise null values percentage

In [8]:
df.isnull().sum()/df.shape[0]*100

Company                       0.000000
Job Title                     0.000000
Location                      0.000000
Job Type                      0.000000
Experience level              7.350641
Salary                        5.880513
Requirment of the company     0.000000
Facilities                    0.000000
dtype: float64

## Checking the data types and non-null counts in the dataframe

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3197 entries, 0 to 3197
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Company                     3197 non-null   object
 1   Job Title                   3197 non-null   object
 2   Location                    3197 non-null   object
 3   Job Type                    3197 non-null   object
 4   Experience level            2962 non-null   object
 5   Salary                      3009 non-null   object
 6   Requirment of the company   3197 non-null   object
 7   Facilities                  3197 non-null   object
dtypes: object(8)
memory usage: 224.8+ KB


## Fill the missing values in the "Experience level" column with "Not specified"

In [10]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df[...] selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify `df` when copy-on-write is enabled.
df["Experience level"] = df["Experience level"].fillna("Not specified")

## Checking for duplicate rows in the dataset

In [11]:
df.duplicated().sum()

202

In [12]:
df.drop_duplicates(inplace=True)

In [13]:
df[df.duplicated()].sort_values(by=['Job Title']).head(20)

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities


In [14]:
# drop duplicates
df.drop_duplicates(inplace=True)
df[df.duplicated()].sort_values(by=['Job Title']).head()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities


In [15]:
df[df.duplicated(subset=['Company'])].count()

Company                       1889
Job Title                     1889
Location                      1889
Job Type                      1889
Experience level              1889
Salary                        1794
Requirment of the company     1889
Facilities                    1889
dtype: int64

In [16]:
df.drop_duplicates()

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48K+ *,"Computer Science,Data quality,Genetics,Mathema...",",,,,"
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48K+ *,"Agile,Data management,Finance,Security,,",",,,,"
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,Not specified,90K+ *,"Agile,Architecture,AWS,Computer Science,Comput...","Career development,,,,"
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48K+ *,"Engineering,Industrial,Oracle,Power BI,R,R&D",",,,,"
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108K+,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi..."
...,...,...,...,...,...,...,...,...
3190,CCRi,"Application Integration Engineer, Computer Vis...","Chantilly, Virginia, United States",Full Time,Mid-level,113K+,"Agile,Angular,APIs,Architecture,AWS,Azure","401(k) matching,Career development,Flex hours,..."
3191,Publicis Groupe,"Associate Director, Data Science","New York City, United States",Full Time,Mid-level,106K+,"Bayesian,Classification,Clustering,Data analys...","Career development,Health care,,,"
3192,DoorDash,"Senior Software Engineer, Machine Learning - A...","Sunnyvale, CA; San Francisco, CA; New York",Full Time,Senior-level,176K+,"Computer Science,Data analysis,Engineering,Exc...","401(k) matching,Career development,Equity,Insu..."
3193,Western Digital,Data Scientist - New College Graduate,"Biñan, Philippines",Full Time,Entry-level,39K+ *,"APIs,Clustering,Computer Science,Data visualiz...","Career development,,,,"


## Remove ',,,,' character only from cells that contain it in the "Facilities" column

In [17]:
# Unused facility slots appear as trailing commas (e.g. "Career development,,,,").
# Stripping commas from the right removes that padding whatever its length; the
# previous chain of ',,,,' / ',,,' / ',,' replacements left a lone trailing comma
# in place and would have fused neighbouring entries on an interior ',,'.
df['Facilities'] = df['Facilities'].str.rstrip(',')

In [18]:
# Remove ',' character only from cells that contain it in the "Salary" column
# df['Facilities'] = df['Facilities'].apply(lambda x: x.replace('', 'Not-Specified') if isinstance(x, str) else x)

In [19]:
def null_facilities(f):
    """Return 'Not-Specified' for an empty string; any other value is passed through unchanged."""
    return 'Not-Specified' if f == '' else f

df['Facilities'] = df['Facilities'].apply(null_facilities)

In [20]:
df["Job Title"].unique().tolist()

['Clinical Data Analyst',
 'AML/CFT & Data Analyst',
 'Machine Learning Engineer',
 'Application Developer & Data Analyst',
 'Data Engineer Full time (Public Sector) USA',
 'Sr Staff Data Scientist - ATG',
 'Vendor Management and Data Quality Lead',
 'Intern (Business Intelligence Service Support)',
 'Summer 2023 Data Engineering Intern',
 'Principal Cloud Data Engineer (Prisma Access)',
 'Data Scientist (TE-CRG-GLO-2023-19-GRAP)',
 'Data Analyst - Revenue Optimizer',
 'Graduate Power BI Developer',
 'SAP Consultant - Product Data Management',
 'PreMaster Programm - Data Analytics and Visualization',
 'Staff Data Scientist - ATG',
 'Senior Data Analyst - Sales',
 'BI Analyst',
 'Data Scientist | Insights (f/m/d) - GER, UK, NL, PL',
 'Senior Data Analyst (Remote within EMEA)',
 'Senior Data Engineer (Evergreen)',
 'Data Management Scrum Master',
 'Rotational Development Program - Artificial Intelligence and Machine Learning Trainee',
 'Data Engineer Scientist',
 'Data Scientist (Elastic

## Remove the '+' symbol where it is present in the "Salary" column

In [21]:
def check_salary_beyond(salary):
    """Return 1 when `salary` is a string containing '+', otherwise 0."""
    has_plus = isinstance(salary, str) and '+' in salary
    return int(has_plus)

# Add a new column "the salary may go beyond"
df['the salary may go beyond'] = df['Salary'].apply(check_salary_beyond)

In [22]:
# Remove '+' character only from cells that contain it in the "Salary" column
df['Salary'] = df['Salary'].apply(lambda x: x.replace('+', '') if isinstance(x, str) else x)

## Remove the '*' symbol where it is present in the "Salary" column

In [23]:
def check_additional_conditions(salary):
    """Return 1 when `salary` is a string containing '*', otherwise 0."""
    # Non-string cells (e.g. missing values) can never carry the marker.
    if not isinstance(salary, str):
        return 0
    if '*' in salary:
        return 1
    return 0

# Add a new column "there are additional conditions or footnotes" 
df['there are additional conditions or footnotes'] = df['Salary'].apply(check_additional_conditions)

In [24]:
# Remove '*' character only from cells that contain it in the "Salary" column
df['Salary'] = df['Salary'].apply(lambda x: x.replace('*', '') if isinstance(x, str) else x)

## Remove 'K' character only from cells that contain it in the "Salary" column

In [25]:
# Strip the 'K' (thousands) marker from string salaries; non-string cells are left untouched
df['Salary'] = df['Salary'].map(
    lambda value: value.replace('K', '') if isinstance(value, str) else value
)

## Function to convert salary from EUR to USD

In [26]:
def convert_to_usd(salary, currency='EUR', rate=1.1):
    """Convert a salary string tagged with a currency code into a USD amount.

    Parameters
    ----------
    salary : object
        Cell value from the "Salary" column. Only strings containing
        `currency` are converted; every other value is returned unchanged.
    currency : str, default 'EUR'
        Currency code to look for and remove from the string.
    rate : float, default 1.1
        Fixed conversion factor applied to the parsed amount.

    Returns
    -------
    float or object
        ``amount * rate`` for matching strings, otherwise `salary` as given.
        The amount keeps the unit of the input text; no scaling happens here.

    Raises
    ------
    ValueError
        If the text left after removing the code and commas is not a number.
    """
    if not (isinstance(salary, str) and currency in salary):
        return salary
    # Drop the currency code and thousands separators before parsing.
    amount = float(salary.replace(currency, '').replace(',', '').strip())
    return amount * rate

df['Salary'] = df['Salary'].apply(convert_to_usd)

## Function to convert salary from GBP to USD

In [27]:
def convert_to_usd(salary, currency='GBP', rate=1.3):
    """Convert a salary string tagged with a currency code into a USD amount.

    Parameters
    ----------
    salary : object
        Cell value from the "Salary" column. Only strings containing
        `currency` are converted; every other value is returned unchanged.
    currency : str, default 'GBP'
        Currency code to look for and remove from the string.
    rate : float, default 1.3
        Fixed conversion factor applied to the parsed amount.

    Returns
    -------
    float or object
        ``amount * rate`` for matching strings, otherwise `salary` as given.
        The amount keeps the unit of the input text; no scaling happens here.

    Raises
    ------
    ValueError
        If the text left after removing the code and commas is not a number.
    """
    if not (isinstance(salary, str) and currency in salary):
        return salary
    # Drop the currency code and thousands separators before parsing.
    amount = float(salary.replace(currency, '').replace(',', '').strip())
    return amount * rate

df['Salary'] = df['Salary'].apply(convert_to_usd)

## multiply salary by 1000 because we removed 'K'

In [28]:
def multiply_1000(salary):
    """Scale a salary expressed in thousands to a plain amount.

    Strings are parsed to float (any leftover 'K' is removed first) and
    multiplied by 1000. Numeric values are multiplied by 1000 as well: by the
    time this runs, EUR/GBP salaries have already been turned into floats by
    `convert_to_usd` while still being expressed in thousands, so leaving
    numbers untouched kept those rows 1000 times too small. NaN stays NaN
    (NaN * 1000 is NaN) and None is returned unchanged.

    Parameters
    ----------
    salary : str, float or None
        Cell value from the "Salary" column.

    Returns
    -------
    float or None
        The scaled amount, or the missing-value marker as given.

    Raises
    ------
    ValueError
        If a string value cannot be parsed as a number.
    """
    if salary is None:
        return salary
    if isinstance(salary, str):
        return float(salary.replace('K', '')) * 1000
    # Already numeric (converted currency amount or NaN).
    return salary * 1000

df['Salary'] = df['Salary'].apply(multiply_1000)

In [29]:
df.isnull().sum()

Company                                           0
Job Title                                         0
Location                                          0
Job Type                                          0
Experience level                                  0
Salary                                          171
Requirment of the company                         0
Facilities                                        0
the salary may go beyond                          0
there are additional conditions or footnotes      0
dtype: int64

## Handle missing salaries: internships are set to 0, the remaining nulls are flagged as negotiable

In [30]:
# Internships with no published salary are recorded as 0. The isnull() guard
# keeps any salary an internship posting did provide instead of overwriting it.
df.loc[
    df['Job Type'].str.contains('Internship', case=False, na=False) & df['Salary'].isnull(),
    'Salary',
] = 0

In [31]:
def check_negociable(salary):
    """Return 1 when `salary` is missing (as judged by pd.isnull), otherwise 0."""
    if pd.isnull(salary):
        return 1
    return 0

# Add a new column "Negociable" and populate it based on the check
df['Negociable'] = df['Salary'].apply(check_negociable)

In [32]:
# # replace the null values in the Salary column with 0
# df['Salary'].fillna('Negociable', inplace=True)

In [33]:
df.loc[df["Job Type"]=='Internship']

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,the salary may go beyond,there are additional conditions or footnotes,Negociable
7,NielsenIQ,Intern (Business Intelligence Service Support),"Bangkok, Thailand",Internship,Entry-level,0.0,"Business Intelligence,Excel,Genetics,,,",Not-Specified,0,0,0
8,Western Digital,Summer 2023 Data Engineering Intern,"San Jose, CA, United States",Internship,Entry-level,0.0,"Big Data,Computer Science,Engineering,Machine ...","Career development,Competitive pay,Equity,Flex...",0,0,0
170,Wallbox,Analytics Engineer Intern,"Barcelona, Catalonia, Spain",Internship,Entry-level,0.0,"Airflow,BigQuery,CI/CD,Databricks,Data quality...","Career development,Flex hours,Team events",0,0,0
213,Bosch Group,Data Management Internship,"Beograd, Serbia",Internship,Entry-level,0.0,"Computer Science,Data management,Engineering,E...",Flex hours,0,0,0
217,Wallbox,Data Analyst Intern,"Barcelona, Catalonia, Spain",Internship,Entry-level,0.0,"Economics,Engineering,Machine Learning,Mathema...","Career development,Flex hours,Team events",0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
2884,Plantix,Data Analyst Intern,"Indore, India",Internship,Entry-level,0.0,"Computer Science,Data analysis,Data Analytics,...",Not-Specified,0,0,0
3073,Definitive Logic,Data Science Intern,"Arlington, VA",Internship,Entry-level,0.0,"Big Data,Consulting,Consulting firm,Data analy...","Career development,Competitive pay,Flex hours,...",0,0,0
3107,Barbaricum,AI Intern (ChatGPT Specialist),Remote,Internship,Entry-level,0.0,"APIs,ChatGPT,Engineering,GPT,GPT-3,GPT-4",Career development,0,0,0
3134,Junglee Games,ETL and Data Warehouse Testing Intern,"Bengaluru, Karnataka, India",Internship,Entry-level,0.0,"Computer Science,Data quality,Data warehouse,E...",Not-Specified,0,0,0


In [34]:
df.isnull().sum()

Company                                          0
Job Title                                        0
Location                                         0
Job Type                                         0
Experience level                                 0
Salary                                          99
Requirment of the company                        0
Facilities                                       0
the salary may go beyond                         0
there are additional conditions or footnotes     0
Negociable                                       0
dtype: int64

## extract country names from the "Location" column

In [36]:
import pycountry

# Function to check if the word represents a country
def is_country(word):
    """Return True when `word` is a country name known to pycountry.

    The comparison is case-insensitive and ignores surrounding whitespace.
    Besides each record's `name`, the optional `common_name` and
    `official_name` attributes are checked when a record has them, so a
    spelling stored only under one of those attributes is recognised too.

    Parameters
    ----------
    word : object
        Candidate text; anything that is not a non-empty string yields False.

    Returns
    -------
    bool
    """
    if not isinstance(word, str):
        return False
    target = word.strip().lower()
    if not target:
        return False
    for country in pycountry.countries:
        for attribute in ('name', 'common_name', 'official_name'):
            # Not every record defines the optional attributes.
            candidate = getattr(country, attribute, None)
            if isinstance(candidate, str) and candidate.lower() == target:
                return True
    return False

# Function to extract country names from the "Location" column
def extract_country(location):
    """Return the country named in a comma-separated location string.

    The parts are examined from right to left, so the last part that is a
    country wins. Scanning left to right returned an earlier part whenever a
    state or city shared its name with a country (for example "Georgia" in
    "Atlanta, Georgia, United States").

    Parameters
    ----------
    location : object
        Cell value from the "Location" column.

    Returns
    -------
    str or None
        The matching part with surrounding whitespace removed, or None when
        `location` is not a string or no part is a country.
    """
    if not isinstance(location, str):
        return None
    for part in reversed(location.split(',')):
        candidate = part.strip()
        if is_country(candidate):
            return candidate
    return None

# Add a new column "country" with the extracted country names
df['country'] = df['Location'].apply(extract_country)

In [37]:
# Split "Location" into "city" and "country" columns
# df[['country']] = df['Location'].str.split(', ', n=1, expand=True)

In [38]:
# Add a new column with the result of the code df["Location"].str.lower().str.contains("remote", na=False)
df['is_remote'] = df["Location"].str.lower().str.contains("remote", na=False).astype(int)

In [39]:
df

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,the salary may go beyond,there are additional conditions or footnotes,Negociable,country,is_remote
0,SGS,Clinical Data Analyst,"Richardson, TX, United States",Full Time,Entry-level,48000.0,"Computer Science,Data quality,Genetics,Mathema...",Not-Specified,1,1,0,United States,0
1,Ocorian,AML/CFT & Data Analyst,"Ebène, Mauritius",Full Time,Entry-level,48000.0,"Agile,Data management,Finance,Security,,",Not-Specified,1,1,0,Mauritius,0
2,Cricut,Machine Learning Engineer,"South Jordan, UT, United States",Full Time,Not specified,90000.0,"Agile,Architecture,AWS,Computer Science,Comput...",Career development,1,1,0,United States,0
3,Bosch Group,Application Developer & Data Analyst,"Nonantola, Italy",Full Time,Entry-level,48000.0,"Engineering,Industrial,Oracle,Power BI,R,R&D",Not-Specified,1,1,0,Italy,0
4,Publicis Groupe,Data Engineer Full time (Public Sector) USA,"Arlington, VA, United States",Full Time,Mid-level,108000.0,"AWS,Azure,Computer Science,Consulting,Dataflow...","Flex hours,Flex vacation,Parental leave,Unlimi...",1,0,0,United States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,CCRi,"Application Integration Engineer, Computer Vis...","Chantilly, Virginia, United States",Full Time,Mid-level,113000.0,"Agile,Angular,APIs,Architecture,AWS,Azure","401(k) matching,Career development,Flex hours,...",1,0,0,United States,0
3191,Publicis Groupe,"Associate Director, Data Science","New York City, United States",Full Time,Mid-level,106000.0,"Bayesian,Classification,Clustering,Data analys...","Career development,Health care",1,0,0,United States,0
3192,DoorDash,"Senior Software Engineer, Machine Learning - A...","Sunnyvale, CA; San Francisco, CA; New York",Full Time,Senior-level,176000.0,"Computer Science,Data analysis,Engineering,Exc...","401(k) matching,Career development,Equity,Insu...",1,0,0,,0
3193,Western Digital,Data Scientist - New College Graduate,"Biñan, Philippines",Full Time,Entry-level,39000.0,"APIs,Clustering,Computer Science,Data visualiz...",Career development,1,1,0,Philippines,0


In [40]:
df.loc[df['is_remote'] == True, 'country'] = 'Remote'

In [41]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df[...] selection: that chained in-place form is deprecated in pandas 2.x and
# does not modify `df` when copy-on-write is enabled.
df["country"] = df["country"].fillna("Other")

In [42]:
df.isnull().sum()

Company                                          0
Job Title                                        0
Location                                         0
Job Type                                         0
Experience level                                 0
Salary                                          99
Requirment of the company                        0
Facilities                                       0
the salary may go beyond                         0
there are additional conditions or footnotes     0
Negociable                                       0
country                                          0
is_remote                                        0
dtype: int64

## Traduire les "Job Title" en anglais:

In [43]:
from googletrans import Translator

In [44]:
# Create an instance of the Translator class
translator = Translator()

In [45]:
# Define a function to translate a text using the Googletrans library
def translate_text(text):
    """Translate `text` to English with the module-level `translator`.

    Parameters
    ----------
    text : object
        Value to translate (a job title in this notebook).

    Returns
    -------
    object
        The `.text` of the translation result, or `text` itself when the
        translation fails for any reason.
    """
    try:
        return translator.translate(text, dest='en').text
    except Exception:
        # Keep the original value on failure. Returning a fixed
        # "Translation Error" string overwrote the job title and made the
        # row useless for the keyword categorisation that follows.
        # `except Exception` (not a bare except) lets KeyboardInterrupt
        # stop a long run.
        return text

In [46]:
df['Job Title'] = df['Job Title'].apply(translate_text)  

In [None]:
df.isnull().sum()

In [47]:
df[df.duplicated(subset=['Company'])]

Unnamed: 0,Company,Job Title,Location,Job Type,Experience level,Salary,Requirment of the company,Facilities,the salary may go beyond,there are additional conditions or footnotes,Negociable,country,is_remote
11,NielsenIQ,Data Analyst - Revenue Optimizer,"Toronto, ON, Canada",Full Time,Not specified,80000.0,"Business Analytics,Business Intelligence,Data ...","Career development,Startup environment",1,1,0,Canada,0
13,Bosch Group,SAP Consultant - Product Data Management,"Braga, Portugal",Full Time,Senior-level,62000.0,"Data management,Engineering,R,Spark,,",Flex hours,1,1,0,Portugal,0
14,Bosch Group,PreMaster Programm - Data Analytics and Visual...,"Gerlingen, Germany",Full Time,Entry-level,39000.0,"BigQuery,Data Analytics,LLMs,Pandas,Python,",Team events,1,1,0,Germany,0
15,ServiceNow,Staff Data Scientist - ATG,"Kirkland, Washington, United States",Full Time,Senior-level,159000.0,"Computer Science,Deep Learning,Industrial,Mach...","401(k) matching,Career development,Competitive...",1,0,0,United States,0
23,Bosch Group,Rotational Development Program - Artificial In...,"Plymouth, MI, United States",Full Time,Entry-level,44000.0,"Airflow,APIs,Architecture,Azure,Clustering,Com...","Career development,Team events",1,1,0,United States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,CCRi,"Application Integration Engineer, Computer Vis...","Chantilly, Virginia, United States",Full Time,Mid-level,113000.0,"Agile,Angular,APIs,Architecture,AWS,Azure","401(k) matching,Career development,Flex hours,...",1,0,0,United States,0
3191,Publicis Groupe,"Associate Director, Data Science","New York City, United States",Full Time,Mid-level,106000.0,"Bayesian,Classification,Clustering,Data analys...","Career development,Health care",1,0,0,United States,0
3192,DoorDash,"Senior Software Engineer, Machine Learning - A...","Sunnyvale, CA; San Francisco, CA; New York",Full Time,Senior-level,176000.0,"Computer Science,Data analysis,Engineering,Exc...","401(k) matching,Career development,Equity,Insu...",1,0,0,Other,0
3193,Western Digital,Data Scientist - New College Graduate,"Biñan, Philippines",Full Time,Entry-level,39000.0,"APIs,Clustering,Computer Science,Data visualiz...",Career development,1,1,0,Philippines,0


In [None]:
#  'Data Science': ['data science', 'science', 'data', 'manager', 'science manager', 'director', 'science intern', 'of', 'director data', 'intern', 'manager data', 'consultant', 'of data', 'lead', 'senior', 'head of', 'head', 'science lead', 'senior manager', 'scientist data'],
#         'Artificial Intelligence': ['artificial intelligence', 'artificial', 'intelligence', 'director artificial', 'intelligence machine', 'and machine', 'intelligence and', 'machine learning', 'machine', 'learning', 'director', 'lead', 'lead kpmg', 'kpmg', 'kpmg futures', 'futures', 'intelligence lead', 'and', 'engineer', 'hardware architect'],
#         'Big Data': ['big data', 'big', 'engineer', 'data', 'data engineer', 'senior big', 'engineer big', 'senior', 'architect big', 'sr big', 'hadoop', 'devops', 'solutions', 'sr', 'architect', 'software engineer', 'solutions architect', 'specialist', 'software', 'data machine']
  

In [48]:
def detect_category_with_spacy(job_title):
    """Classify a job title as 'AI', 'Data Science', 'Big data' or 'Other'.

    Each category owns a list of keywords. The category whose keywords occur
    most often in the title wins; on a tie the category listed first is kept,
    and 'Other' is returned when nothing matches. Despite the function name,
    the matching is plain substring search and no spaCy model is used.

    Keywords are lowercased before being compared with the lowercased title.
    Previously the mixed-case entries ('Deep Learning', 'Business
    Intelligence', 'Data Analyst', ...) could never match.

    Parameters
    ----------
    job_title : object
        Job title text. Non-string values (e.g. NaN) are classified as 'Other'.

    Returns
    -------
    str
        One of 'AI', 'Data Science', 'Big data', 'Other'.
    """
    # Define your predefined categories and their corresponding keywords/entities
    categories = {
        'AI' : ['ai','machine', 'kpmg futures', 'futures', 'intelligence lead', 'hardware architect', 'learning','director artificial', 'intelligence machine', 'and machine' ,'artificial','Neural Nets','Deep Learning','ml','Machine Learning','Artificial Intelligence'],
        'Data Science' : ['BI Analyst','manager','scientist data','science lead','consultant','director data','science intern','science manager','director','data','Data Developer','Data Architect','scientist','scient','science','Data Analyst','Data Engineer','analytics','Data Manager','Business Intelligence'],
        'Big data' : ['big data','big','data machine','bi','hadoop', 'devops', 'solutions', 'sr', 'architect', 'software engineer', 'solutions architect', 'data engineer', 'senior big', 'engineer big', 'senior', 'architect big', 'sr big']
    }
    if not isinstance(job_title, str):
        return 'Other'
    job_title_lower = job_title.lower()

    best_category = 'Other'
    best_count = 0
    for category, keywords in categories.items():
        # Substring test: short keywords such as 'ai' or 'bi' also match
        # inside longer words (e.g. 'trainee').
        count = sum(1 for keyword in keywords if keyword.lower() in job_title_lower)
        # Strictly greater, so the earliest category keeps a tie.
        if count > best_count:
            best_category = category
            best_count = count
    return best_category

In [50]:
# Detect the category of each job title
df['Job_category'] = df['Job Title'].apply(detect_category_with_spacy)

In [None]:
df.loc[df["country"]=='Other'].sample(20)

In [None]:
# number of jobs in each category
df['Job_category'].value_counts()

In [None]:
# number of jobs in each category
df['country'].value_counts()

In [51]:
df.isnull().sum()

Company                                          0
Job Title                                        0
Location                                         0
Job Type                                         0
Experience level                                 0
Salary                                          99
Requirment of the company                        0
Facilities                                       0
the salary may go beyond                         0
there are additional conditions or footnotes     0
Negociable                                       0
country                                          0
is_remote                                        0
Job_category                                     0
dtype: int64

In [52]:
# Write the data to the CSV file
df.to_csv('job_clean.csv', index=False)

In [None]:
# Save the updated DataFrame to an Excel file
df.to_excel('job_cleaned.xlsx', index=False)