# Data engineer skills Project

The objective of this project is to highlight the most in-demand skills for data engineer roles in France.

How it is done:
1. Extract job postings for data engineer roles from a personal database.
2. Clean the job descriptions and look for the most in-demand skills.
3. Summarize the results visually.

# Imports

In [58]:
import os
import pandas as pd
import logging
from datetime import datetime
from sqlalchemy import create_engine, Table, MetaData, text
import re
import regex as re

## Configuration

In [59]:
# Logging: messages at INFO level and above go to a dated file under LOG_DIR.
LOG_DIR = "logs"
os.makedirs(LOG_DIR, exist_ok=True)

# The date (YYYY-MM-DD) in the file name is evaluated once, when this cell runs.
_log_file = os.path.join(LOG_DIR, f"pipeline_{datetime.now().strftime('%Y-%m-%d')}.log")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
    filename=_log_file,
)

In [60]:
##################  VARIABLES  ##################
# PostgreSQL connection settings. Each value is read from the environment
# variable of the same name and falls back to the literal default when unset.
# NOTE(review): the fallback user/password are hard-coded here — confirm they
# are only meant for a local development database.
DB_NAME = os.getenv("DB_NAME", "jobsdb")
DB_USER = os.getenv("DB_USER", "jobsuser")
DB_PASS = os.getenv("DB_PASS", "jobspass")
DB_HOST = os.getenv("DB_HOST", "localhost")
DB_PORT = os.getenv("DB_PORT", "5432")

# Name of the table holding the job postings
DB_TABLE_NAME = "offres_table"

## Extract data from database

In [61]:
### Connect to database
def export_from_database(engine, table_name):
    """Load the 'data engineer' job postings from the database.

    Args:
        engine: SQLAlchemy engine used to open the connection.
        table_name: Name of the table to query. It is interpolated directly
            into the SQL text, so it must come from trusted configuration,
            never from user input.

    Returns:
        pandas.DataFrame with the columns ``id``, ``source``, ``recherche``,
        ``titre``, ``description``, ``departement`` and ``date_publication``.
        If the database call fails, the error is logged and an empty
        DataFrame with the same columns is returned, so callers can keep
        using DataFrame methods on the result.
    """
    # Function-scope import: the file's top-level imports only bring in
    # create_engine, Table, MetaData and text from sqlalchemy.
    from sqlalchemy.exc import SQLAlchemyError

    columns = [
        "id",
        "source",
        "recherche",
        "titre",
        "description",
        "departement",
        "date_publication",
    ]

    logging.info("Connect to database.")
    query = text(f"""
            SELECT     
                   id, 
                   source,
                   recherche,
                   titre,
                   description,
                   departement,
                   date_publication               
            FROM {table_name}
            WHERE recherche = 'data engineer'
        """)

    try:
        with engine.connect() as conn:
            rows = conn.execute(query).fetchall()
    except SQLAlchemyError as e:
        # The previous handler named requests.RequestException, which is not
        # imported in this file and is unrelated to database access.
        logging.error(f"Error database export: {e}")
        return pd.DataFrame(columns=columns)

    return pd.DataFrame(rows, columns=columns)

In [62]:
# Database connection: build the PostgreSQL (psycopg2 driver) URL from the
# DB_* settings, then create the engine.
db_url = f"postgresql+psycopg2://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}"
engine = create_engine(db_url)

# Extract the postings and display a preview.
# NOTE(review): df is overwritten with the head() result (first 5 rows by
# default), so every later cell only sees those rows — confirm this is
# intended and not a leftover from debugging.
df = export_from_database(engine, DB_TABLE_NAME).head()
display(df)

Unnamed: 0,id,source,recherche,titre,description,departement,date_publication
0,202FHJH,France Travail,data engineer,Knowledge Manager IA (H/F),CDI - Consultant Knowledge Manager IA (H/F)\n\...,91,2026-01-06
1,200MWYF,France Travail,data engineer,Ingénieur Gestion des Données Techniques (H/F),Rattaché au Responsable de Groupe Nouveaux Pro...,68,2025-11-19
2,7407914,France Travail,data engineer,Senior Consultant/Manager – Data Architect et/...,Description de l'entrepriseRejoignez un cabine...,92,2026-01-05
3,4882600908,Adzuna,data engineer,Data Engineer (H/F),Missions : Mise en place et la configuration d...,47,2024-10-01
4,8033210,France Travail,data engineer,Chef de projet DATA H/F,Mission principale : piloter la conception et ...,34,2026-01-19


# Clean and normalize job description

In [63]:
def clean_text(text):
    """Normalize a raw job description into lowercase, space-separated text.

    Steps, in order:
      1. Lowercase everything.
      2. Replace each HTML-like tag (``<...>``) with a space; the text
         between an opening and a closing tag is kept.
      3. Replace every character that is not a Unicode letter (so accented
         French letters are kept), an ASCII digit, ``+``, ``.``, ``/`` or a
         space with a space.
      4. Collapse runs of whitespace into one space and strip both ends.

    Args:
        text: The description to clean.

    Returns:
        The cleaned string. If ``text`` is not a string, the problem is
        logged and an empty string is returned.
    """
    # Logged once per call, i.e. once per row when used with Series.apply.
    logging.info("Clean and normalize job description.")

    if not isinstance(text, str):
        logging.error(
            "Error cleaning and normalizing process: "
            f"expected str, got {type(text).__name__}"
        )
        return ""

    text = text.lower()

    # Non-greedy match so each tag is replaced on its own.
    text = re.sub(r"<.*?>", " ", text)     # HTML

    # str.isalpha() is true exactly for Unicode "Letter" categories, which is
    # what the former \p{L} class expressed. Using it keeps this step working
    # whether `re` is the standard module or the third-party `regex` module.
    kept = "0123456789+./ "
    text = "".join(ch if ch.isalpha() or ch in kept else " " for ch in text)

    return re.sub(r"\s+", " ", text).strip()

# Cast every description to str, then store its cleaned form in a new column.
descriptions_as_text = df["description"].astype(str)
df["clean_description"] = descriptions_as_text.apply(clean_text)