In [None]:
!pip install pandas

Dataset: https://www.kaggle.com/datasets/gianinamariapetrascu/gender-pay-gap-europe-2010-2021

In [11]:
import sqlite3
import pandas as pd

## Create and populate database

In [12]:
def create_and_populate_database(csv_file:str, db_name:str):
    """Load a wide-format CSV into a normalized SQLite database.

    The CSV must contain 'Country' and 'Year' columns. Every other column
    except 'GDP' and 'Urban_population' is treated as a sector; each
    non-missing cell becomes one row of CountrySectorData.

    Tables (created only if they do not exist yet):
        Country(country_id, country_name UNIQUE)
        Sector(sector_id, sector_name UNIQUE)
        CountrySectorData(id, country_id, sector_id, year, value)

    Note: countries and sectors are inserted with INSERT OR IGNORE, but the
    measurements are plain INSERTs, so calling this function twice on the
    same database file stores every measurement twice.

    args:
        csv_file: str: the path to the CSV file
        db_name: str: the name of the SQLite database
    """
    df = pd.read_csv(csv_file)
    countries = df['Country'].unique()
    sectors = [col for col in df.columns if col not in ['Country', 'Year','GDP','Urban_population']]

    conn = sqlite3.connect(db_name)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS Country (
                country_id INTEGER PRIMARY KEY,
                country_name TEXT UNIQUE
            );
        """)

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS Sector (
                sector_id INTEGER PRIMARY KEY,
                sector_name TEXT UNIQUE
            );
        """)

        cursor.execute("""
            CREATE TABLE IF NOT EXISTS CountrySectorData (
                id INTEGER PRIMARY KEY,
                country_id INTEGER,
                sector_id INTEGER,
                year INTEGER,
                value REAL,
                FOREIGN KEY (country_id) REFERENCES Country(country_id),
                FOREIGN KEY (sector_id) REFERENCES Sector(sector_id)
            );
        """)

        cursor.executemany(
            "INSERT OR IGNORE INTO Country (country_name) VALUES (?);",
            [(country,) for country in countries],
        )
        cursor.executemany(
            "INSERT OR IGNORE INTO Sector (sector_name) VALUES (?);",
            [(sector,) for sector in sectors],
        )

        # Load the name -> id mappings once instead of issuing a SELECT for
        # every row and every cell of the CSV.
        cursor.execute("SELECT country_name, country_id FROM Country;")
        country_ids = dict(cursor.fetchall())
        cursor.execute("SELECT sector_name, sector_id FROM Sector;")
        sector_ids = dict(cursor.fetchall())

        # Unpivot: one record per (country, sector, year) with a value.
        records = []
        for _, row in df.iterrows():
            country_id = country_ids[row['Country']]
            year = row['Year']
            for sector in sectors:
                value = row[sector]
                if pd.notna(value):  # Skip NaN values
                    records.append((country_id, sector_ids[sector], year, value))

        cursor.executemany("""
            INSERT INTO CountrySectorData (country_id, sector_id, year, value)
            VALUES (?, ?, ?, ?);
        """, records)

        conn.commit()
    finally:
        # Always release the database file, even if reading or inserting
        # failed part-way; uncommitted changes are discarded on close.
        conn.close()
    print(f"Database '{db_name}' has been created and populated successfully.")



In [None]:
# Input CSV and the SQLite file that receives the normalized tables.
csv_file_path = "pay_gap_Europe.csv"
database_name = "pay_gap_data_normalized.db"

# Build the database from the CSV.
create_and_populate_database(csv_file=csv_file_path, db_name=database_name)

## Read from the database

In [14]:
def read_table(database_name:str, request:str, params=())->list:
    """This function runs a SQL request against a SQLite database and returns all rows.
    args:
        database_name: str: the name of the SQLite database
        request: str: the SQL request to execute
        params: sequence or dict: optional values bound to the placeholders
            in `request` (defaults to no parameters)
    returns:
        result: list of tuples: the result of the SQL request
    """
    conn = sqlite3.connect(database_name)
    try:
        # `with conn` only commits (or rolls back on error); it does NOT
        # close the connection, hence the explicit close() below.
        with conn:
            cur = conn.cursor()
            cur.execute(request, params)
            return cur.fetchall()
    finally:
        conn.close()

In [15]:
# SQL request selecting every column of every row in the Country table.
query_table_country = "SELECT * from Country"

In [None]:
# Show the Country table as a list of tuples.
read_table(database_name='pay_gap_data_normalized.db', request=query_table_country)

## Read from SQLite to pandas

In [17]:
def read_table_pandas(database_name:str, query:str, params=None)->pd.DataFrame:
    """This function runs a SQL query against a SQLite database and returns the result as a pandas DataFrame.
    args:
        database_name: str: the name of the SQLite database
        query: str: the SQL query to execute
        params: sequence or dict: optional values bound to the placeholders
            in `query` (defaults to no parameters)
    returns:
        df: pd.DataFrame: the result of the SQL query
    """
    con = sqlite3.connect(database_name)
    try:
        return pd.read_sql_query(query, con, params=params)
    finally:
        # Close the connection even when the query fails.
        con.close()

In [None]:
# Show the Country table as a DataFrame.
read_table_pandas(database_name="pay_gap_data_normalized.db", query=query_table_country)