In [None]:
# pip install seaborn

Note: you may need to restart the kernel to use updated packages.


In [91]:
import pyodbc
import pandas as pd
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from datetime import datetime, timedelta

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [92]:
def get_secret(vault_url="https://kvsynapsedata001.vault.azure.net", secret_name="pw-SRVSQLPoolSQL"):
    """Fetch a secret value from Azure Key Vault.

    Authenticates with ``DefaultAzureCredential`` and reads ``secret_name``
    from the vault at ``vault_url``. Both parameters default to the vault and
    secret this notebook has always used, so ``get_secret()`` with no
    arguments behaves exactly as before.

    Parameters
    ----------
    vault_url : str
        Base URL of the Key Vault to query.
    secret_name : str
        Name of the secret to read.

    Returns
    -------
    The ``value`` attribute of the secret returned by the vault client.
    """
    credential = DefaultAzureCredential()
    secret_client = SecretClient(vault_url=vault_url, credential=credential)
    return secret_client.get_secret(secret_name).value
 
 
# Connection settings for the Synapse SQL pool; the password comes from Key Vault.
server = 'tcp:arriba-synapseworkspace-prod-ae-001.sql.azuresynapse.net,1433'
database = 'arribasqlpool1'
username = 'SRV_SQLPool_SQL'
password = get_secret()

# NOTE(review): TrustServerCertificate=yes makes the driver accept the server
# certificate without validating it -- confirm this is still required.
otheropts = 'Encrypt=yes;TrustServerCertificate=yes;Connection Timeout=30;'

# Brace-quote the password (doubling any closing brace) so that characters such
# as ';', '=' or '}' inside it cannot terminate or corrupt the connection string.
escaped_password = '{' + password.replace('}', '}}') + '}'
connection_string = (
    'DRIVER={ODBC Driver 18 for SQL Server};'
    f'SERVER={server};DATABASE={database};UID={username};PWD={escaped_password};'
    f'{otheropts}'
)

cnxn_t = pyodbc.connect(connection_string)
cursor_t = cnxn_t.cursor()
 
 
 
 

##### Sample Queries

##### AB tables from JRL

In [93]:
# Result name -> SQL statement that produces it.
queries = {
    "df_emp_df": "SELECT * FROM [df].[EmployeeInfo]",
    "df_leave_df": "SELECT * FROM [df].[EmployeeLeaves]",
}

# Run every query over the open connection and collect the frames by name.
dfs = {}
for label, sql in queries.items():
    dfs[label] = pd.read_sql(sql, cnxn_t)

# Write each frame to "<name with every '_df' removed>.csv" in the working
# directory, without the index column.
for name, df in dfs.items():
    csv_filename = "".join([name.replace("_df", ""), ".csv"])
    df.to_csv(csv_filename, index=False)

# Message reads: "All DataFrames have been saved as csv files."
print("所有 DataFrame 已保存为 csv 文件。")


  dfs = {name: pd.read_sql(query, cnxn_t) for name, query in queries.items()}
  dfs = {name: pd.read_sql(query, cnxn_t) for name, query in queries.items()}


所有 DataFrame 已保存为 csv 文件。


In [94]:
# Reload the exported extracts from disk: leave records first, then employee details.
leave_, emp_ = (
    pd.read_csv(csv_path) for csv_path in ("df_leave.csv", "df_emp.csv")
)

In [95]:
# Summarise the employee frame: column names, non-null counts and dtypes.
emp_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1113 entries, 0 to 1112
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   EmployeeNumber     1113 non-null   int64 
 1   FirstName          1113 non-null   object
 2   MiddleName         503 non-null    object
 3   LastName           1113 non-null   object
 4   CommonName         689 non-null    object
 5   PreferredLastName  110 non-null    object
 6   FederatedId        1075 non-null   object
 7   BirthDate          1113 non-null   object
 8   StartDate          1113 non-null   object
 9   OriginalHireDate   1113 non-null   object
 10  TerminationDate    498 non-null    object
 11  Pronouns           1113 non-null   object
 12  GenderIdentity     1014 non-null   object
dtypes: int64(1), object(12)
memory usage: 113.2+ KB


In [96]:
def check_and_set_index(df, column_name):
    """Set ``column_name`` as the index of ``df`` when its values are unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    column_name : hashable
        Label of the column that should become the index.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``column_name`` when no value repeats;
        otherwise the original ``df`` object, unchanged.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.

    Notes
    -----
    Prints either a confirmation message or the repeated values together
    with their occurrence counts.
    """
    column = df[column_name]
    # keep=False flags every member of a duplicate group, not only the repeats.
    duplicates = column[column.duplicated(keep=False)]

    if duplicates.empty:
        df = df.set_index(column_name)
        print(f"No duplicates found. '{column_name}' has been set as index.")
        return df

    # dropna=False: duplicated() treats repeated missing values as duplicates,
    # so they must be counted as well, otherwise the report could be empty.
    dup_counts = column.value_counts(dropna=False)
    dup_filtered = dup_counts[dup_counts > 1]
    print(f"Found duplicates in '{column_name}':")
    print(dup_filtered)
    return df  # leave the frame as it is; no index is set

# Usage: index the employee frame by EmployeeNumber when its values are unique.
# NOTE: once the index has been set, re-running this cell raises KeyError,
# because check_and_set_index looks the column up with df[column_name].
emp_ = check_and_set_index(emp_, 'EmployeeNumber')


No duplicates found. 'EmployeeNumber' has been set as index.


In [97]:
# Display the employee frame.
# NOTE(review): the rendered output contains personal data (names, e-mail
# addresses, birth dates) -- clear it before sharing the notebook.
emp_

Unnamed: 0_level_0,FirstName,MiddleName,LastName,CommonName,PreferredLastName,FederatedId,BirthDate,StartDate,OriginalHireDate,TerminationDate,Pronouns,GenderIdentity
EmployeeNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
80277,Alexandra,,Barnard,Ally,,abarnard@rehabmanagement.com.au,1998-07-24T00:00:00,2021-06-07T00:00:00,2021-06-07T00:00:00,2025-07-11T23:59:00,Alexandra Barnard,Female
80191,Tracy-Lee,,Lucas,Tracy,,tlucas@rehabmanagement.com.au,1980-05-14T00:00:00,2016-08-15T00:00:00,2016-08-15T00:00:00,,Tracy-Lee Lucas,Female
80941,Emy,,Liu,,,eliu@rehabmanagement.com.au,1994-12-09T00:00:00,2025-02-03T00:00:00,2025-02-03T00:00:00,,Emy Liu,Female
80883,Hayley,Louise,Vallance,,,hvallance@rehabmanagement.com.au,1993-05-30T00:00:00,2024-12-02T00:00:00,2024-12-02T00:00:00,,Hayley Vallance,Female
80323,Joanne,Marie,Crerar,Joanne,,JCrerar@rehabmanagement.com.au,1974-08-15T00:00:00,2022-04-04T00:00:00,2022-04-04T00:00:00,,Joanne Crerar,Female
...,...,...,...,...,...,...,...,...,...,...,...,...
81046,Dimity,,Hicks,,,,1900-01-01T00:00:00,2025-04-28T00:00:00,2025-04-28T00:00:00,,Dimity Hicks,
80727,Kethya,Hirushi,Wickramanayake,Keth,,kwickramanayake@rehabmanagement.com.au,2001-02-06T00:00:00,2024-08-05T00:00:00,2024-08-05T00:00:00,2024-09-26T23:59:00,Kethya Wickramanayake,Female
81056,Lin,,Foo,,,lfoo@rehabmanagement.com.au,2002-09-30T00:00:00,2025-04-28T00:00:00,2025-04-28T00:00:00,2025-07-04T23:59:00,Lin Foo,
80085,Hajara,,Diyab,Hajara,,HDiyab@livebig.com.au,1996-03-12T00:00:00,2023-08-21T00:00:00,2023-08-21T00:00:00,2024-12-03T23:59:00,Hajara Diyab,Female


In [98]:
# Keep the rows whose 'Pronouns' value contains "Dominique" or "O'Connell"
# (case-insensitive; missing values count as no match).
# NOTE(review): in this extract the 'Pronouns' column appears to hold the
# employee's full name -- confirm against the source table.
name_pattern = "Dominique|O'Connell"
name_matches = emp_['Pronouns'].str.contains(name_pattern, case=False, na=False)

# reset_index() returns a new frame with the index moved back into a regular
# column; chaining it avoids an in-place modification of a filtered frame.
filtered_emp = emp_[name_matches].reset_index()

In [99]:
# Preview the first rows of the leave extract.
leave_.head()

Unnamed: 0,EmployeeNumber,DateofRequest,TimeStart,TimeEnd,TAFWXRefCode,LeaveType,NetHours,AllDay
0,80949,2025-07-02T12:21:00,2025-07-01T02:30:00,2025-07-01T05:00:00,,AU Sick and Carer's Leave,2.5,
1,60030,2025-06-24T11:01:00,2025-06-27T12:00:00,2025-06-27T17:00:00,60030_ANNUAL_LEAVE_HOURS_202506271200,AU Annual Leave,4.1,
2,80388,2025-06-24T11:02:00,2025-07-03T00:00:00,2025-07-05T00:00:00,80388_ANNUAL_LEAVE_HOURS_202507030000,AU Annual Leave,15.2,True
3,80842,2025-06-26T15:37:00,2025-06-26T09:00:00,2025-06-26T13:00:00,,AU Sick and Carer's Leave,4.0,
4,60077,2025-06-24T11:01:00,2025-06-26T14:00:00,2025-06-26T17:00:00,60077_ANNUAL_LEAVE_HOURS_202506261400,AU Annual Leave,3.0,


In [100]:
# Leave rows of the selected employees, with each employee's name attached.
name_lookup = filtered_emp[['EmployeeNumber', 'Pronouns']]
is_selected = leave_['EmployeeNumber'].isin(filtered_emp['EmployeeNumber'])

leave_filtered = leave_[is_selected].merge(name_lookup, on='EmployeeNumber')
# The looked-up 'Pronouns' value is exposed under the label 'Full Name'.
leave_filtered = leave_filtered.rename(columns={'Pronouns': 'Full Name'})


In [101]:
# Show the leave rows retained for the selected employees.
leave_filtered

Unnamed: 0,EmployeeNumber,DateofRequest,TimeStart,TimeEnd,TAFWXRefCode,LeaveType,NetHours,AllDay,Full Name
0,80832,2025-06-25T12:56:00,2025-06-25T08:30:00,2025-06-25T11:00:00,,AU Sick and Carer's Leave,2.5,,Hannah O'Connell
1,80832,2025-07-01T07:42:00,2025-09-15T00:00:00,2025-09-20T00:00:00,,AU Unpaid Sick and Carer's Leave,38.0,True,Hannah O'Connell
2,80832,2025-07-09T09:39:00,2025-07-07T00:00:00,2025-07-09T00:00:00,,AU Mandatory Training Leave,15.2,True,Hannah O'Connell
3,80832,2025-07-11T09:32:00,2025-11-14T00:00:00,2025-11-22T00:00:00,,AU Annual Leave,45.6,True,Hannah O'Connell
4,80832,2025-07-16T17:08:00,2025-08-08T00:00:00,2025-08-09T00:00:00,,AU You Day,7.6,True,Hannah O'Connell
5,80185,2025-06-24T11:01:00,2025-06-27T00:00:00,2025-06-28T00:00:00,80185_YOU_DAY_202506270000,AU You Day,7.6,True,Dominique Higgins
6,80185,2025-07-21T10:25:00,2025-11-04T00:00:00,2025-11-12T00:00:00,,AU Annual Leave,45.6,True,Dominique Higgins
7,80185,2025-07-21T10:27:00,2025-08-08T00:00:00,2025-08-09T00:00:00,,AU You Day,7.6,True,Dominique Higgins
8,80185,2025-07-21T10:30:00,2025-10-24T00:00:00,2025-10-25T00:00:00,,AU Annual Leave,7.6,True,Dominique Higgins
9,80185,2025-07-23T08:51:00,2025-07-22T13:00:00,2025-07-22T16:30:00,,AU Sick and Carer's Leave,3.5,,Dominique Higgins


In [102]:
# Check column dtypes and non-null counts of the merged frame.
leave_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   EmployeeNumber  10 non-null     int64  
 1   DateofRequest   10 non-null     object 
 2   TimeStart       10 non-null     object 
 3   TimeEnd         10 non-null     object 
 4   TAFWXRefCode    1 non-null      object 
 5   LeaveType       10 non-null     object 
 6   NetHours        10 non-null     float64
 7   AllDay          8 non-null      object 
 8   Full Name       10 non-null     object 
dtypes: float64(1), int64(1), object(7)
memory usage: 800.0+ bytes


In [103]:
# Parse TimeStart into datetimes; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
time_start_parsed = pd.to_datetime(leave_filtered['TimeStart'], errors='coerce')
leave_filtered['TimeStart'] = time_start_parsed

In [104]:
# Display the frame again after the TimeStart conversion.
leave_filtered

Unnamed: 0,EmployeeNumber,DateofRequest,TimeStart,TimeEnd,TAFWXRefCode,LeaveType,NetHours,AllDay,Full Name
0,80832,2025-06-25T12:56:00,2025-06-25 08:30:00,2025-06-25T11:00:00,,AU Sick and Carer's Leave,2.5,,Hannah O'Connell
1,80832,2025-07-01T07:42:00,2025-09-15 00:00:00,2025-09-20T00:00:00,,AU Unpaid Sick and Carer's Leave,38.0,True,Hannah O'Connell
2,80832,2025-07-09T09:39:00,2025-07-07 00:00:00,2025-07-09T00:00:00,,AU Mandatory Training Leave,15.2,True,Hannah O'Connell
3,80832,2025-07-11T09:32:00,2025-11-14 00:00:00,2025-11-22T00:00:00,,AU Annual Leave,45.6,True,Hannah O'Connell
4,80832,2025-07-16T17:08:00,2025-08-08 00:00:00,2025-08-09T00:00:00,,AU You Day,7.6,True,Hannah O'Connell
5,80185,2025-06-24T11:01:00,2025-06-27 00:00:00,2025-06-28T00:00:00,80185_YOU_DAY_202506270000,AU You Day,7.6,True,Dominique Higgins
6,80185,2025-07-21T10:25:00,2025-11-04 00:00:00,2025-11-12T00:00:00,,AU Annual Leave,45.6,True,Dominique Higgins
7,80185,2025-07-21T10:27:00,2025-08-08 00:00:00,2025-08-09T00:00:00,,AU You Day,7.6,True,Dominique Higgins
8,80185,2025-07-21T10:30:00,2025-10-24 00:00:00,2025-10-25T00:00:00,,AU Annual Leave,7.6,True,Dominique Higgins
9,80185,2025-07-23T08:51:00,2025-07-22 13:00:00,2025-07-22T16:30:00,,AU Sick and Carer's Leave,3.5,,Dominique Higgins


In [105]:
def keep_latest(df, id_col, time_col):
    """Return one row per ``id_col`` value: the one with the greatest ``time_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    id_col : label or list of labels
        Column(s) identifying the entity to de-duplicate on.
    time_col : label
        Column used to rank the rows; larger values are considered later.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` ordered by ``time_col`` descending (missing values
        last), keeping only the first row for each ``id_col`` value. The
        original index labels are preserved.
    """
    # A stable sort keeps tied timestamps in their original row order, so the
    # row retained for an id is deterministic.
    ordered = df.sort_values(time_col, ascending=False, kind="mergesort")
    return ordered.drop_duplicates(id_col)

In [106]:
# One row per employee: the leave entry with the latest TimeStart.
latest_leave = keep_latest(leave_filtered, id_col='EmployeeNumber', time_col='TimeStart')

In [107]:
# Show the latest-starting leave entry per employee.
latest_leave

Unnamed: 0,EmployeeNumber,DateofRequest,TimeStart,TimeEnd,TAFWXRefCode,LeaveType,NetHours,AllDay,Full Name
3,80832,2025-07-11T09:32:00,2025-11-14,2025-11-22T00:00:00,,AU Annual Leave,45.6,True,Hannah O'Connell
6,80185,2025-07-21T10:25:00,2025-11-04,2025-11-12T00:00:00,,AU Annual Leave,45.6,True,Dominique Higgins


In [108]:
def keep_latest_this_month(df, id_col, time_col, now=None):
    """Return, per ``id_col`` value, the latest row dated in the reference month.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    id_col : label or list of labels
        Column(s) identifying the entity to de-duplicate on.
    time_col : label
        Datetime column used both for the month filter and for ranking.
    now : datetime-like, optional
        Moment that defines the reference year and month. Defaults to
        ``pd.Timestamp.now()``; pass a fixed value for reproducible results.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``time_col`` falls in the reference year and month, ordered
        by ``time_col`` descending, keeping the first row per ``id_col`` value.

    Raises
    ------
    AttributeError
        If ``time_col`` is not datetime-like (the ``.dt`` accessor is used).
    """
    # Read the clock once so year and month always come from the same instant.
    reference = pd.Timestamp.now() if now is None else pd.Timestamp(now)

    # Keep only the rows dated in the reference year and month.
    stamps = df[time_col].dt
    in_month = (stamps.year == reference.year) & (stamps.month == reference.month)
    df_month = df[in_month]

    # Newest first; the stable sort keeps tied timestamps in their original
    # order, so the row retained per id is deterministic.
    ordered = df_month.sort_values(time_col, ascending=False, kind="mergesort")
    return ordered.drop_duplicates(id_col)

In [109]:
# One row per employee: the latest leave entry whose TimeStart falls in the
# current calendar month (as of when the cell is run).
latest_leave_current_month = keep_latest_this_month(leave_filtered, id_col='EmployeeNumber', time_col='TimeStart')

In [110]:
# Show the current-month result.
latest_leave_current_month

Unnamed: 0,EmployeeNumber,DateofRequest,TimeStart,TimeEnd,TAFWXRefCode,LeaveType,NetHours,AllDay,Full Name
4,80832,2025-07-16T17:08:00,2025-08-08,2025-08-09T00:00:00,,AU You Day,7.6,True,Hannah O'Connell
7,80185,2025-07-21T10:27:00,2025-08-08,2025-08-09T00:00:00,,AU You Day,7.6,True,Dominique Higgins
