In [2]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
from dateutil.relativedelta import relativedelta

In [3]:
import glob
import json

In [4]:
# Folder holding the Jira CSV exports to load
folder_path = r"\\prdeqs01\QlikData\Jira_\input"

# Pick up every CSV in the folder regardless of its name. The list is sorted
# so the row order of the merged frame does not depend on the order in which
# the filesystem happens to list the files.
all_files = sorted(glob.glob(os.path.join(folder_path, "*.csv")))

# Fail early with a clear message: with no files, pd.concat([]) below would
# raise an opaque "No objects to concatenate" ValueError instead.
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path}")

# Show which files were picked up
print("找到的 CSV 文件：")
for f in all_files:
    print(f)

# Read each CSV and tag its rows with the name of the file they came from
df_list = []
for f in all_files:
    df = pd.read_csv(f)
    df['source_file'] = os.path.basename(f)
    df_list.append(df)

# Stack all files into one DataFrame with a fresh 0..n-1 index
df_all = pd.concat(df_list, ignore_index=True)

找到的 CSV 文件：
\\prdeqs01\QlikData\Jira_\input\merged_jira.csv


In [5]:
# Row count of the merged frame. `df` is only the loop variable left over from
# reading the files (i.e. the last file read), so it under-reports as soon as
# more than one CSV is present.
len(df_all)

732

In [6]:
# Source CSV column name -> column name used in the rest of this notebook
col_map = dict([
    ('Customfield_10187', 'business_unit'),
    ('Key', 'issue_number'),
    ('Issue', 'issue'),
    ('Status', 'status'),
    ('History', 'history'),
])

In [7]:
# Keep only the mapped source columns that are actually present in the data
existing_cols = [src for src in col_map if src in df_all.columns]

# Select those columns and give them their notebook names; rename() ignores
# mapping keys that have no matching column, so the full map can be passed
df_extracted = df_all[existing_cols].rename(columns=col_map)

# Preview the result
df_extracted.head()

Unnamed: 0,business_unit,issue_number,issue,status,history
0,BMX,BMX-3192,(BMX-3192) Christmas Email Template,Closed,"[{""time"":""Thursday, April 3, 2025 4:02 PM"",""to..."
1,BMX,BMX-3198,(BMX-3198) J3714-Doctors on Demand POS 1920X10...,Closed,"[{""time"":""Thursday, April 3, 2025 4:02 PM"",""to..."
2,BMX,BMX-3207,(BMX-3207) New Year 2025 6 Week Free Promotion,Closed,"[{""time"":""Thursday, April 3, 2025 4:03 PM"",""to..."
3,BMX,BMX-3208,(BMX-3208) NY25 6 Week Free promo - T&C's,Closed,"[{""time"":""Thursday, April 3, 2025 4:14 PM"",""to..."
4,BMX,BMX-3209,(BMX-3209) J3718-NY25 6 Weeks Free promo flyer...,Closed,"[{""time"":""Thursday, April 3, 2025 4:14 PM"",""to..."


In [8]:
def _parse_history(raw):
    """Decode one raw ``history`` cell into a Python list.

    The text is parsed as JSON exactly as stored first. Blindly swapping every
    single quote for a double quote (the previous behaviour) corrupts valid
    JSON whose values contain an apostrophe, so that swap is now only a
    fallback for text that is not valid JSON as-is (e.g. single-quoted text).

    Args:
        raw: The history cell as text.

    Returns:
        The decoded JSON value (expected to be a list of event dicts).

    Raises:
        json.JSONDecodeError: If the text cannot be decoded either way.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return json.loads(raw.replace("'", '"'))


expanded_rows = []

for _, row in df_extracted.iterrows():
    history_list = _parse_history(row['history'])
    # Walk the events two at a time: each output row carries one chunk of two
    # consecutive events; an odd trailing event forms a one-item chunk.
    for start in range(0, len(history_list), 2):
        new_row = row.copy()  # keeps the source row's index label
        new_row['history'] = history_list[start:start + 2]
        expanded_rows.append(new_row)

# One row per (issue, chunk); rows from the same issue share an index label
df_expanded = pd.DataFrame(expanded_rows)


In [9]:
# 1-based running position of each row within its issue, in current row order
issue_groups = df_expanded.groupby('issue_number')
df_expanded['index_in_issue'] = issue_groups.cumcount() + 1

In [10]:
import re

# 假设 df_expanded 已经存在
def extract_history_fields(history_list):
    """Pull the time point, duration, target status and category out of a chunk.

    Each entry's ``'time'`` value is classified by its text: a value that
    starts with digits followed by one of d/h/m/s is taken as a duration,
    any other non-empty value as a time point. ``'to'`` and
    ``'statusCategory'`` are copied whenever the key is present. When several
    entries supply the same field, the last one wins.

    Args:
        history_list: Iterable of dict-like history entries.

    Returns:
        pd.Series: ``[time_point, duration, to, statusCategory]``; a field
        stays ``None`` when no entry provides it.
    """
    fields = {
        'time_point': None,
        'duration': None,
        'to': None,
        'statusCategory': None,
    }
    for entry in history_list:
        stamp = entry.get('time')
        if stamp:
            slot = 'duration' if re.match(r'^\d+[dhms]', stamp) else 'time_point'
            fields[slot] = stamp
        # Copy the remaining fields straight across when present
        for key in ('to', 'statusCategory'):
            if key in entry:
                fields[key] = entry[key]
    return pd.Series([
        fields['time_point'],
        fields['duration'],
        fields['to'],
        fields['statusCategory'],
    ])

# Run the extractor on every history chunk; the four values it returns fill
# the four new columns in the same order
history_fields = df_expanded['history'].apply(extract_history_fields)
df_expanded[['time_point', 'duration', 'to', 'statusCategory']] = history_fields


In [11]:
# Move each duration down one row within its issue, so a row carries the
# duration recorded on the row before it; the first row of every issue is
# left as a missing value
duration_by_issue = df_expanded.groupby('issue_number')['duration']
df_expanded['duration_shifted'] = duration_by_issue.shift(periods=1)

In [12]:
# Parse text such as 'Thursday, April 3, 2025 4:02 PM' into datetimes
time_point_format = '%A, %B %d, %Y %I:%M %p'
df_expanded['time_point'] = pd.to_datetime(df_expanded['time_point'], format=time_point_format)

In [13]:
# 假设 df_expanded 已经存在 duration 列
def duration_to_seconds_safe(duration_str):
    """Convert a duration such as '6d 22h 53m 52s' into whole seconds.

    Each unit (d, h, m, s) is optional and is located independently by
    searching for digits immediately followed by the unit letter; units that
    are absent count as zero.

    Args:
        duration_str: Duration text. Anything that is not a string, or is
            blank, is treated as having no duration.

    Returns:
        int: Total seconds; 0 for blank or non-string input, or if parsing
        raises unexpectedly.
    """
    if not (isinstance(duration_str, str) and duration_str.strip()):
        return 0

    # (pattern, seconds represented by one of that unit)
    unit_patterns = (
        (r'(\d+)d', 86400),
        (r'(\d+)h', 3600),
        (r'(\d+)m', 60),
        (r'(\d+)s', 1),
    )
    try:
        total_seconds = 0
        for pattern, seconds_per_unit in unit_patterns:
            found = re.search(pattern, duration_str)
            if found:
                total_seconds += int(found.group(1)) * seconds_per_unit
        return total_seconds
    except Exception:
        # Best effort: any unexpected format is reported as zero seconds
        return 0

# Convert the shifted duration text to seconds for every row
shifted_durations = df_expanded['duration_shifted']
df_expanded['duration_cal'] = shifted_durations.apply(duration_to_seconds_safe)


In [14]:
# Display the expanded frame, including the derived columns, for a visual check
df_expanded

Unnamed: 0,business_unit,issue_number,issue,status,history,index_in_issue,time_point,duration,to,statusCategory,duration_shifted,duration_cal
0,BMX,BMX-3192,(BMX-3192) Christmas Email Template,Closed,"[{'time': 'Thursday, April 3, 2025 4:02 PM', '...",1,2025-04-03 16:02:00,136d 4h 20m 20s,Closed,3,,0
0,BMX,BMX-3192,(BMX-3192) Christmas Email Template,Closed,"[{'time': 'Monday, November 18, 2024 11:42 AM'...",2,2024-11-18 11:42:00,6d 22h 53m 52s,Resolved,3,136d 4h 20m 20s,11766020
0,BMX,BMX-3192,(BMX-3192) Christmas Email Template,Closed,"[{'time': 'Monday, November 11, 2024 12:48 PM'...",3,2024-11-11 12:48:00,1m,BMX Review,4,6d 22h 53m 52s,600832
0,BMX,BMX-3192,(BMX-3192) Christmas Email Template,Closed,"[{'time': 'Monday, November 11, 2024 12:47 PM'...",4,2024-11-11 12:47:00,2s,In Progress,4,1m,60
0,BMX,BMX-3192,(BMX-3192) Christmas Email Template,Closed,"[{'time': 'Monday, November 11, 2024 12:47 PM'...",5,2024-11-11 12:47:00,4d 41m 15s,To Do,2,2s,2
...,...,...,...,...,...,...,...,...,...,...,...,...
731,Transformation Office,BMX-3621,(BMX-3621) Internal comms for Westfund+ for Ce...,Resolved,"[{'time': 'Tuesday, July 1, 2025 3:52 PM', 'to...",4,2025-07-01 15:52:00,5s,To Do,2,42s,42
731,Transformation Office,BMX-3621,(BMX-3621) Internal comms for Westfund+ for Ce...,Resolved,"[{'time': 'Tuesday, July 1, 2025 3:51 PM', 'to...",5,2025-07-01 15:51:00,15d 1h 7m 9s,Waiting for support,2,5s,5
731,Transformation Office,BMX-3621,(BMX-3621) Internal comms for Westfund+ for Ce...,Resolved,"[{'time': 'Monday, June 16, 2025 2:44 PM', 'to...",6,2025-06-16 14:44:00,11d 19h 15m 9s,Waiting for customer,2,15d 1h 7m 9s,1300029
731,Transformation Office,BMX-3621,(BMX-3621) Internal comms for Westfund+ for Ce...,Resolved,"[{'time': 'Wednesday, June 4, 2025 7:29 PM', '...",7,2025-06-04 19:29:00,7h 51m 10s,Waiting for support,2,11d 19h 15m 9s,1019709


In [15]:
import pytz


# Destination folder for the exported CSV
folder_path = r"\\prdeqs01\QlikData\Jira_\output"

# Today's date in Sydney, rendered as ddmmyyyy for the file name
sydney_tz = pytz.timezone("Australia/Sydney")
today_sydney = datetime.now(tz=sydney_tz)
date_str = today_sydney.strftime("%d%m%Y")

# Full path of the dated output file
file_name = f"jira_{date_str}.csv"
file_path = os.path.join(folder_path, file_name)

# Write the expanded frame without its index column
df_expanded.to_csv(file_path, index=False)

print(f"文件已保存到: {file_path}")


文件已保存到: \\prdeqs01\QlikData\Jira_\output\jira_02122025.csv
