In [1]:
import pandas as pd
import datetime as dt
from typing import List
import os
import traceback

In [10]:
def _has_ignored_tag(raw_tags, ignore_tags) -> bool:
    """Return True when a serialized tag list contains any of ``ignore_tags``.

    Args:
        raw_tags: One cell of the ``tags`` column. The caller's data stores it
            as the string form of a Python list (e.g. ``"['a', 'b']"``); a
            missing cell arrives from pandas as NaN.
        ignore_tags: Tag names to look for.

    Returns:
        True if at least one parsed tag is in ``ignore_tags``. Missing, empty
        or unparseable cells are treated as having no tags.
    """
    import ast  # local import so this block does not depend on other cells

    # A missing cell is a float NaN, which cannot be parsed as a tag list.
    if not isinstance(raw_tags, str) or not raw_tags.strip():
        return False

    try:
        # SECURITY: the original used eval() here, which executes arbitrary
        # expressions read from the CSV. literal_eval only accepts literals.
        parsed = ast.literal_eval(raw_tags)
    except (ValueError, SyntaxError, TypeError):
        return False

    if isinstance(parsed, str):
        # A single quoted tag: treat it as a one-element list rather than
        # iterating over its characters.
        parsed = [parsed]
    elif not isinstance(parsed, (list, tuple, set, frozenset)):
        return False

    return any(str(tag).strip('[]"\'') in ignore_tags for tag in parsed)


def generate_summary_csv(input_file: str, output_directory: str, target_date: str, ignore_tags: List[str]) -> None:
    """Write one per-project weekly summary CSV for every company in ``input_file``.

    The analysis window is the Monday-to-Monday week that precedes the week
    containing ``target_date`` (start inclusive, end exclusive). Files are
    written to ``<output_directory>/<end date>/<company_name>_<end date>.csv``
    with the columns ``project``, ``Total hours``, ``Total runs``,
    ``master_node_runs``, ``overlap_runs`` and ``ignore_runs``.

    Args:
        input_file: CSV with at least the columns ``date``, ``created_at``,
            ``updated_at``, ``logged_at``, ``company_name``, ``project``,
            ``host_name``, ``run_id``, ``duration_hour``, ``gpu_count`` and
            ``tags``.
        output_directory: Root folder for the reports; created if missing.
        target_date: Any date inside the week *after* the one to report on,
            in a format ``pd.to_datetime`` understands. A time-of-day part is
            discarded.
        ignore_tags: Runs carrying any of these tags are counted in
            ``ignore_runs``.

    Returns:
        None. Progress and errors are printed; any exception raised while
        processing is caught and reported with its traceback.
    """
    if not os.path.exists(input_file):
        print(f"Input file not found: {input_file}")
        return

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_directory, exist_ok=True)

    try:
        df = pd.read_csv(input_file, parse_dates=['date', 'created_at', 'updated_at', 'logged_at'])

        # Drop any time-of-day so the window always starts and ends at midnight.
        target_date = pd.to_datetime(target_date).normalize()
        # weekday() is 0 for Monday, so this lands on the Monday of the previous week.
        start_date = target_date - dt.timedelta(days=(target_date.weekday() + 7))
        end_date = start_date + dt.timedelta(days=7)

        print(f"Analyzing data from {start_date.date()} to {end_date.date()}")

        # Create the output folder named after the (exclusive) end date.
        end_date_folder = os.path.join(output_directory, str(end_date.date()))
        os.makedirs(end_date_folder, exist_ok=True)

        df_filtered = df[(df['date'] >= start_date) & (df['date'] < end_date)]
        if df_filtered.empty:
            print("No rows found in the analysis window; no summary CSV was generated.")
            return

        for team_name, team_group in df_filtered.groupby('company_name'):
            # Work on a private copy: the group is derived from df_filtered and
            # adding columns to it directly can raise SettingWithCopyWarning.
            team_df = team_group.copy()

            # Total hours are weighted by the number of GPUs used by each row.
            team_df['weighted_duration'] = team_df['duration_hour'] * team_df['gpu_count']
            summary = team_df.groupby('project').agg({
                'weighted_duration': 'sum',
                'run_id': 'nunique'
            }).reset_index()

            summary.columns = ['project', 'Total hours', 'Total runs']

            # "Master node" runs: distinct runs with gpu_count >= 9
            # (presumably runs spanning more than one 8-GPU node - TODO confirm).
            master_node_df = team_df[team_df['gpu_count'] >= 9]
            master_node_counts = master_node_df.groupby('project')['run_id'].nunique().reset_index(name='master_node_runs')

            # Overlaps: within a project/host, a row that was created before the
            # previous row (ordered by created_at) was last updated.
            team_df_sorted = team_df.sort_values(['project', 'host_name', 'created_at'])
            team_df_sorted['prev_updated_at'] = team_df_sorted.groupby(['project', 'host_name'])['updated_at'].shift()
            overlap_df = team_df_sorted[team_df_sorted['created_at'] < team_df_sorted['prev_updated_at']]
            overlaps = overlap_df.groupby('project')['run_id'].nunique().reset_index(name='overlap_runs')

            # Runs carrying at least one of the ignore tags.
            team_df['has_ignore_tag'] = team_df['tags'].apply(lambda x: _has_ignored_tag(x, ignore_tags))
            ignore_df = team_df[team_df['has_ignore_tag']]
            ignore_tag_counts = ignore_df.groupby('project')['run_id'].nunique().reset_index(name='ignore_runs')

            # Merge the per-project counts; projects missing from a count get 0.
            result = summary.merge(master_node_counts, on='project', how='left')\
                            .merge(overlaps, on='project', how='left')\
                            .merge(ignore_tag_counts, on='project', how='left')
            result = result.fillna(0)

            # Left merges introduce NaN and therefore floats; restore integer counts.
            for col in ['Total runs', 'master_node_runs', 'overlap_runs', 'ignore_runs']:
                if col in result.columns:
                    result[col] = result[col].astype(int)

            output_file = os.path.join(end_date_folder, f"{team_name}_{end_date.date()}.csv")
            result.to_csv(output_file, index=False)
            print(f"Summary CSV generated for {team_name}: {output_file}")

    except Exception as e:
        # Deliberately best-effort: report the failure instead of raising.
        print(f"An error occurred: {str(e)}")
        print(traceback.format_exc())

In [26]:
# Runs carrying any of these tags are counted in the report's "ignore_runs" column.
ignore_tags = ["other_gpu", "others_gpu"]

# Build the weekly per-company summaries for the week preceding the target date's week.
generate_summary_csv(
    input_file="dev/new_runs_df.csv",
    output_directory="dev/weekly_report/",
    target_date="2024-08-29",
    ignore_tags=ignore_tags,
)

Analyzing data from 2024-08-19 to 2024-08-26
Summary CSV generated for nii-geniac: dev/weekly_report/2024-08-26/nii-geniac_2024-08-26.csv


In [17]:
import pandas as pd

# Load the processed runs table.
df = pd.read_csv('dev/processed_df.csv')

# Count how many rows carry each distinct company_name value.
company_counts = df['company_name'].value_counts()

# Show the per-company row counts.
print("会社名ごとの出現回数:")
print(company_counts)

# The sum of the per-company counts is the number of rows that have a company
# name, not the number of companies, so it is labelled as a record total.
# (value_counts() skips missing values by default.)
total_count = company_counts.sum()
print(f"\n合計レコード数: {total_count}")

# The number of distinct companies is the number of entries in the counts.
print(f"会社数: {len(company_counts)}")

会社名ごとの出現回数:
company_name
abeja-geniac        6551
stockmark-geniac    4892
fujitsu-geniac      3530
turing-geniac       2717
nii-geniac           868
kotoba-geniac        537
elyza-geniac         478
sakanaai-geniac        2
Name: count, dtype: int64

合計会社数: 19575
