In [1]:
import pandas as pd
from datetime import date
import xlsxwriter

In [2]:
smds = ["Amy/YT Shih", "Ben Li", "Chen Ni/ Charlie Zhou", "Hong Zeng", "Jennie Xie", "Licong He",
       "Yuemei Ding/Perry Zhao", "Xiao Chen", "Xiying Sheng", "Yinghua Zhang", "Yu Chen","Rongrong Zheng"]

In [3]:
file_path = r"C:\Users\carol\Downloads\82963520834 - Attendee Report.csv"
with open(file_path, 'r', encoding="utf-8") as file: 
    for i, line in enumerate(file):
        if line.startswith("Attendee Details"):
            breakline = i #line index where to break the csvfile
df_original = pd.read_csv(file_path, header=breakline+1, index_col=False)
df_original = df_original[["Attended","User Name (Original Name)","Join Time","Leave Time", "Time in Session (minutes)", 
         "Inviter", "Inviter's SMD"]]
print(f"Total {df_original.shape[0]} entries. The first 5 are showing below:")
df_original.head()

Total 38 entries. The first 5 are showing below:


Unnamed: 0,Attended,User Name (Original Name),Join Time,Leave Time,Time in Session (minutes),Inviter,Inviter's SMD
0,Yes,Hong Hu,"Aug 15, 2021 12:55:41","Aug 15, 2021 12:55:57",1,Hong hu,Yuemei Ding/Perry Zhao
1,Yes,James Li,"Aug 15, 2021 12:55:35","Aug 15, 2021 14:21:42",87,Yuemei Ding,Yuemei Ding/Perry Zhao
2,No,laoshi,--,--,--,Maggie,Yu Chen
3,Yes,Angela Liu,"Aug 15, 2021 13:05:33","Aug 15, 2021 14:21:54",77,Angela liu,Yuemei Ding/Perry Zhao
4,Yes,Wei Zhou,"Aug 15, 2021 12:51:11","Aug 15, 2021 14:21:54",91,Yuemei Ding,Yuemei Ding/Perry Zhao


In [4]:
df_smd_not_nan = df_original[~df_original["Inviter's SMD"].isna()]
df_smd_nan = df_original[df_original["Inviter's SMD"].isna()]

print(f"Total {df_smd_nan.shape[0]} attendees whoes Inviter's SMD is NaN. The first 5 (if any) are showing below:")
df_smd_nan.head()

Total 8 attendees whoes Inviter's SMD is NaN. The first 5 (if any) are showing below:


Unnamed: 0,Attended,User Name (Original Name),Join Time,Leave Time,Time in Session (minutes),Inviter,Inviter's SMD
6,Yes,Xia Shi,"Aug 15, 2021 13:18:18","Aug 15, 2021 14:21:49",64.0,,
16,Yes,Ruiming Liu,"Aug 15, 2021 13:51:08","Aug 15, 2021 14:21:45",31.0,,
27,Yes,Hong Chan,"Aug 15, 2021 13:53:42","Aug 15, 2021 14:05:31",12.0,,
28,Yes,Hong Chan,"Aug 15, 2021 13:17:27","Aug 15, 2021 13:53:06",36.0,,
34,Other Attended,,,,,,


In [5]:
#populate the NaN SMD based on User Name that appears before

true_nan_index = [] #those index of true nan: SMD is NaN and NoWay of filling in because it only appears once with SMD NaN
for i in df_smd_nan.index:
    user_name = df_smd_nan.loc[i]["User Name (Original Name)"]
    record_in_df_smd_not_nan = df_smd_not_nan[df_smd_not_nan["User Name (Original Name)"]==user_name]
    if record_in_df_smd_not_nan.shape[0] == 0:
        true_nan_index.append(i)
    elif record_in_df_smd_not_nan.shape[0] > 1:
        record_in_df_smd_not_nan["Join Time"] = pd.to_datetime(record_in_df_smd_not_nan["Join Time"])
        earliest_record_index = record_in_df_smd_not_nan["Join Time"].sort_values().index.values[0]
        missing_smd = record_in_df_smd_not_nan.loc[earliest_record_index]["Inviter's SMD"].values[0]
        df_smd_nan.loc[i].fillna(value={"Inviter's SMD":missing_smd}, inplace=True)
        if pd.isna(df_smd_nan.loc[i]["Inviter"]):
            missing_inviter = record_in_df_smd_not_nan.loc[earliest_record_index]["Inviter"].values[0]
            df_smd_nan.loc[i].fillna(value={"Inviter":missing_inviter}, inplace=True)
    else: #record_in_df_smd_not_nan.shape[0] == 1
        missing_smd = record_in_df_smd_not_nan["Inviter's SMD"].values[0]
        df_smd_nan.loc[i].fillna(value={"Inviter's SMD":missing_smd}, inplace=True)
        if pd.isna(df_smd_nan.loc[i]["Inviter"]):
            missing_inviter = record_in_df_smd_not_nan["Inviter"].values[0]
            df_smd_nan.loc[i].fillna(value={"Inviter":missing_inviter}, inplace=True)

In [6]:
df_smd_comb = pd.concat((df_smd_not_nan, df_smd_nan),axis=0)
df_true_nan = df_original.loc[true_nan_index]

In [7]:
num_output_entries = 0
for name in smds:
    name_replaced = name.replace("/"," or ")
    if df_smd_comb[df_smd_comb["Inviter's SMD"]==name].shape[0] > 0:
        file_name = f"{name_replaced}_{date.today().strftime('%m-%d-%Y')}_Attendee Report.xlsx"
        workbook = xlsxwriter.Workbook(file_name)
        worksheet = workbook.add_worksheet()
        worksheet.write('A1', 'Attendee Details')
        worksheet.write('A3', 'Attended')
        worksheet.write('B3', 'User Name (Original Name)')
        worksheet.write('C3', 'Join Time')
        worksheet.write('D3', 'Leave Time')
        worksheet.write('E3', 'Time in Session (minutes)')
        worksheet.write('F3', 'Inviter')
        worksheet.write('G3', "Inviter's SMD")
        
        df_name = df_smd_comb[df_smd_comb["Inviter's SMD"]==name]
        df_name['Time in Session (minutes)'] = pd.to_numeric(df_name['Time in Session (minutes)'], errors='coerce')
        df_name['Inviter'].fillna("-", inplace=True)
        df_name_groups = []
        for inviter, num_entries in df_name.groupby("Inviter").size().sort_values(ascending=False).iteritems():
            df_inviter = df_name[df_name["Inviter"]==inviter]
            df_inviter.sort_values(by='Time in Session (minutes)', ascending=False, inplace=True)
            df_name_groups.append(df_inviter)
        df_name = pd.concat(df_name_groups, axis=0)
        df_name.reset_index(drop=True, inplace=True)
        df_name.fillna("-", inplace=True)
        for i, row in df_name.iterrows():
            worksheet.write(f'A{i+4}', row['Attended'])
            worksheet.write(f'B{i+4}', row['User Name (Original Name)'])
            worksheet.write(f'C{i+4}', row['Join Time'])
            worksheet.write(f'D{i+4}', row['Leave Time'])
            worksheet.write(f'E{i+4}', row['Time in Session (minutes)'])
            worksheet.write(f'F{i+4}', row['Inviter'])
            worksheet.write(f'G{i+4}', row["Inviter's SMD"])
            num_output_entries += 1
        workbook.close()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_name['Time in Session (minutes)'] = pd.to_numeric(df_name['Time in Session (minutes)'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_inviter.sort_values(by='Time in Session (minutes)', ascending=False, inplace=True)


In [8]:
#write others 
df_true_nan.to_excel(f"Others_{date.today().strftime('%m-%d-%Y')}_Attendee Report.xlsx", index=False)

In [9]:
print(f"The total number of output entries without true Nans are {num_output_entries}.")
print(f"The total number of true Nans are {len(true_nan_index)}.")
print(f"The total number of the above is {num_output_entries + len(true_nan_index)}.")
print(f"The total number of original records are {df_original.shape[0]}.")

The total number of output entries without true Nans are 11.
The total number of true Nans are 4.
The total number of the above is 15.
The total number of original records are 38.
