In [1]:
!pip install plotly
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import os
import datetime
import seaborn as sns
import warnings
from matplotlib.cbook import boxplot_stats
warnings.filterwarnings("ignore")
%matplotlib inline
plt.rc('font', family = 'Malgun Gothic')
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.tools as tls




In [2]:
def read_data(file):
    """Load a glucose export and keep the four columns this notebook uses.

    The header row is taken from the third line of the file (``header=2``).
    The selected columns are renamed to space-free names and the timestamp
    column is converted to ``datetime64[ns]``.

    Parameters
    ----------
    file : path or file-like object accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns 장치타임스탬프, 기록유형, 혈당, 메모 (in that order).
    """
    # Source column name -> name used throughout the notebook.
    renames = {
        '장치 타임스탬프': '장치타임스탬프',
        '기록 유형': '기록유형',
        '과거 혈당 mg/dL': '혈당',
        '메모': '메모',
    }
    raw = pd.read_csv(file, header=2)
    df = raw[list(renames)].rename(columns=renames)
    df['장치타임스탬프'] = df['장치타임스탬프'].astype('datetime64[ns]')
    return df

# 메모 매칭 시키기
def split_type(df):
    """Split ``df`` by record type.

    Returns a three-element list with the rows whose 기록유형 equals
    0, 1 and 6, in that order.
    """
    record_types = (0, 1, 6)
    return [df[df['기록유형'] == code] for code in record_types]

def memo_match(glu_df, memo_df):
    """Attach each memo to the first glucose reading at or after its timestamp.

    Parameters
    ----------
    glu_df : pandas.DataFrame
        Glucose rows with columns 장치타임스탬프, 기록유형, 혈당 and (normally) 메모.
    memo_df : pandas.DataFrame
        Memo rows with columns 장치타임스탬프, 기록유형, 혈당, 메모.

    Returns
    -------
    pandas.DataFrame
        A new frame built from ``glu_df``: sorted by time, indexed by
        장치타임스탬프, with columns 혈당, 메모 and 인덱스 (the row's ordinal
        position after sorting). The input frames are left untouched.

    Notes
    -----
    * Memos sharing one timestamp are joined with ``','``.
    * Memos that land on the same glucose reading are also joined with
      ``','`` (previously the later memo overwrote the earlier one).
    * A memo later than the last glucose reading is attached to the last
      reading (previously this raised ``IndexError``).
    """
    # Build new frames instead of dropping in place: the inputs are slices of
    # the raw table, and in-place edits leaked back into the caller's objects.
    memo_df = memo_df[memo_df['메모'].notnull()].drop(['기록유형', '혈당'], axis=1)
    glu_df = glu_df.drop(['기록유형'], axis=1)

    # Glucose readings in time order; 인덱스 records each row's position.
    glu_df = glu_df.sort_values(by=['장치타임스탬프']).reset_index(drop=True)
    glu_df['인덱스'] = glu_df.index
    glu_df = glu_df.set_index('장치타임스탬프')

    if len(memo_df) == 0 or len(glu_df) == 0:
        return glu_df

    # One row per memo timestamp, duplicates merged into a single string.
    memo_df = memo_df.groupby('장치타임스탬프').agg(메모=('메모', ','.join))
    memo_df['메모'] = memo_df['메모'].str.strip(',')
    memo_df = memo_df[memo_df['메모'] != '']
    if len(memo_df) == 0:
        return glu_df

    # Position of the first reading whose timestamp is >= the memo timestamp.
    # searchsorted returns len(glu_df) for memos past the end, so clamp those
    # onto the final reading.
    positions = glu_df.index.searchsorted(memo_df.index, side='left')
    positions = np.minimum(positions, len(glu_df) - 1)

    # Several memos may map to the same reading; keep all of them.
    merged = (
        pd.Series(memo_df['메모'].to_numpy(), index=positions)
        .groupby(level=0)
        .agg(','.join)
    )

    if '메모' not in glu_df.columns:
        glu_df['메모'] = np.nan
    # An all-NaN memo column is float64; make it able to hold strings.
    glu_df['메모'] = glu_df['메모'].astype(object)
    memo_col = glu_df.columns.get_loc('메모')
    for position, text in merged.items():
        glu_df.iloc[position, memo_col] = text
    return glu_df

def get_memo(df):
    """Tag glucose rows with memos.

    Splits ``df`` with ``split_type`` and passes the first part (record
    type 0) as the glucose rows and the third part (record type 6) as the
    memo rows to ``memo_match``; returns its result.
    """
    glucose_rows, _, memo_rows = split_type(df)
    return memo_match(glucose_rows, memo_rows)

def preprocessing(file):
    """Read ``file`` with ``read_data`` and return the memo-tagged frame from ``get_memo``."""
    return get_memo(read_data(file))

In [3]:
# outlier 판단
def calc_outliers(df):
    """Return the rows of ``df`` whose 혈당 value is a box-plot flier.

    The flier values come from ``matplotlib.cbook.boxplot_stats`` applied to
    the 혈당 column; every row whose value equals one of them is kept.
    """
    stats = boxplot_stats(df['혈당'])[0]
    flier_values = np.sort(stats['fliers'])
    is_flier = df['혈당'].isin(flier_values)
    return df.loc[is_flier, :]

# TIR 비율
def calc_tir(df, low_cut=70, high_cut=180):
    """Compute the fraction of readings below, inside and above the target range.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 혈당 column.
    low_cut, high_cut : number, optional
        Range bounds. A reading is "low" when ``< low_cut``, "high" when
        ``> high_cut`` and "target" otherwise (both bounds inclusive).
        Defaults are 70 and 180.

    Returns
    -------
    list of float
        ``[low, target, high]`` fractions. Missing (NaN) readings are
        ignored; previously they were counted as "target". With no valid
        readings all three fractions are 0.0 instead of raising
        ``ZeroDivisionError``.

    The three fractions are also printed.
    """
    glucose = df['혈당'].dropna()
    total = len(glucose)
    if total == 0:
        p_list = [0.0, 0.0, 0.0]
    else:
        low = int((glucose < low_cut).sum())
        high = int((glucose > high_cut).sum())
        target = total - low - high
        p_list = [low / total, target / total, high / total]
    print("low : %.2f \ntarget : %.2f \nhigh : %.2f \n" % (p_list[0], p_list[1], p_list[2]))
    return p_list

def total_graph(file, df):
    """Show a plotly figure of the whole glucose series.

    The figure contains the glucose line, the box-plot outliers from
    ``calc_outliers`` as markers, a horizontal mean line, and a light-green
    band between 70 and 180. The ``calc_tir`` result and the mean are printed.

    ``file`` is accepted but not used by the current body.
    """
    glucose = df['혈당']
    mean_glucose = glucose.mean()
    outlier_df = calc_outliers(df)

    # Band for the 70-180 range; x is in paper coordinates so it spans the
    # full plot width.
    target_band = dict(
        type="rect",
        xref="paper", yref="y",
        x0=0, y0="70", x1=1, y1="180",
        fillcolor="LightGreen", opacity=0.2, layer="below", line_width=0,
    )

    fig = go.Figure()
    fig.update_layout(shapes=[target_band])
    fig.add_trace(go.Scatter(x=df.index, y=glucose, mode="lines", name="혈당"))
    fig.add_trace(go.Scatter(x=outlier_df.index, y=outlier_df['혈당'],
                             mode='markers', name="이상치"))
    fig.add_trace(go.Scatter(x=df.index, y=[mean_glucose] * len(df.index),
                             mode='lines', name="평균", marker_color='red'))

    fig.update_layout(xaxis_range=[df.index[0], df.index[-1]])
    fig.update_layout(yaxis_range=[glucose.min() - 50, glucose.max() + 50])

    print(calc_tir(df))
    print(mean_glucose)
    fig.show()

In [4]:
def get_minmax_c(df, count):
    """Rank readings by their absolute distance from the mean glucose.

    Returns a copy of ``df`` with an extra 평균차 column (|혈당 - mean|),
    ordered from largest to smallest distance. When ``count`` is positive
    only the first ``count`` rows are returned; otherwise all rows.
    """
    mean_glucose = df['혈당'].mean()
    ranked = df.sort_values(by='혈당')
    ranked['평균차'] = (ranked['혈당'] - mean_glucose).abs()
    ranked = ranked.sort_values(by='평균차', ascending=False)
    return ranked[:count] if count > 0 else ranked

# 앞뒤 h시간 내의 혈당(최대, 최소 몇부분만 추출한 뒤 분산 계산. 단 , target 시간이 겹치는 경우는 제외)
def minmax_var_c(df, i, h, base, count):
    """Find non-overlapping high-variance windows around extreme readings.

    Parameters
    ----------
    df : pandas.DataFrame
        Indexed by timestamp, with a 혈당 column.
    i : int
        0 analyses the whole frame; otherwise the 1-based day number counted
        from the date of the first row.
    h : number
        Half-width of each window in hours (window is target ± h).
    base : number
        Only windows whose variance is strictly greater than this are kept.
    count : int
        Passed to ``get_minmax_c``: a positive value limits the candidates to
        that many readings farthest from the mean; otherwise all readings of
        the selected day are candidates.

    Returns
    -------
    pandas.DataFrame
        Columns target, start, end, avg, var, sorted by var descending with a
        fresh 0..n-1 index. Empty (same columns) when nothing qualifies or
        ``df`` is empty.
    """
    columns = ['target', 'start', 'end', 'avg', 'var']
    if len(df) == 0:
        return pd.DataFrame(columns=columns)

    if i == 0:
        day_df = df
    else:
        dates = df.index.date
        day_df = df[dates == dates[0] + datetime.timedelta(days=i - 1)]

    half_window = datetime.timedelta(hours=h)
    candidates = get_minmax_c(day_df, count)

    # Mean and variance of the readings inside each candidate's window.
    # Interval comparison replaces the old per-candidate 1-minute date_range
    # lookup, which was slow and skipped readings off the minute grid.
    records = []
    for target in candidates.index:
        start = target - half_window
        end = target + half_window
        in_window = (day_df.index >= start) & (day_df.index <= end)
        window = day_df.loc[in_window, '혈당']
        records.append([target, start, end, window.mean(), window.var()])

    temp_df = pd.DataFrame.from_records(records, columns=columns)
    temp_df = temp_df.sort_values(by='var', ascending=False)

    # Greedy selection in descending-variance order: a target is dropped when
    # it lies within h hours of a target that was already kept.
    kept = []
    for target in temp_df['target']:
        if any(abs(target - other) <= half_window for other in kept):
            continue
        kept.append(target)
    var_df = temp_df[temp_df['target'].isin(kept)]

    # Keep only windows whose variance exceeds the threshold.
    var_df = var_df[var_df['var'] > base]
    return var_df.sort_values(by='var', ascending=False).reset_index(drop=True)

def oneday_graph(df, i, h, base, count):
    """Build the plotly figure for day number ``i`` (1-based from the first date).

    ``i``, ``h``, ``base`` and ``count`` are forwarded to ``minmax_var_c``.
    When that returns no windows the function returns the integer 0 and
    builds nothing. Otherwise the returned figure holds the day's glucose
    line, outlier markers, mean line and memo markers, a light-green 70-180
    band, and one yellow band per window. The selected windows are printed.
    """
    var_df = minmax_var_c(df, i, h, base, count)
    if len(var_df) == 0:
        return 0

    # The y-range uses the whole series, not just the selected day.
    y_range = [df['혈당'].min() - 50, df['혈당'].max() + 50]

    day_df = df[df.index.date == df.index.date[0] + datetime.timedelta(days=i - 1)]
    outlier_df = calc_outliers(day_df)
    memo_df = day_df[day_df['메모'].notna()]

    # Horizontal band for the 70-180 range.
    shapes = [dict(
        type="rect",
        xref="paper", yref="y",
        x0=0, y0="70", x1=1, y1="180",
        fillcolor="LightGreen", opacity=0.1, layer="below", line_width=0,
    )]

    # Vertical band for every high-variance window.
    print(var_df.filter(items=['target', 'avg', 'var']))
    for window_start, window_end in zip(var_df['start'], var_df['end']):
        shapes.append(dict(
            type="rect",
            xref="x", yref="paper",
            x0=str(window_start), y0=0, x1=str(window_end), y1=1,
            fillcolor="Yellow", opacity=0.3, layer="below", line_width=0,
        ))

    fig = go.Figure()
    fig.update_layout(shapes=shapes)
    fig.add_trace(go.Scatter(x=day_df.index, y=day_df['혈당'],
                             mode="lines", name="혈당"))
    fig.add_trace(go.Scatter(x=outlier_df.index, y=outlier_df['혈당'],
                             mode='markers', name="이상치"))
    fig.add_trace(go.Scatter(x=day_df.index,
                             y=[day_df['혈당'].mean()] * len(day_df.index),
                             mode='lines', name="평균", marker_color='red'))
    fig.add_trace(go.Scatter(x=memo_df.index, y=memo_df['혈당'], mode='markers',
                             name="메모", text=memo_df['메모']))

    # x-axis covers the calendar day from 00:00 to 23:59.
    day = day_df.index.date[0]
    day_start = pd.Timestamp(day.year, day.month, day.day, 0)
    day_end = pd.Timestamp(day.year, day.month, day.day, 23, 59)
    fig.update_layout(xaxis_range=[day_start, day_end])
    fig.update_layout(yaxis_range=y_range)
    return fig

In [5]:
#전체 일자별
def daily_graphs(file, df, base):
    """Write one HTML figure per calendar day spanned by ``df``.

    Calls ``oneday_graph(df, day, 1, base, -1)`` for every day number from 1
    to the number of days between the first and last row (inclusive). Days
    for which it returns 0 are skipped. Output names are
    ``<text of file before the first '_'>_<day number>_oneday.html``.
    """
    n_days = (df.index.date[-1] - df.index.date[0]).days + 1
    for day_num in range(1, n_days + 1):
        fig = oneday_graph(df, day_num, 1, base, -1)
        if fig == 0:
            continue
        prefix = file.split('_')[0]
        fig.write_html(prefix + "_" + str(day_num) + "_oneday.html")
        
#메모가 있는 일자만
def memo_graphs(file, df, base):
    """Count the memo-bearing days for which ``oneday_graph`` yields a figure.

    For every distinct date that has at least one non-null 메모, the day's
    figure is built with ``oneday_graph(df, day, 1, base, -1)``. Figures are
    not saved or shown; only ``"<file> : <count>"`` is printed.
    """
    first_day = df.index.date[0]
    memo_days = sorted(set(df[df['메모'].notna()].index.date))
    drawn = 0
    for memo_day in memo_days:
        day_num = (memo_day - first_day).days + 1
        fig = oneday_graph(df, day_num, 1, base, -1)
        if fig == 0:
            continue
        drawn += 1
    print(file + " : " + str(drawn))

In [6]:
# 상위 10개 분산 구하기
def show_var(df, h, base, count):
    """Collect the high-variance windows of every day into one ranked table.

    Runs ``minmax_var_c(df, day, h, base, count)`` for each day number from 1
    to the number of days between the first and last row (inclusive), tags
    each result with its ``day_num``, and concatenates them.

    Returns
    -------
    pandas.DataFrame
        Columns target, start, end, avg, var, day_num, sorted by var
        descending with a fresh 0..n-1 index. When ``df`` is empty or no day
        has a qualifying window, an empty frame with those columns is
        returned (previously ``pd.concat`` raised ``ValueError``).
    """
    columns = ['target', 'start', 'end', 'avg', 'var', 'day_num']
    if len(df) == 0:
        return pd.DataFrame(columns=columns)

    n_days = (df.index.date[-1] - df.index.date[0]).days + 1
    per_day = []
    for day_num in range(1, n_days + 1):
        var_df = minmax_var_c(df, day_num, h, base, count)
        if len(var_df) != 0:
            per_day.append(var_df.assign(day_num=day_num))

    if not per_day:
        return pd.DataFrame(columns=columns)

    result_df = pd.concat(per_day)
    return result_df.sort_values(by='var', ascending=False).reset_index(drop=True)

# 상위 10개 그래프 구하기
def draw_top(origin, file, top, day_list, i):
    """Plot, save and show the day ``day_list[i]`` with its top windows marked.

    Parameters
    ----------
    origin : pandas.DataFrame
        Timestamp-indexed frame with 혈당 and 메모 columns.
    file : str
        Source file name; the text before its first ``'_'`` prefixes the
        output file name.
    top : pandas.DataFrame
        Windows to highlight, indexed by target timestamp, with ``start``
        and ``end`` columns.
    day_list : sequence of datetime.date
        Candidate days.
    i : int
        Position in ``day_list`` of the day to draw.

    The figure is written to ``<prefix>_<date>_oneday.html`` and displayed.
    """
    # The y-range uses the whole series, not just the selected day.
    y_min = origin['혈당'].min()
    y_max = origin['혈당'].max()
    df = origin[origin.index.date == day_list[i]]
    var_df = top[top.index.date == day_list[i]]

    fig = go.Figure()
    trace = go.Scatter(x=df.index, y=df['혈당'], mode="lines", name="혈당")
    # Box-plot outliers of the selected day.
    outlier_df = calc_outliers(df)
    trace2 = go.Scatter(x=outlier_df.index, y=outlier_df['혈당'],
                        mode='markers', name="이상치")
    # Mean of the selected day as a flat line.
    trace3 = go.Scatter(x=df.index, y=[df['혈당'].mean()] * len(df.index),
                        mode='lines', name="평균", marker_color='red')
    # Readings that carry a memo; the memo text is the hover text.
    memo_df = df[df['메모'].notna()]
    trace4 = go.Scatter(x=memo_df.index, y=memo_df['혈당'], mode='markers',
                        name="메모", text=memo_df['메모'])

    # Horizontal band for the 70-180 range.
    s_list = [dict(
        type="rect",
        xref="paper", yref="y",
        x0=0, y0="70", x1=1, y1="180",
        fillcolor="LightGreen", opacity=0.1, layer="below", line_width=0,
    )]

    # One vertical band per window. var_df is indexed by timestamp, so the
    # windows are walked positionally; an integer key such as
    # var_df.start[j] relies on a positional fallback that pandas deprecated.
    print(var_df.filter(items=['target', 'avg', 'var']))
    for window_start, window_end in zip(var_df['start'], var_df['end']):
        s_list.append(dict(
            type="rect",
            xref="x", yref="paper",
            x0=str(window_start), y0=0, x1=str(window_end), y1=1,
            fillcolor="Yellow", opacity=0.3, layer="below", line_width=0,
        ))

    fig.update_layout(shapes=s_list)
    fig.add_trace(trace)
    fig.add_trace(trace2)
    fig.add_trace(trace3)
    fig.add_trace(trace4)

    # x-axis covers the calendar day from 00:00 to 23:59.
    day = df.index.date[0]
    start = pd.Timestamp(day.year, day.month, day.day, 0)
    end = pd.Timestamp(day.year, day.month, day.day, 23, 59)
    fig.update_layout(xaxis_range=[start, end])
    fig.update_layout(yaxis_range=[y_min - 50, y_max + 50])

    fig_to_html = file.split('_')[0] + "_" + str(day) + "_oneday.html"
    fig.write_html(fig_to_html)
    fig.show()

def top10_graph(origin, file, top, i):
    """Draw the ``i``-th distinct day (in date order) among the windows in ``top``.

    Parameters
    ----------
    origin : pandas.DataFrame
        Full glucose frame, forwarded to ``draw_top``.
    file : str
        Source file name, forwarded to ``draw_top``.
    top : pandas.DataFrame
        Windows with a ``target`` timestamp column; not modified.
    i : int
        Position in the sorted list of distinct target dates.

    When ``top`` spans fewer distinct days than ``i`` requires (for example
    several top windows fall on the same day), a message is printed and
    nothing is drawn instead of raising ``IndexError``.
    """
    top = top.set_index('target')
    day_list = sorted(set(top.index.date))
    if not -len(day_list) <= i < len(day_list):
        print("%s : no day at position %d (%d distinct day(s) available)"
              % (file, i, len(day_list)))
        return None
    draw_top(origin, file, top, day_list, i)

In [10]:
# Input CSV files processed by the cells below.
filelist = [
    'data/en이_glucose_2020-8-13.csv',
    'data/yj박_glucose_2020-7-21.csv',
    'data/jg윤_glucose_2020-8-19.csv',
    'data/dh김_glucose_2020-8-13.csv',
    'data/sk이_glucose_2020-8-13.csv',
]

In [17]:
# Preprocess the third file, then rank its high-variance windows.
df = preprocessing(filelist[2])
# Variance of the whole glucose series.
base = df['혈당'].var()
# NOTE(review): the variance cut-off passed here is the literal 600, not
# `base` — confirm this is intended.
var_df = show_var(df, 1, 600, 10)
# Display only the windows whose variance exceeds 4000.
var_df.loc[var_df['var'] > 4000]

Unnamed: 0,target,start,end,avg,var,day_num
0,2020-04-13 20:55:00,2020-04-13 19:55:00,2020-04-13 21:55:00,225.375,7076.839286,1
1,2020-04-28 15:07:00,2020-04-28 14:07:00,2020-04-28 16:07:00,283.625,5447.696429,16
2,2020-06-03 21:39:00,2020-06-03 20:39:00,2020-06-03 22:39:00,210.125,5440.125,52
3,2020-06-01 13:11:00,2020-06-01 12:11:00,2020-06-01 14:11:00,109.125,4867.267857,50
4,2020-04-14 21:54:00,2020-04-14 20:54:00,2020-04-14 22:54:00,225.0,4523.428571,2
5,2020-05-16 18:36:00,2020-05-16 17:36:00,2020-05-16 19:36:00,224.222222,4495.194444,34
6,2020-05-05 18:01:00,2020-05-05 17:01:00,2020-05-05 19:01:00,193.75,4463.357143,23
7,2020-06-10 22:58:00,2020-06-10 21:58:00,2020-06-10 23:58:00,191.444444,4377.027778,59
8,2020-04-18 14:12:00,2020-04-18 13:12:00,2020-04-18 15:12:00,179.0,4142.666667,6


In [18]:
# Display the whole-series glucose variance assigned to `base` in the previous cell.
base

1661.799420195768

In [None]:
# For every input file, draw the days that contain its three highest-variance
# windows. Each file is preprocessed and ranked once (previously both steps
# were repeated for every plotted day), and a plain loop is used so the cell
# does not echo a nested list of None values.
for file in filelist:
    origin = preprocessing(file)
    top3 = show_var(origin, 1, 600, 10)[:3]
    # The top windows may share a day, so plot only as many days as exist.
    n_days = len(set(pd.DatetimeIndex(top3['target']).date))
    for i in range(min(3, n_days)):
        top10_graph(origin, file, top3, i)