# Data Processing

## Load Data


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
import plotly.graph_objects as go

In [None]:
user = 'P3016/'
route = '/content/drive/My Drive/DP_data processing/'+user
# 경로 불러올 때 route+('파일이름.csv') 쓰면 됨

In [None]:
in_folder = os.listdir(route)

aUSE_files = []
aUSE_ENENT_files = []
dEE_files = []
for file in in_folder:
  if "AppUsageStatEntity" in file:
    aUSE_files.append(file)
  if "AppUsageEventEntity" in file:
    aUSE_ENENT_files.append(file)
  if "DeviceEventEntity" in file:
    dEE_files.append(file)

# 정렬
aUSE_files = sorted(aUSE_files)
aUSE_ENENT_files = sorted(aUSE_ENENT_files)
dEE_files = sorted(dEE_files)

# Doyun Data Processing
1. app_usage_time.csv
2. app_usage_hour.csv

get_app_usage_df: read_csv 데이터의 preprocessing <br/>
i = 몇번째 파일을 사용할 것인가 (0 ~ len(aUSE_files)-1 = 6) <br/>
프로세싱 내용
1. 중복된 timestamp 제거
2. 변화량이 없는 redundant 데이터 제거
3. 측정 기준 시간을 00:00 으로 변경
4. timestamp를 to_datetime으로 변경

## app usage data correction

In [None]:
def get_app_usage_df(i):
  app_usage_df = pd.read_csv(route+aUSE_files[i])
  app_usage_df = app_usage_df.loc[(app_usage_df.isSystemApp == False) | (app_usage_df.packageName.isin(['com.android.chrome']))] 
  app_usage_df = app_usage_df.drop(['isUpdatedSystemApp', 'isSystemApp', 'lastTimeUsed'], axis=1)
  revised_app_usage_df = pd.DataFrame(columns=app_usage_df.columns)
  # 3. 측정 기준 시간을 00:00 으로 변경
  for name, group in app_usage_df.groupby('name'):
    #1. 중복된 timestamp 제거
    group = group.drop_duplicates(subset='timestamp', keep='first')
    #2. 변화량이 없는 redundant 데이터 제거
    group = group.loc[(group['totalTimeForeground'].diff().fillna(0) != 0) | (group['totalTimeForeground'].diff(-1).fillna(0) != 0)]
    diff = group['totalTimeForeground'].diff().fillna(0)
    if group.shape[0] == 0 : continue
    decreasing_mask = (diff < 0)
    max_in_middle = 0
    start_total_time = -group.iloc[0]['totalTimeForeground']
    is_decrese_detected = False
    group_row_len = group.shape[0]
    for index, row in group.iterrows():
      if decreasing_mask[index]:
        start_total_time += max_in_middle
        is_decrese_detected = True
      elif not is_decrese_detected:
        max_in_middle = group.loc[index,'totalTimeForeground']
      group.loc[index,'totalTimeForeground'] += start_total_time;
    revised_app_usage_df = pd.concat([revised_app_usage_df, group])
  # 4. timestamp를 to_datetime으로 변경
  revised_app_usage_df['timestamp'] = pd.to_datetime(revised_app_usage_df['timestamp'], unit='ms')
  return revised_app_usage_df

모든 파일을 preprocessing 된 dataframe list로 변경

In [None]:
app_usage_df_list = [get_app_usage_df(i) for i in range(len(aUSE_files))] 

In [60]:
app_usage_df_list[0]

Unnamed: 0,timestamp,name,packageName,startTime,endTime,totalTimeForeground
481,2019-04-30 04:01:57.477,Chrome,com.android.chrome,1556535340861,1556596886222,0
495,2019-04-30 04:06:57.477,Chrome,com.android.chrome,1556535340861,1556597200467,193689
529,2019-04-30 04:16:57.477,Chrome,com.android.chrome,1556535340861,1556597200467,193689
549,2019-04-30 04:21:57.477,Chrome,com.android.chrome,1556535340861,1556598143666,195127
711,2019-04-30 05:01:57.477,Chrome,com.android.chrome,1556535340861,1556598143666,195127
...,...,...,...,...,...,...
2961,2019-04-30 13:26:57.477,캘린더,com.google.android.calendar,1556621878318,1556631247623,152862
3111,2019-04-30 14:16:57.477,캘린더,com.google.android.calendar,1556621878318,1556631247623,152862
3129,2019-04-30 14:21:57.477,캘린더,com.google.android.calendar,1556621878318,1556634894342,157340
3371,2019-04-30 15:26:57.477,캘린더,com.google.android.calendar,1556621878318,1556634894342,157340


## get_app_usage_time
- app_usage_time.csv를 위한 dataframe을 만드는 함수
- only_top5 = applist 중 top5만 가져올 것인가 (default = False)

In [52]:
def get_app_usage_time(only_top5 = False):
  app_usage_time_total = pd.DataFrame(columns=['date'])
  for app_usage_df_day2 in app_usage_df_list:
    today = app_usage_df_day2.iloc[0]['timestamp'].date()
    app_usage_df_day2 = app_usage_df_day2.groupby(['name', 'packageName']).agg({'totalTimeForeground': 'max'}).reset_index()
    others_total = 0
    if (only_top5):
      total_time_package_name = app_usage_df_day2.groupby('packageName').agg({'totalTimeForeground': 'max'}).reset_index();
      top_package_names = total_time_package_name.nlargest(5, 'totalTimeForeground').packageName.tolist()
      top_package_names_global = top_package_names
      others_total = app_usage_df_day2.loc[~app_usage_df_day2['packageName'].isin(top_package_names), 'totalTimeForeground'].sum() / 60000
      app_usage_df_day2 = app_usage_df_day2[app_usage_df_day2['packageName'].isin(top_package_names)]
    app_usage_df_day2.drop('packageName', axis=1)
    new_df = pd.DataFrame(columns=app_usage_df_day2['name'].unique())
    new_df.loc[0] = app_usage_df_day2['totalTimeForeground'].values / 60000
    if (only_top5): new_df['Others'] = others_total
    new_df['Total'] = new_df.sum(axis=1);
    new_df['date'] = today
    app_usage_time_total = pd.concat([app_usage_time_total, new_df], join='outer')
  if (not only_top5): app_usage_time_total = app_usage_time_total.fillna(0)
  return app_usage_time_total

In [53]:
get_app_usage_time().head()

Unnamed: 0,date,Chrome,Facebook,Instagram,KAIST\tPortal,Logger,Paco,Polar Beat,Puffin,원터치알림,카카오톡,캐시워크,캘린더,Total,과학적인 계산기,ABC Logger,원터치개인
0,2019-04-30,11.12685,42.693883,3.612883,1.6272,0.523567,5.504433,3.76215,4.585933,0.06365,131.2196,14.861717,2.673383,222.25525,0.0,0.0,0.0
0,2019-05-01,0.75845,42.110617,0.0,0.0,0.060233,2.767183,0.930333,0.0,0.231567,42.041483,8.714517,0.922267,102.053267,3.516617,0.0,0.0
0,2019-05-02,2.59105,20.049433,0.0,0.0,0.633833,5.196333,1.135883,0.0,0.408383,68.590317,14.35635,1.836967,115.651183,0.0,0.852633,0.0
0,2019-05-03,1.644467,30.289183,0.0,0.050133,0.845933,4.756817,4.442217,0.0,0.377833,70.656483,7.98415,0.359883,121.923083,0.0,0.0,0.515983
0,2019-05-04,1.121667,500.172833,0.0,0.0,0.315283,2.71275,2.682417,0.0,0.0,27.079583,6.419883,0.205967,540.710383,0.0,0.0,0.0


In [None]:
get_app_usage_time(True).head()

Unnamed: 0,date,Chrome,Facebook,Paco,카카오톡,캐시워크,Others,Total,과학적인 계산기,Polar Beat
0,2019-04-30,11.12685,42.693883,5.504433,131.2196,14.861717,16.848767,222.25525,,
0,2019-05-01,,42.110617,2.767183,42.041483,8.714517,2.90285,102.053267,3.516617,
0,2019-05-02,2.59105,20.049433,5.196333,68.590317,14.35635,4.8677,115.651183,,
0,2019-05-03,,30.289183,4.756817,70.656483,7.98415,3.794233,121.923083,,4.442217
0,2019-05-04,,500.172833,2.71275,27.079583,6.419883,1.642917,540.710383,,2.682417


## get_app_usage_weekly
- 주간 top5 앱들에 대한 기록을 가져오는 함수

In [None]:
def get_app_usage_weekly():
    app_usaeg_time = get_app_usage_time()
    weekly_top_apps = app_usaeg_time.drop(['date', 'Total'], axis=1).sum()
    # print(weekly_top_apps.nlargest(5))
    weekly_top_apps_list = ['date', 'Total'] + weekly_top_apps.nlargest(5).index.to_list()
    df_dropped = app_usaeg_time.drop(columns = weekly_top_apps_list)
    df_dropped_sum = df_dropped.sum(axis=1)
    app_usaeg_time = app_usaeg_time[weekly_top_apps_list]
    app_usaeg_time['Others'] = df_dropped_sum
    return app_usaeg_time

In [None]:
weekly_df = get_app_usage_weekly()
weekly_df

Unnamed: 0,date,Total,Facebook,카카오톡,캐시워크,Chrome,Paco,Others
0,2019-04-30,222.25525,42.693883,131.2196,14.861717,11.12685,5.504433,16.848767
0,2019-05-01,102.053267,42.110617,42.041483,8.714517,0.75845,2.767183,5.661017
0,2019-05-02,115.651183,20.049433,68.590317,14.35635,2.59105,5.196333,4.8677
0,2019-05-03,121.923083,30.289183,70.656483,7.98415,1.644467,4.756817,6.591983
0,2019-05-04,540.710383,500.172833,27.079583,6.419883,1.121667,2.71275,3.203667
0,2019-05-05,258.607333,130.474117,90.925217,14.521317,12.6497,6.008717,4.028267
0,2019-05-06,194.203233,113.39225,56.020683,16.05195,2.966167,3.067117,2.705067


## get_app_usage_hour
- app_usage_hour.csv에 해당하는 df 를 반환하는 함수<br/>
- i = 몇번째 파일을 사용할 것인가 (0 ~ len(aUSE_files)-1 = 6)

In [None]:
import datetime as dt
def get_app_usage_hour(i):
  app_usage_hour_df= app_usage_df_list[i]
  total_time_package_name = app_usage_hour_df.groupby('packageName').agg({'totalTimeForeground': 'max'}).reset_index();
  top_package_names = total_time_package_name.nlargest(5, 'totalTimeForeground').packageName.tolist()
  print(top_package_names)
  app_usage_hour_df = app_usage_hour_df[app_usage_hour_df['packageName'].isin(top_package_names)]
  app_usage_hour_df['totalTimeForeground'] /= 60000
  app_usage_hour_df_pivot = app_usage_hour_df.pivot(index='timestamp', columns='name', values='totalTimeForeground').reset_index();
  return app_usage_hour_df_pivot

In [None]:
get_app_usage_hour(1).head()

['com.facebook.katana', 'com.kakao.talk', 'com.cashwalk.cashwalk', 'com.realmax.calc', 'com.pacoapp.paco']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app_usage_hour_df['totalTimeForeground'] /= 60000


name,timestamp,Facebook,Paco,과학적인 계산기,카카오톡,캐시워크
0,2019-05-01 00:16:57.477,0.0,,,0.0,0.0
1,2019-05-01 00:21:57.477,8.886967,,,0.174433,0.257083
2,2019-05-01 02:16:57.477,,,,0.174433,0.257083
3,2019-05-01 02:21:57.477,,,,0.642717,0.560967
4,2019-05-01 03:51:57.477,8.886967,,,0.642717,0.560967


## csv export

In [None]:
user_saved = 'processed_'+user
file_saved_route = '/content/drive/My Drive/DP_data processing/'+user_saved
os.makedirs(file_saved_route, exist_ok=True)

In [None]:
def df_to_csv(df, file_name):
  df.to_csv(file_saved_route+file_name+'.csv')

In [None]:
date_list = ['2019_04_30', '2019_05_01', '2019_05_02', '2019_05_03', '2019_05_04', '2019_05_05', '2019_05_06']

df_to_csv(get_app_usage_time(True), 'app_usage_time')
df_to_csv(get_app_usage_weekly(), 'app_usage_weekly')
for i in range(len(app_usage_df_list)):
  df_to_csv(get_app_usage_hour(i), 'app_usage_hour_'+date_list[i])

['com.kakao.talk', 'com.facebook.katana', 'com.cashwalk.cashwalk', 'com.android.chrome', 'com.pacoapp.paco']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app_usage_hour_df['totalTimeForeground'] /= 60000


['com.facebook.katana', 'com.kakao.talk', 'com.cashwalk.cashwalk', 'com.realmax.calc', 'com.pacoapp.paco']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app_usage_hour_df['totalTimeForeground'] /= 60000


['com.kakao.talk', 'com.facebook.katana', 'com.cashwalk.cashwalk', 'com.pacoapp.paco', 'com.android.chrome']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app_usage_hour_df['totalTimeForeground'] /= 60000


['com.kakao.talk', 'com.facebook.katana', 'com.cashwalk.cashwalk', 'com.pacoapp.paco', 'fi.polar.beat']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app_usage_hour_df['totalTimeForeground'] /= 60000


['com.facebook.katana', 'com.kakao.talk', 'com.cashwalk.cashwalk', 'com.pacoapp.paco', 'fi.polar.beat']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app_usage_hour_df['totalTimeForeground'] /= 60000


['com.facebook.katana', 'com.kakao.talk', 'com.cashwalk.cashwalk', 'com.android.chrome', 'com.pacoapp.paco']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app_usage_hour_df['totalTimeForeground'] /= 60000


['com.facebook.katana', 'com.kakao.talk', 'com.cashwalk.cashwalk', 'com.pacoapp.paco', 'com.android.chrome']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app_usage_hour_df['totalTimeForeground'] /= 60000


# Data Processing Min

In [82]:
def get_app_usage_top5(i):
  app_usage_df=pd.read_csv(route+aUSE_files[i])
  ## app_usage_df = app_usage_df.loc[app_usage_df.isSystemApp == False] 유튜브도 시스템앱임...
  app_usage_df = app_usage_df.drop(['startTime', 'endTime', 'isUpdatedSystemApp', 'isSystemApp', 'lastTimeUsed'], axis=1)
  app_usage_df['timestamp'] = pd.to_datetime(app_usage_df['timestamp'], unit='ms')
  total_time_package_name = app_usage_df.groupby('packageName').agg({'totalTimeForeground': 'max'}).reset_index()
  top_package_names = total_time_package_name.nlargest(5, 'totalTimeForeground').packageName.tolist()
  return top_package_names

### 앱 접속횟수

In [83]:
top_package_names =get_app_usage_top5(0)
print(top_package_names)

['com.kakao.talk', 'com.facebook.katana', 'com.lge.launcher3', 'com.google.android.apps.docs.editors.docs', 'com.cashwalk.cashwalk']


In [84]:
def get_access_app_count_df(i):
  access_app_df=pd.read_csv(route+aUSE_ENENT_files[i])
  access_app_df = access_app_df.drop(['isSystemApp','isUpdatedSystemApp'], axis=1)
  access_app_df.loc[~access_app_df['packageName'].isin(top_package_names), 'name'] = 'others'
  access_app_df = access_app_df.loc[access_app_df['type'] == 'MOVE_TO_FOREGROUND']
  access_app_count_df = access_app_df.groupby('name').count().reset_index()
  access_app_count_df = access_app_count_df.rename(columns={'type': 'number_of_access'}).drop(['timestamp','packageName'], axis=1)
  return access_app_count_df

In [85]:
access_app_count_df = get_access_app_count_df(1)

fig = go.Figure()
#Make a basic line plot which shows the trend of Co2 emissions
fig.add_trace(go.Bar(x=access_app_count_df['name'], y=access_app_count_df['number_of_access']))


fig.update_layout(
    title='Number of Access'
)

In [86]:
dfs = [get_access_app_count_df(i) for i in range(len(aUSE_ENENT_files))]

# 생성된 데이터프레임들을 concat 함수를 사용하여 합치기
df_concat = pd.concat(dfs, axis=0, keys=[i for i in range(len(aUSE_ENENT_files))], names=['Key']).reset_index()

# 결과 출력
df_concat

Unnamed: 0,Key,level_1,name,number_of_access
0,0,0,Facebook,71
1,0,1,others,410
2,0,2,기본홈,247
3,0,3,문서,9
4,0,4,카카오톡,1059
5,0,5,캐시워크,568
6,1,0,Facebook,40
7,1,1,others,193
8,1,2,기본홈,163
9,1,3,카카오톡,738


In [88]:
import plotly.express as px


#Make a basic line plot which shows the trend of Co2 emissions
fig = px.bar(df_concat, x='Key', y='number_of_access', color='name',
             title='Stacked Bar Chart',
             barmode='stack')

fig.show()

### 일주일 UNLOCK 횟수

In [89]:
for i in range(0, len(dEE_files)):
  screen_on_df=pd.read_csv(route+dEE_files[i])
  screen_on_df = screen_on_df.groupby('type').count().reset_index()
  screen_on_df = screen_on_df.rename(columns={'timestamp': 'count'})
  print(screen_on_df[screen_on_df['type'] == 'UNLOCK']['count'].values[0])

149
127
132
86
31
102
111


# Data Processing Taehyeong 

- 사이즈가 너무 커서 현재 실행 중 종료

## Total_user_usage.csv



DeviceEventEntity 파일들 하나로 합치기



In [90]:
import os
import pandas as pd
from datetime import timezone, timedelta, datetime

utc_timezone = timezone.utc
tz = timezone(timedelta(hours=9))
in_folder = os.listdir(route)

dEE_files = []
for file in in_folder:
  if "DeviceEventEntity" in file:
    dEE_files.append(file)

# 정렬
dEE_files = sorted(dEE_files)
df_csv_append = pd.DataFrame()

for file in dEE_files:
    df = pd.read_csv(route+file)
    df_csv_append = df_csv_append.append(df, ignore_index=True)



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.




### 1. 평균 세션당 사용량 = 하루 사용량 / 세션

In [91]:
# assuming df_csv_append is your DataFrame
session_df = df_csv_append
# sort values by timestamp
session_df = session_df.sort_values('timestamp')

# calculate the difference between SCREEN_OFF and the previous UNLOCK
session_df['session_time'] = session_df.loc[session_df['type'] == 'SCREEN_OFF', 'timestamp'] - session_df.loc[session_df['type'].shift() == 'UNLOCK', 'timestamp'].shift()

# drop null values and remove negative time differences (in case the events are out of order)
session_df = session_df.dropna(subset=['session_time'])
session_df['timestamp'] = pd.to_datetime(session_df['timestamp'], unit='ms') 

# resample to daily frequency, calculating the mean of the session_time
session_df_daily = session_df.resample('D', on='timestamp').session_time.mean() / 60000
session_df_daily = session_df_daily.round().astype(int)

# reset the index (which contains the date) and rename the columns for clarity
session_df_daily = session_df_daily.reset_index()
session_df_daily.columns = ['Date', 'AverageSessionDuration']
print(session_df_daily)

        Date  AverageSessionDuration
0 2019-04-30                       6
1 2019-05-01                       6
2 2019-05-02                      14
3 2019-05-03                       4
4 2019-05-04                      79
5 2019-05-05                      14
6 2019-05-06                      10


### 2. 하루 사용량

In [92]:
# assuming df_csv_append is your DataFrame
usage_df = df_csv_append
# print(usage_df)
# sort values by timestamp
usage_df = usage_df.sort_values('timestamp')

# calculate the difference between SCREEN_OFF and the previous UNLOCK
usage_df['usage_time'] = usage_df.loc[usage_df['type'] == 'SCREEN_OFF', 'timestamp'] - usage_df.loc[usage_df['type'].shift() == 'SCREEN_ON', 'timestamp'].shift()
# drop null values and remove negative time differences (in case the events are out of order)
usage_df = usage_df.dropna(subset=['usage_time'])
usage_df['timestamp'] = pd.to_datetime(usage_df['timestamp'], unit='ms')

# resample to daily frequency, summing up the usage_time
usage_df_daily = usage_df.resample('D', on='timestamp').usage_time.sum() / 60000
# print(usage_df_daily)
usage_df_daily = usage_df_daily.round().astype(int)

# reset the index (which contains the date) and rename the columns for clarity
usage_df_daily = usage_df_daily.reset_index()
usage_df_daily.columns = ['Date', 'TotalUsageTime']

# print(usage_df_daily)

### 3-1. make a total_usage_time.csv file

In [93]:
mean_usage_time = usage_df_daily['TotalUsageTime'].mean().round().astype(int)
mean_session_time = session_df_daily['AverageSessionDuration'].mean().round().astype(int)
user_id = user.rstrip('/')

concat_df = pd.DataFrame({'User': [user_id], 'usage_time': [mean_usage_time], 'session_duration_time': [mean_session_time]})
print(concat_df)

    User  usage_time  session_duration_time
0  P3016        1047                     19


iteration for all users(세부 기능은 위의 코드들이랑 같음)

In [95]:
import os
import pandas as pd
from datetime import timezone, timedelta, datetime

utc_timezone = timezone.utc
tz = timezone(timedelta(hours=9))

folder_route = '/content/drive/My Drive/DP_data processing/'
in_main_folder = os.listdir(folder_route)

result_dfs = []
for folder in in_main_folder:
  if folder == 'processed_P3016': continue
  user = folder+'/'
  print(user)
  route = folder_route+user
  in_folder = os.listdir(route)

  aUSE_files = []
  for file in in_folder:
    if "DeviceEventEntity" in file:
      aUSE_files.append(file)

  dEE_files = sorted(dEE_files)
  df_csv_append = pd.DataFrame()

  for file in dEE_files:
      df = pd.read_csv(route+file)
      df_csv_append = df_csv_append.append(df, ignore_index=True)

  session_df = df_csv_append
  print(session_df)
  session_df = session_df.sort_values('timestamp')
  session_df['session_time'] = session_df.loc[session_df['type'] == 'SCREEN_OFF', 'timestamp'] - session_df.loc[session_df['type'].shift() == 'UNLOCK', 'timestamp'].shift()
  session_df = session_df.dropna(subset=['session_time'])
  session_df['timestamp'] = pd.to_datetime(session_df['timestamp'], unit='ms') 

  session_df_daily = session_df.resample('D', on='timestamp').session_time.mean() / 60000
  session_df_daily = session_df_daily.dropna()
  session_df_daily = session_df_daily.round().astype(int)
  session_df_daily = session_df_daily.reset_index()
  session_df_daily.columns = ['Date', 'AverageSessionDuration']

  usage_df = df_csv_append
  usage_df = usage_df.sort_values('timestamp')
  usage_df['usage_time'] = usage_df.loc[usage_df['type'] == 'SCREEN_OFF', 'timestamp'] - usage_df.loc[usage_df['type'].shift() == 'UNLOCK', 'timestamp'].shift()
  usage_df = usage_df.dropna(subset=['usage_time'])
  usage_df['timestamp'] = pd.to_datetime(usage_df['timestamp'], unit='ms') 

  usage_df_daily = usage_df.resample('D', on='timestamp').usage_time.sum() / 60000
  # print(usage_df_daily.astype(float).round())
  usage_df_daily = usage_df_daily.astype(float).round().astype(int)
  usage_df_daily = usage_df_daily.reset_index()
  usage_df_daily.columns = ['Date', 'TotalUsageTime']

  mean_usage_time = usage_df_daily['TotalUsageTime'].mean().round().astype(int)
  mean_session_time = session_df_daily['AverageSessionDuration'].mean().round().astype(int)
  user_id = user.rstrip('/')

  concat_df = pd.DataFrame({'User': [user_id], 'usage_time': [mean_usage_time], 'session_duration_time': [mean_session_time]})
  result_dfs.append(concat_df)

final_df = pd.concat(result_dfs)

P3041/
          timestamp        type
0     1556582644835   SCREEN_ON
1     1556583362272  SCREEN_OFF
2     1556583542432   SCREEN_ON
3     1556583665387  SCREEN_OFF
4     1556584204093   SCREEN_ON
...             ...         ...
1962  1557168735622   SCREEN_ON
1963  1557168747405  SCREEN_OFF
1964  1557175946783   SCREEN_ON
1965  1557175977779      UNLOCK
1966  1557177189568  SCREEN_OFF

[1967 rows x 2 columns]
P3025/
          timestamp        type
0     1556582638311   SCREEN_ON
1     1556582644658      UNLOCK
2     1556584045521  SCREEN_OFF
3     1556584207349   SCREEN_ON
4     1556584209924      UNLOCK
...             ...         ...
3714  1557146847895   SCREEN_ON
3715  1557146849155      UNLOCK
3716  1557146889821  SCREEN_OFF
3717  1557147654100   SCREEN_ON
3718  1557147656962      UNLOCK

[3719 rows x 2 columns]
P3029/
          timestamp        type
0     1556582474331   SCREEN_ON
1     1556582477667  SCREEN_OFF
2     1556582499924   SCREEN_ON
3     1556582499968      UNLOCK
4


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

P3030/
          timestamp        type
0     1556582442270   SCREEN_ON
1     1556582448042      UNLOCK
2     1556582460536  SCREEN_OFF
3     1556582629806   SCREEN_ON
4     1556582633248      UNLOCK
...             ...         ...
4726  1557149339453   SCREEN_ON
4727  1557149339877      UNLOCK
4728  1557149500403  SCREEN_OFF
4729  1557149521296      UNLOCK
4730  1557149521549   SCREEN_ON

[4731 rows x 2 columns]
P3028/
          timestamp        type
0     1556582540054   SCREEN_ON
1     1556582542012      UNLOCK
2     1556582549685  SCREEN_OFF
3     1556582704191   SCREEN_ON
4     1556582704828      UNLOCK
...             ...         ...
3879  1557151645304   SCREEN_ON
3880  1557151647123      UNLOCK
3881  1557151684701  SCREEN_OFF
3882  1557151779142   SCREEN_ON
3883  1557151780217      UNLOCK

[3884 rows x 2 columns]
P3018/



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

          timestamp                type
0     1556583623489           SCREEN_ON
1     1556583632551              UNLOCK
2     1556583809900  POWER_DISCONNECTED
3     1556583812809          SCREEN_OFF
4     1556583822389           SCREEN_ON
...             ...                 ...
3608  1557183013337          SCREEN_OFF
3609  1557184846520           SCREEN_ON
3610  1557184854211          SCREEN_OFF
3611  1557185411546           SCREEN_ON
3612  1557185416076          SCREEN_OFF

[3613 rows x 2 columns]
P3016/
          timestamp             type
0     1556583033374        SCREEN_ON
1     1556583045004       SCREEN_OFF
2     1556584354942        SCREEN_ON
3     1556584366740       SCREEN_OFF
4     1556584769997        SCREEN_ON
...             ...              ...
2866  1557187155885           UNLOCK
2867  1557187156171        SCREEN_ON
2868  1557187174641       SCREEN_OFF
2869  1557187175531        SCREEN_ON
2870  1557187177400  TURN_OFF_DEVICE

[2871 rows x 2 columns]
P3014/



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



          timestamp        type
0     1556582484694  SCREEN_OFF
1     1556582720914   SCREEN_ON
2     1556582722669      UNLOCK
3     1556582810636  SCREEN_OFF
4     1556582811394   SCREEN_ON
...             ...         ...
2118  1556855847487   SCREEN_ON
2119  1556855847793      UNLOCK
2120  1556855861989  SCREEN_OFF
2121  1556856102297      UNLOCK
2122  1556856102321   SCREEN_ON

[2123 rows x 2 columns]
P3019/



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



ParserError: ignored

In [None]:
final_df.to_csv('/content/drive/My Drive/total_user_usage.csv', index=False)

## total_usage_time_whole.csv

In [None]:
import os
import pandas as pd
from datetime import timezone, timedelta, datetime

utc_timezone = timezone.utc
tz = timezone(timedelta(hours=9))

folder_route = '/content/drive/My Drive/DP_data processing/'
in_main_folder = os.listdir(folder_route)

result_dfs = []
for folder in in_main_folder:
  if folder == 'processed_P3016': continue
  user = folder+'/'
  print(user)
  route = folder_route+user
  in_folder = os.listdir(route)

  dEE_files = []
  for file in in_folder:
    if "DeviceEventEntity" in file:
      dEE_files.append(file)

  dEE_files = sorted(dEE_files)
  df_csv_append = pd.DataFrame()

  for file in dEE_files:
      df = pd.read_csv(route+file)
      df_csv_append = df_csv_append.append(df, ignore_index=True)

  usage_df = df_csv_append
  usage_df = usage_df.sort_values('timestamp')
  usage_df['usage_time'] = usage_df.loc[usage_df['type'] == 'SCREEN_OFF', 'timestamp'] - usage_df.loc[usage_df['type'].shift() == 'UNLOCK', 'timestamp'].shift()
  usage_df = usage_df.dropna(subset=['usage_time'])
  usage_df['timestamp'] = pd.to_datetime(usage_df['timestamp'], unit='ms') 

  usage_df_daily = usage_df.resample('D', on='timestamp').usage_time.sum() / 60000
  usage_df_daily = usage_df_daily.astype(float).round().astype(int)
  usage_df_daily = usage_df_daily.reset_index()
  usage_df_daily.columns = ['Date', 'TotalUsageTime']

  user_id = user.rstrip('/')

  usage_df_daily['User'] = user_id
  result_dfs.append(usage_df_daily)

final_df = pd.concat(result_dfs)

### 수정된 whole 코드

In [None]:
def get_total_usage_df_with_df(app_usage_df):
  app_usage_df = app_usage_df.loc[(app_usage_df.isSystemApp == False) | (app_usage_df.packageName.isin(['com.android.chrome']))] 
  app_usage_df = app_usage_df.drop(['isUpdatedSystemApp', 'isSystemApp', 'lastTimeUsed'], axis=1)
  app_usage_df['totalTimeForeground'] = app_usage_df['totalTimeForeground'].astype('int64')
  revised_app_usage_df = pd.DataFrame(columns=app_usage_df.columns)
  # 3. 측정 기준 시간을 00:00 으로 변경
  for name, group in app_usage_df.groupby('name'):
    #1. 중복된 timestamp 제거
    group = group.drop_duplicates(subset='timestamp', keep='first')
    #2. 변화량이 없는 redundant 데이터 제거
    group = group.loc[(group['totalTimeForeground'].diff().fillna(0) != 0) | (group['totalTimeForeground'].diff(-1).fillna(0) != 0)]
    diff = group['totalTimeForeground'].diff().fillna(0)
    if group.shape[0] == 0 : continue
    decreasing_mask = (diff < 0)
    max_in_middle = 0
    start_total_time = -group.iloc[0]['totalTimeForeground']
    is_decrese_detected = False
    group_row_len = group.shape[0]
    for index, row in group.iterrows():
      if decreasing_mask[index]:
        start_total_time += max_in_middle
        is_decrese_detected = True
      elif not is_decrese_detected:
        max_in_middle = group.loc[index,'totalTimeForeground']
      group.loc[index,'totalTimeForeground'] += start_total_time;
    revised_app_usage_df = pd.concat([revised_app_usage_df, group])
  # 4. timestamp를 to_datetime으로 변경
  revised_app_usage_df['timestamp'] = pd.to_datetime(revised_app_usage_df['timestamp'], unit='ms')
  return revised_app_usage_df

In [None]:
import os
import pandas as pd
from datetime import timezone, timedelta, datetime

utc_timezone = timezone.utc
tz = timezone(timedelta(hours=9))

folder_route = '/content/drive/My Drive/DP_data processing/'
in_main_folder = os.listdir(folder_route)

result_dfs = []
passed = 20;
error_user = 'P1526/'
for folder in in_main_folder:
  user = folder+'/'
  # if error_user != user:
  #   continue;
  print(user)
  route = folder_route+user
  in_folder = os.listdir(route)

  files = []
  for file in in_folder:
    if "AppUsageStatEntity" in file:
      files.append(file)

  files = sorted(files)
  df_csv_append = pd.DataFrame()

  for file in files:
    origin_df = pd.read_csv(route+file);
    ## print(df.head())
    if origin_df.empty: 
      print('empty')
      print(origin_df.head())
      continue
    df = get_total_usage_df_with_df(origin_df)
    date = df.iloc[0]['timestamp'].date()
    total_time_by_app = df.groupby('packageName').agg({'totalTimeForeground': 'max'}).reset_index();
    user_id = user.rstrip('/')
    data_dict = [[ date, total_time_by_app.sum()['totalTimeForeground'] / 60000, user_id ]]
    new_df = pd.DataFrame(data_dict, columns=['Date', 'TotalUsageTime', 'User'])
    result_dfs.append(new_df)

In [None]:
final_df = pd.concat(result_dfs)
final_df['TotalUsageTime'] = final_df['TotalUsageTime'].round().astype(int)
final_df

In [None]:
final_df.to_csv('/content/drive/My Drive/DP_data processing/processed_P3016/total_user_usage_whole.csv', index=False)

In [None]:
usage_final_df = pd.concat(result_dfs)
usage_final_df['TotalUsageTime'] = final_df['TotalUsageTime'].round()
mean_usage_time = usage_final_df.groupby('User')['TotalUsageTime'].mean().round().astype(int)
usage_final_df = pd.DataFrame({'User': mean_usage_time.index, 'usage_time': mean_usage_time}).reset_index(drop=True)
print(usage_final_df)

# final_df

In [None]:
import os
import pandas as pd
from datetime import timezone, timedelta, datetime

utc_timezone = timezone.utc
tz = timezone(timedelta(hours=9))

folder_route = '/content/drive/My Drive/DP_data processing/'
in_main_folder = os.listdir(folder_route)

result_dfs = []
for folder in in_main_folder:
  if folder == 'processed_P3016': continue
  user = folder+'/'
  print(user)
  route = folder_route+user
  in_folder = os.listdir(route)

  dEE_files = []
  for file in in_folder:
    if "DeviceEventEntity" in file:
      dEE_files.append(file)

  dEE_files = sorted(dEE_files)
  df_csv_append = pd.DataFrame()

  for file in dEE_files:
      df = pd.read_csv(route+file)
      df_csv_append = df_csv_append.append(df, ignore_index=True)

  session_df = df_csv_append
  # print(session_df)
  session_df = session_df.sort_values('timestamp')
  session_df['session_time'] = session_df.loc[session_df['type'] == 'SCREEN_OFF', 'timestamp'] - session_df.loc[session_df['type'].shift() == 'UNLOCK', 'timestamp'].shift()
  session_df = session_df.dropna(subset=['session_time'])
  session_df['timestamp'] = pd.to_datetime(session_df['timestamp'], unit='ms') 

  session_df_daily = session_df.resample('D', on='timestamp').session_time.mean() / 60000
  session_df_daily = session_df_daily.dropna()
  session_df_daily = session_df_daily.round().astype(int)
  session_df_daily = session_df_daily.reset_index()
  session_df_daily.columns = ['Date', 'AverageSessionDuration']

  mean_session_time = session_df_daily['AverageSessionDuration'].mean().round().astype(int)
  user_id = user.rstrip('/')

  concat_df = pd.DataFrame({'User': [user_id], 'session_duration_time': [mean_session_time]})
  result_dfs.append(concat_df)

session_final_df = pd.concat(result_dfs)

In [None]:
session_final_df = session_final_df.reset_index(drop=True)
final_df = pd.merge(usage_final_df, session_final_df, on='User')
final_df

In [None]:
final_df.to_csv('/content/drive/My Drive/DP_data processing/processed_P3016/total_user_usage.csv', index=False)

##goal_states.csv

In [96]:
import os
import pandas as pd
from datetime import timezone, timedelta, datetime
import numpy as np

folder_route = '/content/drive/My Drive/DP_data processing/processed_P3016/'
# in_main_folder = os.listdir(folder_route)
total_usage_df = pd.read_csv(folder_route+'total_user_usage_whole.csv')
unlock_df = pd.read_csv(folder_route+'unlocks.csv')
app_df = pd.read_csv(folder_route+'app_usage_weekly.csv')

group_states = total_usage_df[total_usage_df['User'] == 'P3016']
group_states = group_states.drop('User', axis=1)
group_states['total_usage_goal'] = [570, 780, 240, 270, 570, 420, 840]
unlock_values = unlock_df.iloc[:, 1].tolist()
group_states['unlock_real'] = unlock_values
group_states['unlock_goal'] = [70, 100, 80, 120, 60, 100, 120]
group_states['app_usage_app'] = ['카카오톡', 'Facebook', 'Paco', 'Facebook', 'Facebook', '카카오톡', 'Paco']
group_states['app_usage_real'] = [180, 90, 150, 150, 90, 40, 20]
group_states['app_usage_goal'] = [180, 90, 150, 150, 90, 40, 20]

for index, row in group_states.iterrows():
    app_name = row['app_usage_app']
    
    if app_name in app_df.columns:
        app_usage_real = app_df.loc[0, app_name]
        group_states.at[index, 'app_usage_real'] = int(app_usage_real.round())

group_states = group_states.reset_index(drop=True)
# print(group_states)
group_states.to_csv('/content/drive/My Drive/DP_data processing/processed_P3016/goal_states.csv', index=False)

# Testing or Unusable Code

## Task 3
1. 하루 전체 사용시간 (task2랑 중복) 
2. 수면 직전 핸드폰 사용시간
3. 특정 어플 목표 사용 시간
4. 화면 키는 빈도수 줄이기

In [None]:
import os
import pandas as pd
from datetime import timezone, timedelta, datetime

utc_timezone = timezone.utc
tz = timezone(timedelta(hours=9))
in_folder = os.listdir(route)

### 1. 하루 전체 사용시간

DeviceEventEntity 파일들 불러오기 및 시간 오름차순 정렬



In [None]:
dEE_files = []
for file in in_folder:
  if "DeviceEventEntity" in file:
    dEE_files.append(file)

# 정렬
dEE_files = sorted(dEE_files)

하루 동안 사용시간

In [None]:
# 모든 DeviceEventEntity file에 대해해
for file in dEE_files: 
  current_df = pd.read_csv(route+file) # 현재 file
  term = 0 # 사용량 저장할 변수(단위: 1000 * sec)
  timestamp = 0
  is_ON = 0

  for index, row in current_df.iterrows():
    if row['type'] == 'UNLOCK': # UNLOCK 되면 휴대폰 켜져있음(is_ON -> 1)
      is_ON = 1
      timestamp = row['timestamp'] 
    elif row['type'] == 'SCREEN_OFF': # SCREEN_OFF 됐을 때 시간 - UNLOCK 당시 시간을 모두 더함
      if is_ON:
        term += row['timestamp'] - timestamp
        is_ON = 0

  term = term // 1000
  m, s = divmod(term, 60)
  h, m= divmod(m, 60)
  
  print('하루 동안 사용량: {0:2d}시간 {1:2d}분 {2:2d}초'.format(h, m, s))

### 2. 수면 직전 핸드폰 사용시간

### SCREEN_ON/UNLOCK

DeviceEventEntity 파일들 불러오기 및 시간 오름차순 정렬



In [None]:
dEE_files = []
for file in in_folder:
  if "DeviceEventEntity" in file:
    dEE_files.append(file)

# 정렬
dEE_files = sorted(dEE_files)

제일 길었던 SCREEN_OFF -> UNLOCK 시간
(각 파일이 당일 오전 9시 ~ 익일 오전 8시까지 인 것 같은데 두 파일을 합쳐야할지..)

In [None]:
for file in dEE_files:
  current_df = pd.read_csv(route+file)
  term = 0
  timestamp = 0
  is_OFF = 0
  ts_bf_sleep = 0 # 잠들기 전 timestamp
  ts_aft_wake = 0 # 깨고 난 후 timestamp

  for index, row in current_df.iterrows():
    if row['type'] == 'UNLOCK': # screen이 꺼져있는 상태에서 UNLOCK 됐을 때 사이 기간이 제일 긴 구간을 저장장
      if (is_OFF and (term < (row['timestamp'] - timestamp))):
        term = row['timestamp'] - timestamp
        ts_bf_sleep = timestamp
        ts_aft_wake = row['timestamp']
      is_OFF = 0
    elif row['type'] == 'SCREEN_OFF': # SCREEN_OFF 된 시간을 저장장
      if not is_OFF:
        timestamp = row['timestamp']
      is_OFF = 1
  
  datetime_bf_sleep = datetime.fromtimestamp(ts_bf_sleep/1000, tz)
  datetime_aft_wake = datetime.fromtimestamp(ts_aft_wake/1000, tz)
  print(term, '자기 전 시간: ', datetime_bf_sleep, ' 깬 시간: ', datetime_aft_wake)

### ENTER_STILL/EXIT_STILL

PhysicalActivityTransitionEntity 파일들 불러오기 및 시간 오름차순 정렬



In [None]:
pATE_files = []
for file in in_folder:
  if "PhysicalActivityTransitionEntity" in file:
    pATE_files.append(file)

# 정렬
pATE_files = sorted(pATE_files)

제일 길었던 ENTER_STILL -> EXIT_STILL 시간

In [None]:
for file in pATE_files:
  current_df = pd.read_csv(route+file)
  term = 0
  timestamp = 0
  is_STILL = 0
  ts_bf_sleep = 0 # 잠들기 전 timestamp
  ts_aft_wake = 0 # 깨고 난 후 timestamp

  for index, row in current_df.iterrows():
    if row['transitionType'] == 'EXIT_STILL': # ENTER_STILL ~ EXIT_STILL이 제일 긴 구간을 저장장
      if (is_STILL and (term < (row['timestamp'] - timestamp))): 
        term = row['timestamp'] - timestamp
        ts_bf_sleep = timestamp
        ts_aft_wake = row['timestamp']
      is_STILL = 0
    elif row['transitionType'] == 'ENTER_STILL': # ENTER_STILL에 들어간 시간을 저장장
      if not is_STILL:
        timestamp = row['timestamp']
      is_STILL = 1
  
  datetime_bf_sleep = datetime.fromtimestamp(ts_bf_sleep/1000, tz)
  datetime_aft_wake = datetime.fromtimestamp(ts_aft_wake/1000, tz)
  print(term, '자기 전 시간: ', datetime_bf_sleep, ' 깬 시간: ', datetime_aft_wake)

### STILL 유지 시간

PhysicalActivityEventEntity 파일들 불러오기 및 시간 오름차순 정렬



In [None]:
pAEE_files = []
for file in in_folder:
  if "PhysicalActivityEventEntity" in file:
    pAEE_files.append(file)

# 정렬
pAEE_files = sorted(pAEE_files)
# pAEE_files

제일 길었던 STILL 구간(적당한 count 기준 수를 모르겠음..)

In [None]:
for file in pAEE_files:
  current_df = pd.read_csv(route+file)
  term = 0
  timestamp = 0
  is_STILL = 0
  ts_bf_sleep = 0
  ts_aft_wake = 0
  count = 0 # STILL 사이에 들어올 다른 type들을 몇개나 허용할건지

  for index, row in current_df.iterrows():
    if row['confidence'] < 0.5: # confidence 0.5 미만인 것들은 무시
      continue
    if row['type'] == 'STILL': # STILL 상태
      if (is_STILL): # 그 전에도 STILL 이었으면 그대로 유지
        is_STILL = 1
      else: # 아닌 경우에는 STILL에 들어간 시각을 저장
        timestamp = row['timestamp']
        is_STILL = 1
    else: # STILL 이외 상태
      if (is_STILL): # 이 전에 STILL이었으면 count 만큼의 오차는 허용해서 계속 STILL로 인식
        if count < 5:
          count += 1
        else:  # count 오차를 벗어났으면 STILL이 유지된 구간 저장
          term = max(term, (row['timestamp'] - timestamp))
          ts_bf_sleep = timestamp
          ts_aft_wake = row['timestamp']
          count = 0
          is_STILL = 0
      else:
        is_STILL = 0

  datetime_bf_sleep = datetime.fromtimestamp(ts_bf_sleep/1000, tz)
  datetime_aft_wake = datetime.fromtimestamp(ts_aft_wake/1000, tz)
  print(term, '자기 전 시간: ', datetime_bf_sleep, ' 깬 시간: ', datetime_aft_wake)

### 3. 특정 어플 목표 사용시간
(일단 너꺼 복붙해서 일자별 어플 사용시간만 놔뒀음)

AppUsageStatEntity 파일 filter 및 정렬

In [None]:
import plotly.graph_objects as go

aUSE_files = []
for file in in_folder:
  if "AppUsageStatEntity" in file:
    aUSE_files.append(file)

# 정렬
aUSE_files = sorted(aUSE_files)

for file in aUSE_files:
  current_df = pd.read_csv(route+file)
  app_usage_df = current_df.drop(['startTime', 'endTime', 'isUpdatedSystemApp', 'isSystemApp', 'lastTimeUsed'], axis=1)

  total_time_package_name = app_usage_df.groupby('packageName').agg({'totalTimeForeground': 'max'}).reset_index();
  top_package_names = total_time_package_name.nlargest(5, 'totalTimeForeground').packageName.tolist()
  app_usage_df.loc[~app_usage_df['packageName'].isin(top_package_names), 'name'] = 'others'

  total_time_name = app_usage_df.groupby('name').agg({'totalTimeForeground': 'max'}).reset_index();
  total_time_name['totalTimeForeground'] /= 3600000

  fig = go.Figure()

  # 그래프에 데이터 추가
  fig.add_trace(go.Bar(x=total_time_name['name'], y=total_time_name['totalTimeForeground']))

  # 그래프 레이아웃 설정
  fig.update_layout(title='Total Time Foreground by Package Name',
                    xaxis_title='Package Name',
                    yaxis_title='Total Time Foreground')

  # 그래프 출력
  fig.show()

  app_usage_top1 = app_usage_df.loc[app_usage_df['packageName'] == top_package_names[0]]
  app_usage_top1.head()

  fig = go.Figure()
  #Make a basic line plot which shows the trend of Co2 emissions
  fig.add_trace(go.Scatter(x=app_usage_top1['timestamp'], 
                          y=app_usage_top1['totalTimeForeground'] / 3600000,
                          mode='markers',
                          ))


  fig.update_layout(
      title='Screen Time: '+top_package_names[0],
  )


### 4. 화면 키는 빈도수 줄이기

In [None]:
dEE_files = []
for file in in_folder:
  if "DeviceEventEntity" in file:
    dEE_files.append(file)

# 정렬
dEE_files = sorted(dEE_files)

for file in dEE_files: 
  current_df = pd.read_csv(route+file) # 현재 file
  screen_on_df = current_df.groupby('type').count().reset_index()
  screen_on_df = screen_on_df.rename(columns={'timestamp': 'count'})
  screen_on_df = screen_on_df