In [20]:
import turicreate as tc
import pandas as pd
from datetime import datetime, date

In [28]:
data = pd.read_csv('/Users/computerscience/Downloads/obc_for_students.csv')
data = data.set_index('Unnamed: 0')

In [29]:
data

Unnamed: 0_level_0,start,end,stream,new_values,user_id
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2020-03-03 00:00:00,2020-03-03 00:13:00,battery_percentage,73.000000,0.0
1,2020-03-03 00:13:00,2020-03-03 00:26:00,battery_percentage,72.411955,0.0
2,2020-03-03 00:26:00,2020-03-03 00:26:00,battery_percentage,71.823910,0.0
3,2020-03-03 00:26:00,2020-03-03 00:27:00,battery_percentage,71.235865,0.0
4,2020-03-03 00:27:00,2020-03-03 00:27:00,battery_percentage,70.647820,0.0
...,...,...,...,...,...
41195662,2020-03-24 20:25:00,2020-03-24 20:28:00,battery_percentage,66.497742,10149.0
41195663,2020-03-24 20:28:00,2020-03-24 20:29:00,battery_percentage,69.619920,10149.0
41195664,2020-03-24 20:29:00,2020-03-24 20:35:00,battery_percentage,72.248871,10149.0
41195665,2020-03-24 20:35:00,2020-03-24 20:36:00,battery_percentage,75.497742,10149.0


In [30]:
def insertStartAndEndFeatures(start, end):
    dayOfWeekStart = []
    dayOfWeekEnd = []
    timeOfDayStart = []
    timeOfDayEnd = []
    
    for startDateTime in start:
        date_, time = startDateTime.split(' ')
        year, month, day = date_.split('-')
        
        timeOfDayStart.append(time)
        
        weekday = date(int(year), int(month), int(day)).weekday()
        dayOfWeekStart.append(weekday)
        
        
    for endDateTime in end:
        date_, time = endDateTime.split(' ')
        year, month, day = date_.split('-')
        
        timeOfDayEnd.append(time)
        
        weekday = date(int(year), int(month), int(day)).weekday()
        dayOfWeekEnd.append(weekday)
        
    return (dayOfWeekStart, dayOfWeekEnd, timeOfDayStart, timeOfDayEnd)

def addStartAndEndFeatures(df):
    df['day_of_week_start'], df['day_of_week_end'], df['time_start'], df['time_end']  = insertStartAndEndFeatures(df["start"].to_numpy(), df['end'].to_numpy())
    print("Done adding start and end deatures")


In [31]:
def insertBatteryFeatures(stream, value):
    isPluggedIn = False
    pluggedIn = []
    percentage = []
    for i in range(len(stream)):
        streamState, streamValue = stream[i], value[i]
        if streamState == 'plugged_in':
            if streamValue == 1.0: 
                isPluggedIn = True
            else:
                isPluggedIn = False
        percentage.append(streamValue)
        pluggedIn.append(isPluggedIn)
    return (pluggedIn, percentage)

def addBatteryFeatures(df):
    df['pluggedIn'], df['percentage'] = insertBatteryFeatures(df['stream'].to_numpy(), df['new_values'].to_numpy())
    print("Done adding battery deatures")


In [32]:
def getTimeOfDay(time):
    hour = int(time[:2])
    if 0 <= hour < 6: return 'midnight'
    if 6 <= hour < 12: return 'morning'
    if 12 <= hour < 18: return 'afternoon'
    if hour >= 18: return 'night'
    return 'N/A'
        
    
def getInitialCycleDuration(stream, startTime, endTime):
    #Did not account that initial may span multiple days
    FMT = '%H:%M:%S'
    for i in range(len(startTime)):
        streamState, start, end = stream[i], startTime[i], endTime[i]
        if streamState == 'plugged_in':
            tdelta = datetime.strptime(start, FMT) - datetime.strptime("00:00:00", FMT)
            return tdelta.total_seconds()

def getTimeDelta(start, end):
    FMT = '%H:%M:%S'
    #Account for different days where startTime > end Time
    tdelta = datetime.strptime(end, FMT) - datetime.strptime(start, FMT)
    if tdelta.total_seconds() < 0:
        restOfDay = datetime.strptime('23:59:59', FMT) - datetime.strptime(start, FMT)
        nextDay = datetime.strptime(end, FMT) - datetime.strptime('00:00:00', FMT)
        return restOfDay.total_seconds() + nextDay.total_seconds()
    else:
        return tdelta.total_seconds()
    
        
def insertDurationFeature(stream, startTime, endTime):
    cycleDuration = getInitialCycleDuration(stream, startTime, endTime)
    duration = []
    cycleLength = []
    timeOfDay = []
    for i in range(len(startTime)):
        streamState, start, end = stream[i], startTime[i], endTime[i]
        
        timeDifference = getTimeDelta(start, end)
        
        if streamState == 'plugged_in':
            cycleDuration = timeDifference
        
        cycleLength.append(int(cycleDuration))     
        duration.append(int(timeDifference))
        timeOfDay.append(getTimeOfDay(start))
        
    return (duration, cycleLength, timeOfDay)

def addDurationFeature(df):
    df['event_duration'], df['cycle_duration'], df['time_of_day'] = insertDurationFeature(df["stream"].to_numpy(), df["time_start"].to_numpy(), df['time_end'].to_numpy())
    print("Done adding duration deatures")
                             
                             

In [33]:
def rowsToDrop(stream):
    indexesToDrop = []
    for i in range(len(stream)):
        if stream[i] != 'battery_percentage':
            indexesToDrop.append(i)
    return indexesToDrop

def dropPluggedInRows(df):
    df.drop (
        labels = rowsToDrop(df['stream'].to_numpy()),
        axis = 0,
        inplace = True
    )
    df.drop (
        labels = ['stream', 'new_values', 'start', 'end'],
        axis = 1,
        inplace = True
    )
    df.reset_index(drop=True, inplace=True)
    print("Done dropping deatures")

In [34]:
addStartAndEndFeatures(data)
addBatteryFeatures(data)
addDurationFeature(data)
dropPluggedInRows(data)

In [35]:
dataSFrame = tc.SFrame(data)
dataSFrame

user_id,day_of_week_start,day_of_week_end,time_start,time_end,pluggedIn,percentage,event_duration
0.0,1,1,00:00:00,00:13:00,0,73.0,780
0.0,1,1,00:13:00,00:26:00,0,72.41195507496722,780
0.0,1,1,00:26:00,00:26:00,0,71.82391011579443,0
0.0,1,1,00:26:00,00:27:00,0,71.23586512248163,60
0.0,1,1,00:27:00,00:27:00,0,70.6478200950288,0
0.0,1,1,00:27:00,00:38:00,0,70.05977503343598,660
0.0,1,1,00:38:00,01:31:00,0,69.47172993770315,3180
0.0,1,1,01:31:00,01:41:00,0,68.8836848078303,600
0.0,1,1,01:41:00,01:55:00,0,68.29563964381745,840
0.0,1,1,01:55:00,02:15:00,0,67.7075944456646,1200

cycle_duration,time_of_day
17580,midnight
17580,midnight
17580,midnight
17580,midnight
17580,midnight
17580,midnight
17580,midnight
17580,midnight
17580,midnight
17580,midnight


In [36]:
train_data, test_Data = dataSFrame.random_split(0.8)
features = ['user_id', 'day_of_week_start', 'day_of_week_end', 'time_start', 'time_end', 'pluggedIn', 'percentage', 'event_duration', 'time_of_day']
target_feature = 'cycle_duration'

# Linear Regression

In [37]:
linear_regression_model = tc.linear_regression.create(train_data, target= target_feature, features= features)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [38]:
linear_regression_results = linear_regression_model.evaluate(test_Data)
linear_regression_results

{'max_error': 122150.85841608663, 'rmse': 18179.42700135423}

# Boosted Trees Regression

In [45]:
boosted_trees_model = tc.boosted_trees_regression.create(train_data, target=target_feature, features= features)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [46]:
boosted_trees_results = boosted_trees_model.evaluate(test_Data)
boosted_trees_results

{'max_error': 77418.9501953125, 'rmse': 17767.745488042015}