<a href="https://colab.research.google.com/github/wafibismail/davis-busroutes/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import sklearn as sk
import sklearn.datasets as datasets
import numpy as np
from datetime import datetime

# for plotting
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go
from plotly.offline import iplot
from plotly.subplots import make_subplots

# for preparing data
from sklearn.preprocessing import OneHotEncoder

In [None]:
# LOAD DATA

# initialize a list containing each dataframe containing timestamps for each day
# Bus and route ids as well as place columns are dropped

# If you reset kernel, put these files into the content directory again

# Days of October 2023 for which a daily CSV is loaded (the list skips 09-12).
_CSV_DAYS = [*range(1, 9), *range(13, 31)]
# Identifier and place-name columns are dropped; only the timestamp columns remain.
_DROPPED_COLUMNS = ["bus_id", "route_id"] + [f"place{n}" for n in range(1, 10)]

# One DataFrame per day, in chronological order.
df = [
  pd.read_csv(f"2023-10-{day:02d}.csv").drop(columns=_DROPPED_COLUMNS)
  for day in _CSV_DAYS
]

# PREPARE AND CLEAN DATA

# Initialize array containing day of the week of each date
# 1st October 2023 is Sunday
# Let {0, 1, ... , 6} represent {Sunday, Monday, Tuesday, ... , Saturday}
# Day-of-month of every loaded file, in the same order as the `df` list:
# exactly one entry per file (1-8 and 13-30 October 2023).
days = np.array([1, 2, 3, 4, 5, 6, 7, 8, 13, 14, 15, 16, 17, 18, \
                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])
# 1 October 2023 is a Sunday, so (day - 1) % 7 gives Sunday -> 0 ... Saturday -> 6
days = (days-1)%7

# function to convert time strings to durations in seconds, one column at a time
def to_seconds(column):
  """Convert a column of 'HH:MM:SS' strings to seconds since midnight.

  Missing entries are replaced by a midnight timestamp, so they come out as 0.
  """
  stamps = pd.to_datetime(column, format='%H:%M:%S').fillna(datetime(2024, 1, 1))
  clock = stamps.dt
  return clock.hour*3600 + clock.minute*60 + clock.second

# convert the time strings to total duration in seconds
# Replace every time-string column of every daily frame by seconds since midnight
for daily in df:
  for col_name in daily.columns:
    daily[col_name] = to_seconds(daily[col_name])

# Instantiate dictionary to contain DataFrame columns:
# Column name -> list of values; one value per extracted move is appended later
t_dict = {
    name: []
    for name in (
        'move#', 'move',
        'day#', 'day',
        'time#', 'time', 'timeA',
        'idle_duration', 'move_duration',
        'previous_move_duration', 'previous_previous_move_duration',
    )
}
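#  move#: zero-based integer code of the move described next (0 for 1 -> 2, ..., 7 for 8 -> 9)
#  move: label of travel from THIS to NEXT station, e.g. 1 -> 2, 2 -> 3, ..., 8 -> 9
#  day#: number representing day of the week - 0 to 6
#  day : day of the week - Sunday, Monday, Tuesday, ..., Saturday
#  time#: index of the time-of-day bin that `time` falls into
#  time: time of day of the move in seconds since midnight, e.g. 8 o'clock -> 28800
#  timeA: letter label ("A" to "I") of the time-of-day bin
#  idle_duration: amount of time spent in THIS station
#  move_duration: amount of time spent travelling from THIS to NEXT station
#  previous_move_duration: travel time from the PREVIOUS station to THIS one (0 if unavailable)
#  previous_previous_move_duration: travel time of the move before that one (0 if unavailable)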

# Weekday names indexed by the day# code (0 = Sunday ... 6 = Saturday)
day_names = ['Sunday', 'Monday', 'Tuesday', \
             'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Bin boundaries of the day, expressed in hours
timebreaks_source = [6, 8, 10, 12, 13.5, 16, 17.5, 19.5, 21.5, 23.25, 24]
# Every boundary except the first, in seconds; used for discretizing time of day
timebreaks = [hour*60*60 for hour in timebreaks_source[1:]]


# Human-readable label of each move between consecutive stations
moveLabels = [f"{station} -> {station + 1}" for station in range(1, 9)]

# Letter label of each time-of-day bin, indexed by the time# code
time_letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I"]

# Walk over every day, every row and every pair of consecutive stations and
# collect one sample per move whose timestamps are all present.
for i in range(len(df)):
  n_rows = df[i].shape[0]
  for j in range(n_rows):
    # Columns are taken alternately: even positions as arrival ("in") times,
    # odd positions as departure ("out") times of the successive stations.
    t_in = df[i].iloc[j,0::2].values
    t_out = df[i].iloc[j,1::2].values

    for k in range(8):
      # A timestamp of 0 stands for a missing value (see to_seconds); skip the
      # move unless both stations have an arrival and a departure time.
      if not (all(t_in[k+0:k+2]) and all(t_out[k:k+2])):
        continue

      moveNo = k
      moveLabel = moveLabels[k]
      dayNo = days[i]
      day = day_names[dayNo]
      time = np.average([t_in[k+1], t_out[k]])
      # ^ midpoint between leaving this station and arriving at the next one
      idleDuration = t_out[k]-t_in[k]
      # ^ time out of this station - time in of this station
      moveDuration = t_in[k+1]-t_out[k]
      # ^ time in of next station - time out of this station
      prevMoveDuration = 0
      if k > 0:
        if all(t_in[k-1:k+1]) and all(t_out[k-1:k+1]):
          prevMoveDuration = t_in[k]-t_out[k-1]
      # ^ time in of this station - time out of previous station
      prevPrevMoveDuration = 0
      if k > 1:
        if all(t_in[k-2:k+1]) and all(t_out[k-2:k+1]):
          prevPrevMoveDuration = t_in[k-1]-t_out[k-2]
      # ^ time in of previous station - time out of previous^2 station

      # Moves with a very long duration (one exceeds 1500 s) are deliberately
      # kept: they may reflect a real road event and are valid context.

      # Position of the first boundary above `time`; len(timebreaks) if none is
      first_above = len(timebreaks)
      for l in range(len(timebreaks)):
        if time < timebreaks[l]:
          first_above = l
          break
      # Bin l-1 covers timebreaks[l-1] <= time < timebreaks[l]. Times before the
      # first boundary are clamped into bin 0 (they used to get the code -1 and,
      # through negative indexing, the letter of the last bin). Times at or past
      # the last boundary are clamped into the last bin so that every list
      # receives exactly one entry per sample.
      timeNo = min(max(first_above - 1, 0), len(time_letters) - 1)

      t_dict['move#'].append(moveNo)
      t_dict['move'].append(moveLabel)
      t_dict['day#'].append(dayNo)
      t_dict['day'].append(day)
      t_dict['time'].append(time)
      t_dict['time#'].append(timeNo)
      t_dict['timeA'].append(time_letters[timeNo])
      t_dict['idle_duration'].append(idleDuration)
      t_dict['move_duration'].append(moveDuration)
      t_dict['previous_move_duration'].append(prevMoveDuration)
      t_dict['previous_previous_move_duration'].append(prevPrevMoveDuration)

# One-hot encode the nominal columns (move, day of week, time bin).
#
# The encoders are fitted on the values in t_dict's own (insertion) order, so
# the encoded rows line up with the lists already stored in t_dict. Encoding a
# sorted copy and writing the result back into the unsorted dict would attach
# each sample to another sample's indicator columns.
time_bin_letters = ["A", "B", "C", "D", "E", "F", "G", "H", "I"]

def _one_hot(values, n_categories):
  """Return an (n_samples, n_categories) indicator matrix for integer codes.

  The categories are fixed to 0 .. n_categories-1, so column i always stands
  for code i even when some code never occurs in `values`; a code outside that
  range produces an all-zero row instead of shifting the columns.
  """
  encoder = OneHotEncoder(categories=[list(range(n_categories))],
                          handle_unknown='ignore', sparse_output=False)
  return encoder.fit_transform(np.array(values).reshape(-1, 1))

move_enc = _one_hot(t_dict['move#'], 8)
day_enc = _one_hot(t_dict['day#'], 7)
time_enc = _one_hot(t_dict['time#'], len(time_bin_letters))

for i in range(8):
  t_dict[f"{i+1} -> {i+2}"] = move_enc[:,i]

for i in range(7):
  t_dict[day_names[i]] = day_enc[:,i]

for i, letter in enumerate(time_bin_letters):
  t_dict["time_" + letter] = time_enc[:,i]

# Create a dataframe out of the prepared data
# Sort it according to move and day#
t_df = pd.DataFrame(data=t_dict).sort_values(by=['move', 'day#'], ignore_index=True)
# Number of samples that have a known previous / previous-previous move duration
print(len(t_df.loc[t_df["previous_move_duration"]>0]))
print(len(t_df.loc[t_df["previous_previous_move_duration"]>0]))

828
497


In [None]:
# Print every bin boundary except the first one as HH:MM
for tb in timebreaks[1:]:
  hours = np.floor(tb/60/60)
  minutes = np.floor(tb/60 - hours*60)
  print(f'{round(hours):02}:{round(minutes):02}')


10:00
12:00
13:30
16:00
17:30
19:30
21:30
23:15
24:00


In [None]:
# Travel duration of every sample together with the path it belongs to
mean_df = pd.DataFrame({
    "Path" : t_dict["move"],
    "seconds" : t_dict["move_duration"]
})
# Per-path summary statistics. A list (rather than a set) of aggregations keeps
# the order of the resulting columns deterministic.
grp = mean_df.groupby("Path").agg({"seconds": ["mean", "median", "std"]})
# Flatten the (column, statistic) pairs into names such as "seconds_mean"
grp.columns =  ["_".join(col) for col in grp.columns]
grp = grp.reset_index()

from scipy.stats import t
def get_CL(mean, std, limit, n=None):
  """Return one limit of the 95% t confidence interval of a mean.

  mean:  sample mean(s)
  std:   sample standard deviation(s) belonging to `mean`
  limit: "lower" for the lower limit; any other value gives the upper limit
  n:     number of observations behind each mean. When omitted, len(std) is
         used, which is what this function did before the parameter existed.
  """
  if n is None:
    n = len(std)
  dof = n - 1 # degrees of freedom

  confidence_level = 0.95
  alpha = 1 - confidence_level

  t_critical = t.ppf(1 - alpha / 2, dof)
  # The margin is built from the standard error of the mean: std / sqrt(n)
  margin_of_error = t_critical * (std / np.sqrt(n))
  if limit == "lower":
    return mean - margin_of_error
  else:
    return mean + margin_of_error

# Number of samples per path, aligned with the rows of grp
path_counts = grp["Path"].map(mean_df.groupby("Path")["seconds"].count())

grp["lowerCI"] = get_CL(grp["seconds_mean"],grp["seconds_std"], "lower", n=path_counts)
grp["upperCI"] = get_CL(grp["seconds_mean"],grp["seconds_std"], "upper", n=path_counts)

# Horizontal tick marker at the mean travel duration of every path
mean_markers = go.Scatter(
    x=grp["Path"],
    y=grp["seconds_mean"],
    mode="markers",
    showlegend=False,
    marker={"symbol": "line-ew-open",
            "color": "grey",
            "line": {"width": 5},
            "size": 40},
)

fig = go.Figure()
fig.add_trace(mean_markers)

# Disabled confidence-interval overlay, kept for reference:
#   for bound in ("lowerCI", "upperCI"):
#     fig.add_trace(go.Scatter(x=grp["Path"], y=grp[bound],
#                              mode="markers", showlegend=False,
#                              marker=dict(symbol="line-ew-open", color="grey",
#                                          line=dict(width=2), size=20)))
#   for i, row in grp.iterrows():
#     fig.add_shape(dict(type="line", x0=row["Path"], x1=row["Path"],
#                        y0=row["lowerCI"], y1=row["upperCI"],
#                        line=dict(color="grey", width=4)))

fig.update_layout(
    title="Mean Travel Durations by Path", title_x=0.5,
    yaxis_title="Duration (seconds)",
    xaxis_title="Path (from-to bus station)",
    font_family="Times New Roman",
    font_size=30)
fig.show()

# Numeric values behind the figure, plus the medians for comparison
for stat in ("seconds_mean", "seconds_median"):
  print(grp[stat])

0    401.593137
1    106.500000
2    465.870000
3     99.549451
4     90.700855
5    507.213793
6    129.245283
7    168.476190
Name: seconds_mean, dtype: float64
0    398.5
1    100.0
2    453.0
3     92.0
4     65.0
5    463.0
6    116.0
7    144.0
Name: seconds_median, dtype: float64


In [None]:
#t_df.iloc[:,9:33]
# Display the prepared sample table as the cell output
t_df

Unnamed: 0,move#,move,day#,day,time#,time,timeA,idle_duration,move_duration,previous_move_duration,...,Saturday,time_A,time_B,time_C,time_D,time_E,time_F,time_G,time_H,time_I
0,0,1 -> 2,0,Sunday,0,29932.5,A,750,925,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1 -> 2,0,Sunday,1,37530.0,B,0,1538,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,1 -> 2,0,Sunday,2,44542.0,C,299,502,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,1 -> 2,0,Sunday,3,51714.0,D,600,504,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1 -> 2,0,Sunday,4,58443.0,E,1320,340,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,7,8 -> 9,6,Saturday,3,53626.0,D,21,252,300,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1205,7,8 -> 9,6,Saturday,4,60886.0,E,0,252,96,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1206,7,8 -> 9,6,Saturday,5,68312.0,F,0,140,396,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1207,7,8 -> 9,6,Saturday,6,74559.5,G,17,119,101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
# The two per-move scatter plots, built once for the optional side-by-side view
figs = [
    px.scatter(t_df, x="move", y="idle_duration",
               color="move", title="Idle Durations"),
    px.scatter(t_df, x="move", y="move_duration",
               color="move", title="Move Durations")
]

fig = make_subplots(rows=1, cols=len(figs))

# Copy every trace of each figure into its own subplot column
for i, figure in enumerate(figs):
    for trace in range(len(figure["data"])):
        fig.add_trace(figure["data"][trace], row=1, col=i+1)

#iplot(fig)

# Uncomment above to view "subplots"

# Each plot is also written to a standalone HTML file and shown inline
fig = px.scatter(t_df, x="move", y="idle_duration",
           color="move", title="Idle Durations")
fig.write_html('idle_durations.html', auto_open=True)
fig.show()

fig = px.scatter(t_df, x="move", y="move_duration",
           color="move", title="Move Durations")
fig.write_html('move_durations.html', auto_open=True)
fig.show()

In [None]:
#fig = px.scatter_3d(t_df, x='time', y='day', z='move_duration',
#              color='move', size='move_duration', size_max=18,
#              #symbol='move',
#              opacity=0.7)

# #tight layout
#fig.update_layout(margin={'l':0, 'r':0, 'b':0, 't':0})
#fig.show()

In [None]:
# Travel duration against time of day: one facet per day for day# 0-3, then for
# day# 4-6, and finally all days together in a single panel
scatter_kwargs = dict(x='time', y='move_duration', color='move')
first_days = t_df.loc[t_df['day#'] < 4]
last_days = t_df.loc[t_df['day#'] >= 4]
px.scatter(first_days, facet_col='day', **scatter_kwargs).show()
px.scatter(last_days, facet_col='day', **scatter_kwargs).show()
px.scatter(t_df, **scatter_kwargs).show()

In [None]:
# Every bin boundary as HH:MM followed by its value in seconds
for tb in timebreaks:
  hours = np.floor(tb/60/60)
  minutes = np.floor(tb/60 - hours*60)
  print(f'{round(hours):02}:{round(minutes):02} {tb}')

# The boundaries in hours next to their value in seconds
timebreaks2 = [6, 8, 10, 12, 13.5, 16, 17.5, 19.5, 21.5, 23.25, 24]
for tb in timebreaks2:
  print(str(tb), tb*60*60)

# Latest time of day present in the samples (shown as the cell output)
max(t_df["time"])

08:00 28800
10:00 36000
12:00 43200
13:30 48600.0
16:00 57600
17:30 63000.0
19:30 70200.0
21:30 77400.0
23:15 83700.0
24:00 86400
6 21600
8 28800
10 36000
12 43200
13.5 48600.0
16 57600
17.5 63000.0
19.5 70200.0
21.5 77400.0
23.25 83700.0
24 86400


86090.0

In [None]:
# Display the prepared sample table once more before modelling
t_df

Unnamed: 0,move#,move,day#,day,time#,time,timeA,idle_duration,move_duration,previous_move_duration,...,Saturday,time_A,time_B,time_C,time_D,time_E,time_F,time_G,time_H,time_I
0,0,1 -> 2,0,Sunday,0,29932.5,A,750,925,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1 -> 2,0,Sunday,1,37530.0,B,0,1538,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,1 -> 2,0,Sunday,2,44542.0,C,299,502,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0,1 -> 2,0,Sunday,3,51714.0,D,600,504,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,1 -> 2,0,Sunday,4,58443.0,E,1320,340,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,7,8 -> 9,6,Saturday,3,53626.0,D,21,252,300,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1205,7,8 -> 9,6,Saturday,4,60886.0,E,0,252,96,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1206,7,8 -> 9,6,Saturday,5,68312.0,F,0,140,396,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1207,7,8 -> 9,6,Saturday,6,74559.5,G,17,119,101,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def show_pred_vs_actual(regressor, X, y, regressor_name):
  """Plot actual against predicted move durations and print fit metrics.

  regressor:      fitted model exposing predict(X)
  X:              2-D feature array; samples are ordered by its first column
                  before plotting
  y:              actual move durations, one per row of X
  regressor_name: text used as the figure title
  """
  y_pred = regressor.predict(X)
  n_features = X.shape[1]

  # Put features, actual values and predictions side by side so that all of
  # them can be reordered by the first feature column in one step
  dataset = np.concatenate((X,np.reshape(y, (len(y), -1))), axis=1)
  dataset = np.concatenate((dataset,np.reshape(y_pred, (len(y_pred), -1))), axis=1)
  dataset = dataset[dataset[:, 0].argsort()]

  y = dataset[:, n_features]
  y_pred = dataset[:, n_features+1]
  sample_index = list(range(len(y)))

  fig = go.Figure()
  # Actual values as blue dots
  fig.add_trace(go.Scatter(x=sample_index,
                          y=y,
                          mode="markers",
                          showlegend=False,
                          marker=dict(symbol="circle",
                                      color="blue",
                                      line=dict(width=0),
                                      size=9)))
  # Predictions as red crosses at the same x positions
  fig.add_trace(go.Scatter(x=sample_index,
                          y=y_pred,
                          mode="markers",
                          showlegend=False,
                          marker=dict(symbol="x-thin-open",
                                      color="red",
                                      line=dict(width=1.5),
                                      size=9)))

  fig.update_layout(
      title=regressor_name, title_x=0.5,
      xaxis_title="Sorted From-To Bus Stop Test Samples",
      yaxis_title="Duration (seconds)")
  fig.update_layout(
    font_family="Times New Roman",
    font_size=30)
  fig.show()

  corr = np.corrcoef(y, y_pred)[0,1]
  print("corrcoef",corr)
  # NOTE: the value printed as "R-squared" is the squared Pearson correlation
  # between actual and predicted values.
  print("R-squared",corr**2)
  print("RMSE",np.sqrt(mean_squared_error(y, y_pred)))
#show_pred_vs_actual(regressor, X_test, y_test, "Decision Tree Regressor")

In [None]:
def show_pred_vs_actual_ANN(regressor, X, y, regressor_name):
  """Plot actual against predicted move durations of a network and print fit metrics.

  regressor:      fitted model whose predict(X) returns a 2-D array; its first
                  column is taken as the prediction
  X:              feature array handed to predict()
  y:              array of actual move durations
  regressor_name: text used as the figure title
  """
  y_pred = regressor.predict(X)
  n_samples = len(y)

  # Long-format table: the actual values first, the predictions after them
  results_df = pd.DataFrame(data={
      "index": 2*[*range(n_samples)],
      "move_duration": y.tolist() + y_pred[:,0].tolist(),
      "origin": n_samples*["Actual data"] + n_samples*["Prediction"],
  })

  def series_trace(origin, symbol, color, outline_width):
    # One connected marker series for the rows of the given origin
    rows = results_df.loc[results_df['origin'] == origin]
    return go.Scatter(x=rows["index"],
                      y=rows["move_duration"],
                      mode="markers+lines",
                      showlegend=False,
                      line=dict(width=1),
                      marker=dict(symbol=symbol,
                                  color=color,
                                  line=dict(width=outline_width),
                                  size=9))

  fig = go.Figure()
  fig.add_trace(series_trace("Actual data", "circle", "blue", 0))
  fig.add_trace(series_trace("Prediction", "x-thin-open", "red", 1))

  fig.update_layout(
      title=regressor_name, title_x=0.5,
      xaxis_title="Random Test Samples",
      yaxis_title="Duration (seconds)",
      font_family="Times New Roman",
      font_size=30)
#  fig.update_yaxes(range=[0, 800])
  fig.show()

  corr = np.corrcoef(y, y_pred[:,0])[0,1]
  print("corrcoef",corr)
  print("R-squared",corr**2)
  print("RMSE",np.sqrt(mean_squared_error(y, y_pred)))

In [None]:
# Feature/target arrays for the models, selected from t_df by column position.
# Positions follow the order in which t_dict was filled: 0 move#, 2 day#,
# 4 time#, 8 move_duration (the target), 9 previous_move_duration,
# 10 previous_previous_move_duration, then the one-hot indicator columns.
def _features_and_target(frame, feature_cols):
  """Return (feature matrix, flat target vector) taken from `frame` by position."""
  features = np.array(frame.iloc[:, feature_cols])
  target = np.array(frame.iloc[:, 8]).ravel()
  return features, target

# Samples for which the respective earlier move duration is known (> 0)
with_prev = t_df.loc[t_df['previous_move_duration'] > 0]
with_prev_prev = t_df.loc[t_df['previous_previous_move_duration'] > 0]

X, y = _features_and_target(t_df, [0, 2, 4])
X2, y2 = _features_and_target(with_prev_prev, slice(9, 35))
X3, y3 = _features_and_target(with_prev_prev, [0, 2, 4, 9, 10])
X4, y4 = _features_and_target(with_prev, np.r_[9,11:35])
X5, y5 = _features_and_target(with_prev, [0, 2, 4, 9])
X6, y6 = _features_and_target(with_prev_prev, slice(9, 19))
X7, y7 = _features_and_target(with_prev_prev, [0, 9, 10])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
# TensorFlow has to be imported before the seed call below can use `tf`
import tensorflow as tf
from tensorflow import keras
# Compare the major version numerically; comparing version strings would
# order "10.0" before "2.0"
assert int(tf.__version__.split(".")[0]) >= 2

tf.keras.utils.set_random_seed(0)

#X_train_full, X_test, y_train_full, y_test = train_test_split(X3, y3, random_state=1)
scaler = StandardScaler()

# 30% of the samples are held out for testing; the remainder is halved into a
# validation part (first half) and a training part (second half)
X_train_full, X_test, y_train_full, y_test = train_test_split(X7, y7, test_size=0.3)
half = len(X_train_full)//2
X_valid, X_train = X_train_full[:half], X_train_full[half:]
y_valid, y_train = y_train_full[:half], y_train_full[half:]
# The scaler is fitted on the training part only and then reused for the others
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)

# Fully connected network: 3 inputs -> 256 -> 64 -> 1 output
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[3]))
model.add(keras.layers.Dense(256, activation="tanh"))
model.add(keras.layers.Dense(64, activation="tanh"))
model.add(keras.layers.Dense(1, activation="relu"))

model.summary()

model.compile(loss="mean_squared_error",
              optimizer="sgd")
history = model.fit(X_train, y_train, epochs=248,
                    validation_data=(X_valid, y_valid))
show_pred_vs_actual_ANN(model, X_test, y_test, "ANN Predictive Model")

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_18 (Flatten)        (None, 3)                 0         
                                                                 
 dense_54 (Dense)            (None, 256)               1024      
                                                                 
 dense_55 (Dense)            (None, 64)                16448     
                                                                 
 dense_56 (Dense)            (None, 1)                 65        
                                                                 
Total params: 17537 (68.50 KB)
Trainable params: 17537 (68.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/248
Epoch 2/248
Epoch 3/248
Epoch 4/248
Epoch 5/248
Epoch 6/248
Epoch 7/248
Epoch 8/248
Epoch 9/248
Epoch 10/248
Epoch 11/248
Epoch 12/248
Ep

corrcoef 0.7719388978464607
R-squared 0.5958896620084085
RMSE 114.74315261822923


In [None]:
# Inspect the training loss: recorded metrics, lowest loss and the epoch(s) it occurred in
print(history.history.keys())
arr = np.array(history.history["loss"])
lowest_loss = min(arr)
print(lowest_loss)
print(np.where(arr == lowest_loss)[0])
px.line(x=list(range(len(arr))), y=arr).show()

dict_keys(['loss', 'val_loss'])
6398.72412109375
[247]


In [None]:
from sklearn.tree import _tree

def tree_to_code(tree, feature_names):
    """Print a fitted decision tree as the source of an equivalent Python function.

    tree:          fitted estimator exposing a `tree_` attribute
    feature_names: names used as the parameters of the printed function,
                   indexed by feature number
    """
    tree_ = tree.tree_
    # Name of the feature tested at every node; nodes without a feature
    # (TREE_UNDEFINED) get a placeholder
    feature_name = []
    for feature_idx in tree_.feature:
        if feature_idx == _tree.TREE_UNDEFINED:
            feature_name.append("undefined!")
        else:
            feature_name.append(feature_names[feature_idx])
    print(f"def tree({', '.join(feature_names)}):")

    def recurse(node, depth):
        indent = "  " * depth
        # A node without a feature is a leaf: emit its stored value
        if tree_.feature[node] == _tree.TREE_UNDEFINED:
            print(f"{indent}return {tree_.value[node]}")
            return
        name = feature_name[node]
        threshold = tree_.threshold[node]
        print(f"{indent}if {name} <= {threshold}:")
        recurse(tree_.children_left[node], depth + 1)
        print(f"{indent}else:  # if {name} > {threshold}")
        recurse(tree_.children_right[node], depth + 1)

    recurse(0, 1)


def tree_to_code_javascript(tree, feature_names):
    """Print a fitted decision tree as the source of an equivalent JavaScript function.

    tree:          fitted estimator exposing a `tree_` attribute
    feature_names: names used as the parameters of the printed function,
                   indexed by feature number
    """
    tree_ = tree.tree_
    # Name of the feature tested at every node; nodes without a feature
    # (TREE_UNDEFINED) get a placeholder
    feature_name = []
    for feature_idx in tree_.feature:
        if feature_idx == _tree.TREE_UNDEFINED:
            feature_name.append("undefined!")
        else:
            feature_name.append(feature_names[feature_idx])
    print(f"function tree({', '.join(feature_names)}) {{")

    def recurse(node, depth):
        indent = "  " * depth
        # Every node closes the block it was opened in, one level further out
        mini_indent = "  " * (depth-1)
        if tree_.feature[node] == _tree.TREE_UNDEFINED:
            # Leaf: return the scalar stored at the node, then close the enclosing block
            print(f"{indent}return {tree_.value[node][0][0]} \n{mini_indent}}}")
            return
        name = feature_name[node]
        threshold = tree_.threshold[node]
        print(f"{indent}if ({name} <= {threshold}) {{")
        recurse(tree_.children_left[node], depth + 1)
        print(f"{indent}else {{")
        recurse(tree_.children_right[node], depth + 1)
        print(f"{mini_indent}}}")

    recurse(0, 1)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Hold out 20% of the samples, fit a decision tree on the rest and evaluate it
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12)
regressor = DecisionTreeRegressor(random_state=0)
regressor = regressor.fit(X_train, y_train)
show_pred_vs_actual(regressor, X_test, y_test, "Decision Tree Regressor")
#tree_to_code_javascript(regressor,["move_no", "day_no", "time_no"])

corrcoef 0.9262777920858472
R-squared 0.8579905481114319
RMSE 68.88184056980958


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Hold out 30% of the samples, fit a 300-tree random forest on the rest and evaluate it
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12)
regressor = RandomForestRegressor(n_estimators=300, random_state=0)
regressor = regressor.fit(X_train, y_train)
show_pred_vs_actual(regressor, X_test, y_test, "Random Forest Regressor")

corrcoef 0.9206798954811948
R-squared 0.8476514699432638
RMSE 71.02111519177701


In [None]:
from sklearn.neighbors import KNeighborsRegressor
# Hold out 30% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12)
# Per-feature weights handed to the Minkowski metric through metric_params
feature_weights = np.array([7,1,1])
regressor = KNeighborsRegressor(
    n_neighbors=6,
    metric='minkowski',
    p=2,
    metric_params={'w': feature_weights})
regressor = regressor.fit(X_train, y_train)
show_pred_vs_actual(regressor, X_test, y_test, "K-Nearest Neighbors Regressor")

corrcoef 0.9234130475111654
R-squared 0.8526916563138578
RMSE 69.2468176003839


In [None]:
# Move duration against time of day for all samples, colored by move
px.scatter(t_df, x = 'time', y = 'move_duration', color = 'move').show()
#px.scatter(t_df, x = 'time', y = 'y_pred', color = 'move').show()

In [None]:
# Display the columns at positions 8 to 31 of the sample table
t_df.iloc[:,8:32]

Unnamed: 0,move_duration,previous_move_duration,previous_previous_move_duration,1 -> 2,2 -> 3,3 -> 4,4 -> 5,5 -> 6,6 -> 7,7 -> 8,...,Wednesday,Thusday,Friday,Saturday,time_A,time_B,time_C,time_D,time_E,time_F
0,925,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1538,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,502,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,504,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,340,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1204,252,300,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1205,252,96,942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1206,140,396,470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1207,119,101,715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
