In [51]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, fbeta_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import numpy as np

# Random Forest

In [34]:
# Random Forest baseline: fit on 2024-07-19..2024-07-23, score on 2024-07-24.
data = pd.read_csv("data/data_s_annotated.csv")
data['time'] = pd.to_datetime(data['time'])

# Calendar day of every record, computed once and reused for both split masks.
record_dates = data['time'].dt.date

start_train, end_train = pd.Timestamp('2024-07-19').date(), pd.Timestamp('2024-07-23').date()
start_val = end_val = pd.Timestamp('2024-07-24').date()

# `between` is inclusive on both ends, i.e. start <= date <= end.
data_train = data[record_dates.between(start_train, end_train)]
data_val = data[record_dates.between(start_val, end_val)]

# The six sensor columns used as model inputs.
feature_cols = ['xl', 'yl', 'zl', 'xr', 'yr', 'zr']
X_train, y_train = data_train[feature_cols], data_train['activity']
X_val, y_val = data_val[feature_cols], data_val['activity']

pipeline = Pipeline(steps=[
    ('clf', RandomForestClassifier(n_estimators=100, random_state=1234)),
])

print("Training model...")
pipeline.fit(X_train, y_train)

print("Predicting...")
y_pred = pipeline.predict(X_val)

# NOTE(review): with average='micro' on single-label targets, precision and
# recall coincide, so this value equals plain accuracy whatever beta is.
print(fbeta_score(y_val, y_pred, beta=1/3, average='micro'))

# Swap the sensor readings and true labels for the predicted activity.
data_val = (
    data_val
    .drop(columns=feature_cols + ['activity'])
    .assign(activity=y_pred)
)

# Keep only rows between 07:00:00 and 19:00:00 (inclusive) and show the
# timestamp as a plain clock time.
window_open = pd.Timestamp('07:00:00').time()
window_close = pd.Timestamp('19:00:00').time()
in_window = data_val['time'].dt.time.between(window_open, window_close)
data_val = data_val[in_window].assign(
    time=lambda frame: frame['time'].dt.strftime('%H:%M:%S')
)
data_val.head(10)
# data_val.to_csv("data/data_val.csv", index=False)

Training model...
Predicting...
0.8047685185185186


Unnamed: 0,time,hour,activity
370800,07:00:00,7,none
370801,07:00:01,7,none
370802,07:00:02,7,none
370803,07:00:03,7,none
370804,07:00:04,7,none
370805,07:00:05,7,none
370806,07:00:06,7,none
370807,07:00:07,7,none
370808,07:00:08,7,none
370809,07:00:09,7,none


# XGBoost (evaluation on day 5)

In [48]:
# XGBoost evaluation: fit on 2024-07-20..2024-07-23, score on 2024-07-24.
data = pd.read_csv("data/data_s_annotated.csv")
data['time'] = pd.to_datetime(data['time'])

# Inclusive calendar-day boundaries for the training and validation splits.
start_train = pd.Timestamp('2024-07-20').date()
end_train = pd.Timestamp('2024-07-23').date()
data_train = data[(data['time'].dt.date >= start_train) & (data['time'].dt.date <= end_train)]

start_val = pd.Timestamp('2024-07-24').date()
end_val = pd.Timestamp('2024-07-24').date()
data_val = data[(data['time'].dt.date >= start_val) & (data['time'].dt.date <= end_val)]

# The six sensor columns used as model inputs.
feature_cols = ['xl', 'yl', 'zl', 'xr', 'yr', 'zr']

X_train = data_train[feature_cols]
y_train = data_train['activity']

X_val = data_val[feature_cols]
y_val = data_val['activity']

# XGBoost needs integer class ids. The encoder is fitted on the training
# labels only, so `transform` raises if validation holds an unseen label.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)

# `use_label_encoder` is no longer passed: the installed XGBoost reports it
# as an unused parameter and only emits a warning for it.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', XGBClassifier(n_estimators=100, random_state=1234, eval_metric='logloss'))
])

print("Training model...")
pipeline.fit(X_train, y_train)

print("Predicting...")
y_pred = pipeline.predict(X_val)

# NOTE(review): with average='micro' on single-label targets, precision and
# recall coincide, so this value equals plain accuracy whatever beta is.
print(fbeta_score(y_val, y_pred, average='micro', beta=1/3))

# Replace the sensor readings and true labels with the predicted activity,
# decoded back to the original label values.
data_val = data_val.drop(columns=feature_cols + ['activity'])
data_val['activity'] = label_encoder.inverse_transform(y_pred)

# Keep only rows between 07:00:00 and 19:00:00 (inclusive) and show the
# timestamp as a plain clock time.
data_val = data_val[(data_val['time'].dt.time >= pd.Timestamp('07:00:00').time()) & (data_val['time'].dt.time <= pd.Timestamp('19:00:00').time())]
data_val['time'] = data_val['time'].dt.strftime('%H:%M:%S')
data_val.head(10)

Training model...


Parameters: { "use_label_encoder" } are not used.



Predicting...
0.8300925925925926


Unnamed: 0,time,activity
370800,07:00:00,none
370801,07:00:01,none
370802,07:00:02,none
370803,07:00:03,none
370804,07:00:04,none
370805,07:00:05,none
370806,07:00:06,none
370807,07:00:07,none
370808,07:00:08,none
370809,07:00:09,none


# XGBoost (full training)

In [23]:
# Final model: fit on 2024-07-20..2024-07-24 and predict 2024-07-25.
# No score is computed in this cell.
#
# Relies on `data` (with `time` already parsed to datetime) having been loaded
# by an earlier cell.

# Optionally take a sample of the data for faster training:
# data = data.sample(frac=0.01, random_state=1234)

# Inclusive calendar-day boundaries for the training and prediction splits.
start_train = pd.Timestamp('2024-07-20').date()
end_train = pd.Timestamp('2024-07-24').date()
data_train = data[(data['time'].dt.date >= start_train) & (data['time'].dt.date <= end_train)]

start_val = pd.Timestamp('2024-07-25').date()
end_val = pd.Timestamp('2024-07-25').date()
data_val = data[(data['time'].dt.date >= start_val) & (data['time'].dt.date <= end_val)]

# The six sensor columns used as model inputs.
feature_cols = ['xl', 'yl', 'zl', 'xr', 'yr', 'zr']

X_train = data_train[feature_cols]
y_train = data_train['activity']

# Build the prediction features from THIS cell's `data_val`. Previously
# `X_val` was never reassigned here, so `predict` ran on whatever `X_val` an
# earlier cell left behind (a different day) and those predictions were
# attached to the 2024-07-25 rows.
X_val = data_val[feature_cols]

# XGBoost needs integer class ids; keep the encoder to decode predictions.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)

# `use_label_encoder` is no longer passed: the installed XGBoost reports it
# as an unused parameter and only emits a warning for it.
pipeline = Pipeline([
    ('clf', XGBClassifier(n_estimators=100, random_state=1234, eval_metric='logloss'))
])

print("Training model...")
pipeline.fit(X_train, y_train)

print("Predicting...")
y_pred = pipeline.predict(X_val)

# Replace the sensor readings and existing labels with the predicted
# activity, decoded back to the original label values.
data_val = data_val.drop(columns=feature_cols + ['activity'])
data_val['activity'] = label_encoder.inverse_transform(y_pred)

# Keep only rows between 07:00:00 and 19:00:00 (inclusive) and show the
# timestamp as a plain clock time.
data_val = data_val[(data_val['time'].dt.time >= pd.Timestamp('07:00:00').time()) & (data_val['time'].dt.time <= pd.Timestamp('19:00:00').time())]
data_val['time'] = data_val['time'].dt.strftime('%H:%M:%S')
data_val.head(10)
# data_val.to_csv("data/data_val.csv", index=False)

Training model...


Parameters: { "use_label_encoder" } are not used.



Predicting...


Unnamed: 0,time,activity
457200,07:00:00,none
457201,07:00:01,none
457202,07:00:02,none
457203,07:00:03,none
457204,07:00:04,none
457205,07:00:05,none
457206,07:00:06,none
457207,07:00:07,none
457208,07:00:08,none
457209,07:00:09,none


In [25]:
# Write the current `data_val` frame to result.csv, omitting the index column.
data_val.to_csv(path_or_buf="result.csv", index=False)