# Pre-post performance models
In this file we are creating models that are based on the dataset that only contain
individual parameters and the preperformance and the postperformance.

In [16]:
from helpers import print_mean_squared_error, print_coefficient_of_determination, calculate_age
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from io import StringIO
import csv

import sys
sys.path.append('../recengine/data_parser.py')
from data_parser import ttr_data_from_reader


In [37]:
# Loads raw data and transforms.

individuals = pd.read_csv("./data/ogasawara_HL/individuals.csv", sep="|")
logs = pd.read_csv("./data/ogasawara_HL/logs.csv", sep="|")
pre_logs = pd.read_csv("./data/ogasawara_HL/pre_logs.csv", sep="|")

# Calculate TTR_DATA based on pre-logs.
ttr_data = {}
for p_id, group in pre_logs.groupby('ID'):
    group = group.drop(columns=['ID'])
    
    # Saving to buffer to be able to use with data parsers directly.
    buffer = StringIO()  #creating an empty buffer
    group.to_csv(buffer, index=False)  #filling that buffer
    buffer.seek(0) #set to the start of the stream
    
    data = ttr_data_from_reader(csv.reader(buffer), '%Y-%m-%d %H:%M:%S')    
    ttr_data[str(p_id)] = data
    


post_logs = {}
for p_id, group in logs.groupby('ID'):
    post_logs[str(p_id)] = group



headers = ["load_week1", "max_week1", "load_week2", "max_week2", "load_week3", "max_week3", "load_week4", "max_week4", "Performance"]

data = pd.DataFrame(columns=headers)

# Transform data
for index, ind in individuals.iterrows():
    p_id = str(ind.get("ID"))
    
    ttr = ttr_data.get(p_id)    
    # Take last 4 weeks.
    entry = ttr[-8:]

    postperformance = post_logs.get(p_id)["Performance"].values[-1]
    entry.append(postperformance)
    
    data = data.append(pd.Series(entry, index=data.columns), ignore_index=True)
    
data.head()


Unnamed: 0,load_week1,max_week1,load_week2,max_week2,load_week3,max_week3,load_week4,max_week4,Performance
0,6480.0,96.0,6480.0,96.0,6480.0,96.0,6480.0,96.0,133.920162
1,7290.0,108.0,7290.0,108.0,7290.0,108.0,7290.0,108.0,146.629836
2,6817.5,101.0,6817.5,101.0,6817.5,101.0,6817.5,101.0,139.230566
3,7155.0,106.0,7155.0,106.0,7155.0,106.0,7155.0,106.0,144.519665
4,6345.0,94.0,6345.0,94.0,6345.0,94.0,6345.0,94.0,131.789482


In [38]:
# Split into training set and testing set.

# Shuffle the dataset.
data_shuffled = data.sample(frac=1.0, random_state=0)

# Split into input part X and output part Y.
X = data_shuffled.drop('Performance', axis=1)

Y = data_shuffled['Performance']
Y.head()

# Partition the data into training and test sets.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.2, random_state=0)


## Regular linear regression

In [39]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt


#Träna modellen
lr = LinearRegression()
lr.fit(Xtrain,Ytrain)

#Använd på testsettet
Ypred = lr.predict(Xtest)

lr_mean_squared_error = mean_squared_error(Ytest, Ypred)
lr_r2_score = r2_score(Ytest, Ypred)

print_mean_squared_error(lr_mean_squared_error)
print_coefficient_of_determination(lr_r2_score)

Mean squared error: 0.00
Coefficient of determination: 1.00


## Ridge regression

In [40]:
#Ridge regression
from sklearn.linear_model import Ridge
ridge = Ridge(alpha=1.0)
ridge.fit(Xtrain, Ytrain)
# Ridge()
ridge_Ypred = ridge.predict(Xtest)


ridge_mean_squared_error = mean_squared_error(Ytest, ridge_Ypred)
ridge_r2_score = r2_score(Ytest, ridge_Ypred)

print_mean_squared_error(ridge_mean_squared_error)
print_coefficient_of_determination(ridge_r2_score)

Mean squared error: 0.00
Coefficient of determination: 1.00


## Lasso regression

In [41]:
from sklearn import linear_model
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(Xtrain,Ytrain)
lasso_Ypred = lasso.predict(Xtest)

lasso_mean_squared_error = mean_squared_error(Ytest, lasso_Ypred)
lasso_r2_score = r2_score(Ytest, lasso_Ypred)

print_mean_squared_error(lasso_mean_squared_error)
print_coefficient_of_determination(lasso_r2_score)

Mean squared error: 0.00
Coefficient of determination: 1.00


## Summaries of the results

In [42]:
print("Normal linear regression:")
print_mean_squared_error(lr_mean_squared_error)
print_coefficient_of_determination(lr_r2_score)
print("\n")

print("Ridge regression:")
print_mean_squared_error(ridge_mean_squared_error)
print_coefficient_of_determination(ridge_r2_score)
print("\n")

print("Lasso regression:")
print_mean_squared_error(lasso_mean_squared_error)
print_coefficient_of_determination(lasso_r2_score)
print("\n")

plt.scatter(Xtest["Preperformance"], Ytest, color='black')
plt.plot(Xtest["Preperformance"], Ypred, color='blue', linewidth=1)
plt.plot(Xtest["Preperformance"], ridge_Ypred, color='green', linewidth=1)
plt.plot(Xtest["Preperformance"], lasso_Ypred, color='red', linewidth=1)
plt.xticks(())
plt.yticks(())
plt.show()

Normal linear regression:
Mean squared error: 0.00
Coefficient of determination: 1.00


Ridge regression:
Mean squared error: 0.00
Coefficient of determination: 1.00


Lasso regression:
Mean squared error: 0.00
Coefficient of determination: 1.00




KeyError: 'Preperformance'

# Save model
Saves the choosen model(s) to a file to be used in the recommendation engine.

In [43]:
import pickle

# In the current state of recommendation engine, it is important to name the model after
# the program the data comes from. (i.e the name of the csv file in simulator/training_programs)
# Should probably make this automatic in some way.

filename = 'ogasawara_HL.sav'
pickle.dump(ridge, open(filename, 'wb'))