In [12]:
import numpy as np
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit, RandomizedSearchCV, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

## Data Loading
The process begins with loading data from a CSV file to construct a more comprehensive dataset. By amalgamating these datasets, we obtain a more representative understanding of the interplay between position, angle, and force. As a final preprocessing step, the data is shuffled to ensure that records pertaining to various stages of the process are evenly distributed throughout the dataset.

### Database Integration
In a production environment, integrating data directly from a database, such as cosmos, would offer a more realistic approach. However, for the purposes of this proof of concept, loading data from CSV files suffices. For implementing connectivity with a Cosmos database, refer to cosmos_db.py for implementation details.

In [14]:
test1 = pd.read_csv('../Data/Test1.csv', index_col=0)
test2 = pd.read_csv('../Data/Test2.csv', index_col=0)
test4 = pd.read_csv('../Data/Test4.csv', index_col=0)
data = pd.concat([test1, test2, test4])
data = data.sample(frac = 1) #shuffle 
data.head()

Unnamed: 0_level_0,a_enc_1,b_enc_1,c_enc_1,x_enc_1,y_enc_1,z_enc_1,a_enc_2,b_enc_2,c_enc_2,x_enc_2,y_enc_2,z_enc_2,fx_1,fy_1,fz_1,fx_2,fy_2,fz_2
t,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1637007054,-89.999949,-0.000313,0.000571,403.691828,535.703207,-15.265022,89.996715,-8.76e-05,-179.997043,402.733536,526.879111,-9.249013,195.851627,700.862789,2272.493579,-20.89767,-820.901978,-691.302715
1637364591,-89.998545,-0.001711,0.001242,732.241567,963.003913,-9.724641,89.994824,-0.0003658044,-179.999032,737.930625,954.618004,-5.093665,-16.649961,126.550927,1362.583693,308.920205,-358.628951,-1145.754042
1637019452,-90.002485,0.002356,-0.000844,382.504611,593.069055,-104.341551,89.992617,-0.007542273,-179.998099,381.107773,587.128276,-97.583986,447.04565,1605.510239,2811.482857,-467.63761,-783.784249,-405.843465
1636583337,-89.999645,-0.0018,0.004494,376.16494,27.351262,-25.497072,90.003448,0.003140992,-179.988438,374.986422,20.438438,-19.432294,352.437806,1426.181479,2602.769042,-131.434927,-1056.908587,-590.154674
1637379781,-89.994951,-0.0005,0.00405,578.725266,1232.896786,-99.497846,89.994,6.63e-07,179.990949,578.620074,1236.18391,-96.252571,158.203885,-756.167025,2305.732647,72.419643,578.286766,-771.433535


## Data Size
Although 60K+ records is large it is possible that in order to adequately represent the soulution space we may need to create more records. More on this later. 

In [13]:
data.shape

(69239, 18)

## Train and Test Split
Set aside 20% of our data for testing and use the remining 80% for training our models. 

### Force 


In [15]:
# train test split 
train = data.sample(frac = 0.8)
test = data.drop(train.index)

X_train = train.drop(['fx_1', 'fy_1', 'fz_1', 'fx_2', 'fy_2', 'fz_2'], axis=1)
y_train = train[['fx_1', 'fy_1', 'fz_1', 'fx_2', 'fy_2', 'fz_2']]
print(f"Training data shape X {X_train.shape} and y {y_train.shape}")

X_test = test.drop(['fx_1', 'fy_1', 'fz_1', 'fx_2', 'fy_2', 'fz_2'], axis=1)
y_test = test[['fx_1', 'fy_1', 'fz_1', 'fx_2', 'fy_2', 'fz_2']]
print(f"Testing data shape X {X_test.shape} and y {y_test.shape}")


Training data shape X (55391, 12) and y (55391, 6)
Testing data shape X (13848, 12) and y (13848, 6)


## Model Selection
To effectively evaluate various models suitable for our dataset, we employ the cross_val_score method. This method provides a rapid means of assessing model performance using only training data. While not infallible, it serves as a valuable guide in identifying the most promising model.

### Cross-Validation
Given that training occurs solely on training data, there exists a risk of overfitting to this dataset. Cross-validation emerges as a crucial technique to address this concern. By partitioning the data into subsets and systematically training and evaluating the model on different combinations of these subsets, cross-validation offers insights into the model's generalization performance.

### Multivariate Regression Methods
Multivariate Multiple Regression stands as a robust technique for modeling multiple responses or dependent variables using a single set of predictor variables. In our context, the dependent variables encompass various forces, specifically denoted as each robot's 'fx', 'fy', and 'fz'. By leveraging multivariate regression, we aim to discern the relationships between these forces and the predictors, thereby gaining deeper insights into the underlying dynamics.

#### LinearRegression

#### KNeighborsRegressor

#### RandomForestRegressor

In [17]:
cv = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)
model = LinearRegression()
cross_val_score(model, X_train, y_train, cv=cv)

array([0.59474899, 0.59437967, 0.59211034, 0.58922309, 0.59797591])

In [18]:
cv = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)
model = KNeighborsRegressor()
cross_val_score(model, X_train, y_train, cv=cv)

array([0.94133056, 0.93990339, 0.93973551, 0.93845501, 0.94144368])

In [16]:
cv = ShuffleSplit(n_splits=5, test_size=.2, random_state=0)
model = RandomForestRegressor(max_depth=10, random_state=0)
cross_val_score(model, X_train, y_train, cv=cv)

array([0.91763274, 0.91977705, 0.91790199, 0.9168839 , 0.91788091])

## Hyper parameter search 

In [8]:
random_grid = {'bootstrap': [True, False],
               'max_depth': [10, 20, 30],
               'min_samples_leaf': [1, 2, 4],
               'min_samples_split': [2, 5, 10],
               'n_estimators': [130, 180, 230]}

rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=0, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=230; total time= 4.8min
[CV] END bootstrap=True, max_depth=30, min_samples_leaf=4, min_samples_split=2, n_estimators=230; total time= 3.2min
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=1, min_samples_split=5, n_estimators=230; total time= 3.3min
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=130; total time= 2.0min
[CV] END bootstrap=False, max_depth=10, min_samples_leaf=2, min_samples_split=5, n_estimators=130; total time= 2.3min
[CV] END bootstrap=False, max_depth=30, min_samples_leaf=2, min_samples_split=5, n_estimators=130; total time= 3.5min
[CV] END bootstrap=True, max_depth=20, min_samples_leaf=4, min_samples_split=10, n_estimators=130; total time= 1.7min
[CV] END bootstrap=False, max_depth=20, min_samples_leaf=4, min_samples_split=5, n_estimators=180; total time= 3

In [7]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

## Pickel Model 

Too be used in API code 

In [11]:
filename = 'RandomForestRegressorBestEst.pkl'
rf_random = pickle.load(open(filename, 'rb'))

best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

AttributeError: 'RandomForestRegressor' object has no attribute 'best_estimator_'

## Model Evaluation 

In [23]:
predictions = best_random.predict(X_test)
# Mean squared error 
MSE = mean_squared_error(y_test, predictions)

# Root mean square error 
rms = np.sqrt(mean_squared_error(y_test, predictions))

# R squared
r2 = r2_score(y_test, predictions) 
r2

0.994163048495325