In [250]:
import pandas as pd
import numpy as np
from pprint import pprint

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split   
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import matplotlib.pyplot as plt

In [161]:
jacket = pd.read_excel('data/sizing.xls')
pants = pd.read_excel('data/sizing.xls', sheet_name=1)

In [162]:
jacket.head()

Unnamed: 0,OrderNo,CLCustomerName,BehindClothLengthBody,FrontClothLengthBody,ShoulderWidthBody,BustBody,AbdomenBody,LeftSleeveLengthBody,RightSleeveLengthBody,WristBody,BehindClothLengthSetNumber,FrontClothLengthSetNumber,ShoulderWidthSetNumber,BustSetNumber,AbdomenSetNumber,HemSetNumber,LeftSleeveLengthSetNumber,RightSleeveLengthSetNumber,Wrist finish
0,XQ6370212,Aaron H.,72.0,76.0,49.0,109.0,90.0,64.0,64.0,17.0,72.0,76.0,49.0,117.0,99.0,115.0,64.0,64.0,29.0
1,XQ6370288,Michael Pei,75.0,77.0,46.0,106.0,90.0,63.0,63.0,17.0,75.0,77.0,46.0,114.0,96.0,119.0,63.0,63.0,28.5
2,XQ6370398-T,aaron garcia,68.0,73.0,43.0,106.0,91.0,59.0,59.0,16.5,68.0,73.0,43.0,113.0,96.0,108.0,59.0,59.0,29.0
3,XQ6370144-T,Max Singletary,78.0,83.0,49.0,103.0,94.0,66.0,66.0,18.0,79.0,83.0,49.0,111.0,100.0,120.0,66.0,66.0,30.0
4,XQ6370648,Wesley Tsai,68.0,72.0,47.0,113.0,94.0,57.0,57.0,17.0,68.0,72.0,47.0,121.0,104.0,114.0,57.0,57.0,29.5


In [163]:
jacket.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   OrderNo                     554 non-null    object 
 1   CLCustomerName              552 non-null    object 
 2   BehindClothLengthBody       554 non-null    float64
 3   FrontClothLengthBody        554 non-null    float64
 4   ShoulderWidthBody           554 non-null    float64
 5   BustBody                    554 non-null    float64
 6   AbdomenBody                 554 non-null    float64
 7   LeftSleeveLengthBody        554 non-null    float64
 8   RightSleeveLengthBody       554 non-null    float64
 9   WristBody                   554 non-null    float64
 10  BehindClothLengthSetNumber  554 non-null    float64
 11  FrontClothLengthSetNumber   554 non-null    float64
 12  ShoulderWidthSetNumber      554 non-null    float64
 13  BustSetNumber               554 non

In [164]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

In [165]:
jacket_features = jacket[['BehindClothLengthBody', 'FrontClothLengthBody', 'ShoulderWidthBody', 'BustBody', 'AbdomenBody', \
                          'LeftSleeveLengthBody', 'RightSleeveLengthBody', 'WristBody']]

In [166]:
jacket_target = jacket[['BehindClothLengthSetNumber', 'FrontClothLengthSetNumber', 'ShoulderWidthSetNumber', 'BustSetNumber', 'AbdomenSetNumber', \
                       'HemSetNumber', 'LeftSleeveLengthSetNumber', 'RightSleeveLengthSetNumber', 'Wrist finish']]

In [167]:
X = jacket_features.drop(columns=['BehindClothLengthBody', 'RightSleeveLengthBody'])
y = jacket_target.drop(columns=['BehindClothLengthSetNumber', 'FrontClothLengthSetNumber', 'ShoulderWidthSetNumber', \
                                'LeftSleeveLengthSetNumber', 'RightSleeveLengthSetNumber'])

In [251]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.15, random_state=1)

**RandomizedSearchCV**

In [252]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 5, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = [1,2,3,4,5,6]
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 4, 6]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 3]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': [1, 2, 3, 4, 5, 6],
 'min_samples_leaf': [1, 2, 3],
 'min_samples_split': [2, 4, 6],
 'n_estimators': [5, 115, 226, 336, 447, 557, 668, 778, 889, 1000]}


In [253]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   47.5s finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': [1, 2, 3, 4, 5, 6],
                                        'min_samples_leaf': [1, 2, 3],
                                        'min_samples_split': [2, 4, 6],
                                        'n_estimators': [5, 115, 226, 336, 447,
                                                         557, 668, 778, 889,
                                                         1000]},
                   random_state=42, verbose=2)

In [254]:
pprint(rf_random.best_params_)
print(rf_random.best_score_)

{'bootstrap': False,
 'max_depth': 30,
 'max_features': 2,
 'min_samples_leaf': 3,
 'min_samples_split': 6,
 'n_estimators': 447}
0.501179452985502


In [244]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {} degrees.'.format(np.mean(errors)))
    print('Accuracy = {}%.'.format(accuracy))
    
    return accuracy

In [256]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test);

Model Performance
Average Error: BustSetNumber       2.664854
AbdomenSetNumber    3.075505
HemSetNumber        5.250341
Wrist finish        0.753188
dtype: float64 degrees.
Accuracy = BustSetNumber       85.572585
AbdomenSetNumber    61.903699
HemSetNumber        92.992675
Wrist finish        97.199924
dtype: float64%.


In [257]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test);

Model Performance
Average Error: BustSetNumber       2.925251
AbdomenSetNumber    3.146062
HemSetNumber        5.806351
Wrist finish        0.922874
dtype: float64 degrees.
Accuracy = BustSetNumber       86.098635
AbdomenSetNumber    64.576997
HemSetNumber        89.978365
Wrist finish        96.142324
dtype: float64%.


In [258]:
print('Improvement of {}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy));

Improvement of BustSetNumber       0.614742
AbdomenSetNumber    4.318477
HemSetNumber       -3.241448
Wrist finish       -1.088067
dtype: float64%.


In [None]:
# Train and fit model                                                   
rf = RandomForestClassifier(random_state=0,n_jobs=-1)

rf.fit(X_train, y_train)
                                     
# Test Prediction
pred = rf.predict(X_test)
print(f'Accuracy score: {rf.score(X_test, y_test):.3}')

**RandomizedSearchCV on scaled data**

In [273]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [275]:
X_train_scaled.shape

(470, 6)

In [276]:
X_test_scaled.shape

(84, 6)

In [265]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train_scaled, y_train_scaled)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 162 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   45.0s finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': [1, 2, 3, 4, 5, 6],
                                        'min_samples_leaf': [1, 2, 3],
                                        'min_samples_split': [2, 4, 6],
                                        'n_estimators': [5, 115, 226, 336, 447,
                                                         557, 668, 778, 889,
                                                         1000]},
                   random_state=42, verbose=2)

In [270]:
base_model = RandomForestRegressor(n_estimators = 10, random_state = 42)
base_model.fit(X_train_scaled, y_train_scaled)
base_accuracy = evaluate(base_model, X_test_scaled, y_test_scaled);

ValueError: operands could not be broadcast together with shapes (84,4) (470,4) 

In [268]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test_scaled, y_test_scaled);

ValueError: operands could not be broadcast together with shapes (84,4) (470,4) 

In [None]:
print('Improvement of {}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy));