In [46]:
import sys
import math

# !{sys.executable} -m pip install --upgrade pip

try:
    import numpy as np
except ModuleNotFoundError:
    !{sys.executable} -m pip install numpy
    import numpy as np
    

try:
    import pandas as pd
except ModuleNotFoundError:
    !{sys.executable} -m pip install pandas
    import pandas as pd

    
try:
    import matplotlib
    import matplotlib.pyplot as plt
except ModuleNotFoundError:
    !{sys.executable} -m pip install matplotlib
    import matplotlib.pyplot as plt

    
try:
    import scipy
    import scipy.io
except ModuleNotFoundError:
    !{sys.executable} -m pip install scipy
    import scipy

        
try:
    import sklearn
except ModuleNotFoundError:
    !{sys.executable} -m pip install sklearn
    import sklearn

try:
    import mat4py
except ModuleNotFoundError:
    !{sys.executable} -m pip install mat4py
    import mat4py

        
try:
    import keras
except ModuleNotFoundError:
    !{sys.executable} -m pip install keras
    import keras
    

try:
    import keras_metrics
except ModuleNotFoundError:
    !{sys.executable} -m pip install keras_metrics
    import keras_metrics

from keras.models import Sequential
from keras.layers import Dense
import keras_metrics
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

## Configure Environment

In [47]:

"""
Make plot outputs appear and be stored within the notebook
"""
%matplotlib inline

"""
fix random seed for reproducibility
"""
np.random.seed(7)

## Inspect Data

In [48]:
# Read the CSV (first row is the header) and keep a random 10% of its rows.
# No random_state is passed to sample(), so the draw depends on the state of
# the global NumPy RNG at the time this cell runs.
orginal_data = pd.read_csv("all-data.csv", header=0).sample(frac=0.1)

# Preview the first rows of the sampled frame.
orginal_data.head()

Unnamed: 0,QUEUE_TIME,PROCESS_TIME,TPT,OPERATION,QTY,DOW,SHIFT,SHIFT_TYPE,OWNER,BUILD_PRIORITY,BUILD_CATEGORY,BUILD_TYPE,PLATFORM_NAME,PRODUCT_FAMILY,PACKAGE_NAME,PKGDEVREVSTEP,PRODUCT
277583,1.259111,2.406667,3.665778,1377,197,3,1,0,5,2,4,4,12,332,20,2866,3413
389595,0.000444,0.000444,0.000889,7563,754,7,3,0,5,3,4,4,13,225,36,1976,2653
1137333,5.151111,15.660444,20.811556,6435,123,3,1,0,7,3,4,2,31,108,48,1626,444
1028785,1.735111,0.034222,1.769333,2147,129,4,3,0,4,3,2,4,31,243,48,1163,1306
426655,1.153778,40.688,41.841778,1966,98,4,0,1,7,3,4,2,31,38,14,2438,253


## Create DataFrame with Dummies

In [49]:
# Target column and the categorical columns that get one-hot encoded.
y_column = ['TPT']
feature_columns = ['BUILD_PRIORITY', 'BUILD_CATEGORY', 'BUILD_TYPE', 'PLATFORM_NAME', 'OPERATION']

subset_features = orginal_data[feature_columns].copy()

# One indicator frame per categorical column; the prefix yields column names
# of the form "<feature>_<value>".
dummy_frames = [
    pd.get_dummies(subset_features[col], prefix=col)
    for col in feature_columns
]

# Concatenate once instead of re-copying the growing frame on every loop pass.
# Column order is unchanged: QTY, TPT, then the indicator columns in
# feature_columns order.
dummified_data = pd.concat([orginal_data[['QTY', 'TPT']]] + dummy_frames, axis=1)

dummified_data.head()

Unnamed: 0,QTY,TPT,BUILD_PRIORITY_1,BUILD_PRIORITY_2,BUILD_PRIORITY_3,BUILD_PRIORITY_4,BUILD_CATEGORY_1,BUILD_CATEGORY_2,BUILD_CATEGORY_3,BUILD_CATEGORY_4,...,OPERATION_9192,OPERATION_9194,OPERATION_9374,OPERATION_9375,OPERATION_9376,OPERATION_9384,OPERATION_9386,OPERATION_9894,OPERATION_9951,OPERATION_10158
277583,197,3.665778,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
389595,754,0.000889,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1137333,123,20.811556,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1028785,129,1.769333,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
426655,98,41.841778,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


## Partition data into train/test sets

In [50]:
from sklearn.model_selection import train_test_split

# Target: TPT, kept as a one-column DataFrame (double brackets).
y = dummified_data[['TPT']]
# Features: every remaining column of the encoded frame.
X = dummified_data.drop('TPT', axis='columns')

# Split X and y into train and test parts, holding out 25% of the rows for
# testing; random_state=1 fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## Linear Regression

In [52]:
from sklearn.linear_model import LinearRegression

# fit() returns the fitted estimator itself, so construction and training can
# be chained while still binding the same model object.
regression_model = LinearRegression().fit(X_train, y_train)

# Display the fitted estimator as the cell output.
regression_model

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [53]:
# Evaluate the fitted model on the held-out split. For scikit-learn regressors,
# score() reports the coefficient of determination (R^2).
score = regression_model.score(X=X_test, y=y_test)
print("score is {}".format(score))

score is 0.5418528507387479


In [54]:
from sklearn.metrics import mean_squared_error

# Predictions of the fitted model for the held-out rows.
y_predict = regression_model.predict(X_test)

# The documented signature is mean_squared_error(y_true, y_pred): pass the
# ground truth first. MSE is symmetric, so the value itself does not change.
regression_model_mse = mean_squared_error(y_test, y_predict)

print("regression_model_mse is {}".format(regression_model_mse))

regression_model_mse is 1711.6148799418195


In [55]:
# coef_ is indexed as [0][feature] here, i.e. its first row holds one weight
# per training column; pair each column name with its weight and print it.
for feature_name, weight in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(feature_name, weight))

The coefficient for QTY is 0.0006952626670347989
The coefficient for BUILD_PRIORITY_1 is -2.072320480880337
The coefficient for BUILD_PRIORITY_2 is -4.815305183472296
The coefficient for BUILD_PRIORITY_3 is 3.3171161313366504
The coefficient for BUILD_PRIORITY_4 is 3.5705095330587167
The coefficient for BUILD_CATEGORY_1 is 0.041709901782715164
The coefficient for BUILD_CATEGORY_2 is 3.0924426849575237
The coefficient for BUILD_CATEGORY_3 is -4.0492953662021005
The coefficient for BUILD_CATEGORY_4 is 0.9151427794896101
The coefficient for BUILD_TYPE_1 is 1.2065018205790454
The coefficient for BUILD_TYPE_2 is -0.4439103637745294
The coefficient for BUILD_TYPE_3 is -2.6828849356783984
The coefficient for BUILD_TYPE_4 is 1.9202934789006418
The coefficient for PLATFORM_NAME_1 is -4.049351736095726
The coefficient for PLATFORM_NAME_2 is -0.006061620828355707
The coefficient for PLATFORM_NAME_3 is -4.829204224919149
The coefficient for PLATFORM_NAME_4 is -6.779398297521618
The coefficient for