In [37]:
import sys
import math

# !{sys.executable} -m pip install --upgrade pip

try:
    import numpy as np
except ModuleNotFoundError:
    !{sys.executable} -m pip install numpy
    import numpy as np
    

try:
    import pandas as pd
except ModuleNotFoundError:
    !{sys.executable} -m pip install pandas
    import pandas as pd

    
try:
    import matplotlib
    import matplotlib.pyplot as plt
except ModuleNotFoundError:
    !{sys.executable} -m pip install matplotlib
    import matplotlib.pyplot as plt

    
try:
    import scipy
    import scipy.io
except ModuleNotFoundError:
    !{sys.executable} -m pip install scipy
    import scipy

        
try:
    import sklearn
except ModuleNotFoundError:
    !{sys.executable} -m pip install sklearn
    import sklearn

try:
    import mat4py
except ModuleNotFoundError:
    !{sys.executable} -m pip install mat4py
    import mat4py

        
try:
    import keras
except ModuleNotFoundError:
    !{sys.executable} -m pip install keras
    import keras
    

try:
    import keras_metrics
except ModuleNotFoundError:
    !{sys.executable} -m pip install keras_metrics
    import keras_metrics

from keras.models import Sequential
from keras.layers import Dense
import keras_metrics
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

## Configure Environment

In [38]:

"""
Make plot outputs appear and be stored within the notebook
"""
%matplotlib inline

"""
fix random seed for reproducibility
"""
np.random.seed(7)

## Inspect Data

In [39]:
orginal_data = pd.read_csv("eee-all-data.csv", header=0)
orginal_data = orginal_data.sample(frac=0.25)
orginal_data.head()

Unnamed: 0,TPT,OPERATION,QTY,DOW,SHIFT,SHIFT_TYPE,OWNER,BUILD_PRIORITY,BUILD_CATEGORY,BUILD_TYPE,PLATFORM_NAME,PRODUCT_FAMILY,PACKAGE_NAME,PKGDEVREVSTEP
1104135,0.349778,7568,38,5,2,1,5,3,4,4,9,401,6,663
89872,0.191111,5824,91,5,3,0,5,3,4,4,10,304,20,2530
260866,0.320444,7236,224,5,3,0,5,4,2,1,33,306,44,63
317357,0.476444,7399,873,2,0,1,3,3,2,1,11,308,36,2189
432861,4.565778,1383,233,3,0,1,4,3,2,4,32,183,45,99


## Create DataFrame with Dummies

In [40]:
y_column = ['TPT']
# feature_columns = ['BUILD_PRIORITY', 'BUILD_CATEGORY', 'BUILD_TYPE', 'PLATFORM_NAME', 'OPERATION']
feature_columns = ['BUILD_PRIORITY', 'BUILD_CATEGORY', 'BUILD_TYPE', 'PLATFORM_NAME']

subset_features = orginal_data[feature_columns].copy()

dummified_data = orginal_data[[ 'TPT']].copy()

for i in range(len(feature_columns)):
     col = feature_columns[i]
     dumm = pd.get_dummies(subset_features[col], prefix=col)
     dummified_data = pd.concat([dummified_data, dumm], axis=1)

dummified_data.head()

Unnamed: 0,TPT,OPERATION_224,OPERATION_225,OPERATION_226,OPERATION_227,OPERATION_228,OPERATION_229,OPERATION_230,OPERATION_231,OPERATION_232,...,OPERATION_8778,OPERATION_8859,OPERATION_8940,OPERATION_9081,OPERATION_9374,OPERATION_9375,OPERATION_9376,OPERATION_9384,OPERATION_9385,OPERATION_9387
1104135,0.349778,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
89872,0.191111,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
260866,0.320444,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
317357,0.476444,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
432861,4.565778,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Parition data to train/test sets

In [41]:
from sklearn.model_selection import train_test_split

X = dummified_data.drop('TPT', axis=1)
y = dummified_data[['TPT']]
# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

## Linear Regression

In [42]:
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [43]:
score = regression_model.score(X_test, y_test)
print("score is {}".format(score))

score is -1.792170404560703e+18


In [44]:
from sklearn.metrics import mean_squared_error

y_predict = regression_model.predict(X_test)

regression_model_mse = mean_squared_error(y_predict, y_test)

print("regression_model_mse is {}".format(regression_model_mse))

regression_model_mse is 2.051222821200951e+21


In [45]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for OPERATION_224 is 36811415852.39878
The coefficient for OPERATION_225 is 36811416175.08716
The coefficient for OPERATION_226 is 36811415782.31146
The coefficient for OPERATION_227 is 36811415775.87561
The coefficient for OPERATION_228 is 36811415774.620636
The coefficient for OPERATION_229 is 36811415823.87985
The coefficient for OPERATION_230 is 36811415770.01117
The coefficient for OPERATION_231 is 36811415775.631714
The coefficient for OPERATION_232 is 36811415775.10309
The coefficient for OPERATION_234 is 36811415770.57637
The coefficient for OPERATION_235 is 36811415778.904465
The coefficient for OPERATION_236 is 36811415778.12984
The coefficient for OPERATION_237 is 36811415773.65515
The coefficient for OPERATION_239 is 36811415787.0051
The coefficient for OPERATION_240 is 36811415804.25587
The coefficient for OPERATION_241 is 36811415838.994644
The coefficient for OPERATION_242 is 36811415794.579704
The coefficient for OPERATION_244 is 36811415777.499405
The c