In [275]:
import sys
import math

# !{sys.executable} -m pip install --upgrade pip

try:
    import numpy as np
except ModuleNotFoundError:
    !{sys.executable} -m pip install numpy
    import numpy as np
    

try:
    import pandas as pd
except ModuleNotFoundError:
    !{sys.executable} -m pip install pandas
    import pandas as pd

    
try:
    import matplotlib
    import matplotlib.pyplot as plt
except ModuleNotFoundError:
    !{sys.executable} -m pip install matplotlib
    import matplotlib.pyplot as plt

    
try:
    import scipy
    import scipy.io
except ModuleNotFoundError:
    !{sys.executable} -m pip install scipy
    import scipy

        
try:
    import sklearn
except ModuleNotFoundError:
    !{sys.executable} -m pip install sklearn
    import sklearn

try:
    import mat4py
except ModuleNotFoundError:
    !{sys.executable} -m pip install mat4py
    import mat4py

        
try:
    import keras
except ModuleNotFoundError:
    !{sys.executable} -m pip install keras
    import keras
    

try:
    import keras_metrics
except ModuleNotFoundError:
    !{sys.executable} -m pip install keras_metrics
    import keras_metrics

from keras.models import Sequential
from keras.layers import Dense
import keras_metrics
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

## Configure Environment

In [276]:

"""
Make plot outputs appear and be stored within the notebook
"""
%matplotlib inline

"""
fix random seed for reproducibility
"""
np.random.seed(7)

## Inspect Data

In [277]:
orginal_data = pd.read_csv("eee-all-data.csv", header=0)

# filter to operation 1377
orginal_data = orginal_data.loc[orginal_data['OPERATION'] == 1377]

print("Original data is length: {}".format(orginal_data['TPT'].count()))

# keep only the ones that are within +3 to -3 standard deviations in the column 'TPT'
orginal_data = orginal_data[np.abs(orginal_data['TPT']-orginal_data['TPT'].mean()) <= (0.5*orginal_data['TPT'].std())]

print("Filtered data is length: {}".format(orginal_data['TPT'].count()))

orginal_data.head()

Original data is length: 23830
Filtered data is length: 21790


Unnamed: 0,TPT,OPERATION,QTY,DOW,SHIFT,SHIFT_TYPE,OWNER,BUILD_PRIORITY,BUILD_CATEGORY,BUILD_TYPE,PLATFORM_NAME,PRODUCT_FAMILY,PACKAGE_NAME,PKGDEVREVSTEP
7,11.830222,1377,103,6,2,1,3,3,2,1,10,206,11,1052
21,8.164889,1377,1223,6,2,1,7,2,4,2,31,353,7,330
75,3.956444,1377,143,6,2,1,4,3,2,4,31,205,34,2112
136,2.070667,1377,98,6,2,1,3,3,2,1,10,206,11,1052
174,0.076444,1377,80,6,2,1,4,3,2,4,31,205,34,2112


## Create DataFrame with Dummies

In [278]:
y_column = ['TPT']
# feature_columns = ['BUILD_PRIORITY', 'BUILD_CATEGORY', 'BUILD_TYPE', 'PLATFORM_NAME']
feature_columns = ['PACKAGE_NAME']

subset_features = orginal_data[feature_columns].copy()

dummified_data = orginal_data[['TPT']].copy()

for i in range(len(feature_columns)):
     col = feature_columns[i]
     dumm = pd.get_dummies(subset_features[col], prefix=col)
     dummified_data = pd.concat([dummified_data, dumm], axis=1)

dummified_data.head()

Unnamed: 0,TPT,PACKAGE_NAME_4,PACKAGE_NAME_7,PACKAGE_NAME_8,PACKAGE_NAME_10,PACKAGE_NAME_11,PACKAGE_NAME_12,PACKAGE_NAME_14,PACKAGE_NAME_17,PACKAGE_NAME_18,...,PACKAGE_NAME_40,PACKAGE_NAME_41,PACKAGE_NAME_42,PACKAGE_NAME_45,PACKAGE_NAME_46,PACKAGE_NAME_47,PACKAGE_NAME_48,PACKAGE_NAME_50,PACKAGE_NAME_51,PACKAGE_NAME_52
7,11.830222,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
21,8.164889,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
75,3.956444,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
136,2.070667,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
174,0.076444,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Parition data to train/test sets

In [279]:
from sklearn.model_selection import train_test_split

X = dummified_data.drop('TPT', axis=1)
y = dummified_data[['TPT']]
# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

## Linear Regression

In [280]:
from sklearn.linear_model import LinearRegression

regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [281]:
score = regression_model.score(X_test, y_test)
print("score is {}".format(score))

score is -4.242762279244781e+21


In [282]:
from sklearn.metrics import mean_squared_error

y_predict = regression_model.predict(X_test)

regression_model_mse = mean_squared_error(y_predict, y_test)

print("regression_model_mse is {}".format(regression_model_mse))

regression_model_mse is 1.6282005598300825e+23


In [283]:
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for PACKAGE_NAME_4 is 2334317719369.8984
The coefficient for PACKAGE_NAME_7 is 2334317719368.4473
The coefficient for PACKAGE_NAME_8 is 2334317719368.2563
The coefficient for PACKAGE_NAME_10 is 2334317719372.369
The coefficient for PACKAGE_NAME_11 is 2334317719368.2476
The coefficient for PACKAGE_NAME_12 is 2334317719368.3374
The coefficient for PACKAGE_NAME_14 is 2334317719375.5312
The coefficient for PACKAGE_NAME_17 is 2334317719379.687
The coefficient for PACKAGE_NAME_18 is 2334317719373.222
The coefficient for PACKAGE_NAME_19 is 2334317719367.5205
The coefficient for PACKAGE_NAME_20 is 2334317719370.792
The coefficient for PACKAGE_NAME_21 is 2334317719368.6973
The coefficient for PACKAGE_NAME_22 is 2334317719371.399
The coefficient for PACKAGE_NAME_23 is 2334317719365.755
The coefficient for PACKAGE_NAME_24 is 2334317719367.646
The coefficient for PACKAGE_NAME_25 is 2334317719367.6934
The coefficient for PACKAGE_NAME_26 is 2334317719374.305
The coefficient for PACKA