In [1]:
import sys
import math

# !{sys.executable} -m pip install --upgrade pip

try:
    import numpy as np
except ModuleNotFoundError:
    !{sys.executable} -m pip install numpy
    import numpy as np
    

try:
    import pandas as pd
except ModuleNotFoundError:
    !{sys.executable} -m pip install pandas
    import pandas as pd

    
try:
    import matplotlib
    import matplotlib.pyplot as plt
except ModuleNotFoundError:
    !{sys.executable} -m pip install matplotlib
    import matplotlib.pyplot as plt

    
try:
    import scipy
    import scipy.io
except ModuleNotFoundError:
    !{sys.executable} -m pip install scipy
    import scipy

        
try:
    import sklearn
except ModuleNotFoundError:
    !{sys.executable} -m pip install sklearn
    import sklearn

try:
    import mat4py
except ModuleNotFoundError:
    !{sys.executable} -m pip install mat4py
    import mat4py

        
try:
    import keras
except ModuleNotFoundError:
    !{sys.executable} -m pip install keras
    import keras
    

try:
    import keras_metrics
except ModuleNotFoundError:
    !{sys.executable} -m pip install keras_metrics
    import keras_metrics

from keras.models import Sequential
from keras.layers import Dense
import keras_metrics
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Configure Environment

In [2]:

"""
Make plot outputs appear and be stored within the notebook
"""
%matplotlib inline

"""
fix random seed for reproducibility
"""
np.random.seed(7)

## Inspect Data

In [3]:
# Tunable selection parameters for this cell.
OPERATION_ID = 2312    # value of the OPERATION column to keep
TPT_STD_BAND = 0.5     # half-width of the accepted TPT band, in standard deviations

orginal_data = pd.read_csv("eee-all-data.csv", header=0)

# keep only the rows recorded for the selected operation
orginal_data = orginal_data.loc[orginal_data['OPERATION'] == OPERATION_ID]

print("Original data is length: {}".format(orginal_data['TPT'].count()))

# keep only the rows whose TPT lies within +/- TPT_STD_BAND standard deviations
# of the mean TPT (mean and std are computed on the operation-filtered rows)
tpt_values = orginal_data['TPT']
orginal_data = orginal_data[np.abs(tpt_values - tpt_values.mean()) <= (TPT_STD_BAND * tpt_values.std())]

print("Filtered data is length: {}".format(orginal_data['TPT'].count()))

orginal_data.head()

Original data is length: 3225
Filtered data is length: 3019


Unnamed: 0,TPT,OPERATION,QTY,DOW,SHIFT,SHIFT_TYPE,OWNER,BUILD_PRIORITY,BUILD_CATEGORY,BUILD_TYPE,PLATFORM_NAME,PRODUCT_FAMILY,PACKAGE_NAME,PKGDEVREVSTEP
418791,0.136444,2312,221,5,2,1,4,3,2,4,11,208,36,2191
418793,0.097333,2312,122,5,2,1,4,3,2,4,11,208,36,1174
421719,0.027556,2312,80,5,3,0,3,3,2,1,11,208,36,2191
424062,0.061778,2312,120,5,3,0,3,3,2,1,11,208,36,2191
424440,0.003111,2312,121,5,3,0,3,3,2,1,11,208,36,2191


## Create DataFrame with Dummies

In [4]:
# Target column, categorical predictors to one-hot encode, and numeric predictors kept as-is.
y_column = ['TPT']

nominal_features = ['BUILD_PRIORITY', 'BUILD_CATEGORY', 'BUILD_TYPE', 'PLATFORM_NAME']

continuous_features = ['QTY']

subset_features = orginal_data[nominal_features].copy()

# One indicator frame per nominal column; the column name is used as the prefix
# so the generated columns read e.g. BUILD_TYPE_1, BUILD_TYPE_2, ...
dummy_frames = [
    pd.get_dummies(subset_features[col], prefix=col)
    for col in nominal_features
]

# Assemble the modelling frame with a single concat (numeric + target columns
# first, then the indicator blocks in nominal_features order) instead of
# re-concatenating the growing frame once per feature.
dummified_data = pd.concat(
    [orginal_data[continuous_features + y_column]] + dummy_frames,
    axis=1,
)

dummified_data.head()

Unnamed: 0,QTY,TPT,BUILD_PRIORITY_1,BUILD_PRIORITY_2,BUILD_PRIORITY_3,BUILD_PRIORITY_4,BUILD_CATEGORY_1,BUILD_CATEGORY_2,BUILD_CATEGORY_3,BUILD_CATEGORY_4,...,PLATFORM_NAME_7,PLATFORM_NAME_11,PLATFORM_NAME_16,PLATFORM_NAME_24,PLATFORM_NAME_26,PLATFORM_NAME_27,PLATFORM_NAME_28,PLATFORM_NAME_32,PLATFORM_NAME_37,PLATFORM_NAME_38
418791,221,0.136444,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
418793,122,0.097333,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
421719,80,0.027556,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
424062,120,0.061778,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
424440,121,0.003111,0,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


## Partition data into train/test sets

In [5]:
from sklearn.model_selection import train_test_split

# Target: the TPT column, kept as a one-column DataFrame.
y = dummified_data[['TPT']]
# Predictors: every remaining column.
X = dummified_data.drop('TPT', axis=1)

# Split X and y into training and test partitions: 25% of the rows are held
# out for testing, and random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression

# Fit a linear model with default settings on the training partition.
# The fit call is left as the last expression so the cell still displays the
# fitted estimator.
regression_model = LinearRegression()
regression_model.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [7]:
# Evaluate the fitted model on the held-out partition; for scikit-learn
# regressors, score() reports the coefficient of determination (R^2).
score = regression_model.score(X=X_test, y=y_test)
print("score is {}".format(score))

score is 0.17201628219474996


In [8]:
from sklearn.metrics import mean_squared_error

# Predictions for the held-out rows.
y_predict = regression_model.predict(X_test)

# mean_squared_error is documented as (y_true, y_pred): pass the observed
# values first. The MSE value itself does not depend on the order, but keeping
# the documented order avoids a silent error if the metric is later swapped
# for an asymmetric one.
regression_model_mse = mean_squared_error(y_test, y_predict)

print("regression_model_mse is {}".format(regression_model_mse))

regression_model_mse is 0.07378838869929301


In [9]:
# Walk the predictor names and the fitted weights side by side; coef_[0]
# selects the first row of the coefficient array, one entry per column.
for col_name, coefficient in zip(X_train.columns, regression_model.coef_[0]):
    print("The coefficient for {} is {}".format(col_name, coefficient))

The coefficient for QTY is 8.634944541446332e-05
The coefficient for BUILD_PRIORITY_1 is 0.01428458082486344
The coefficient for BUILD_PRIORITY_2 is -0.18407451846555692
The coefficient for BUILD_PRIORITY_3 is 0.07738469852682878
The coefficient for BUILD_PRIORITY_4 is 0.09240523911386475
The coefficient for BUILD_CATEGORY_1 is -0.1976899782111577
The coefficient for BUILD_CATEGORY_2 is 0.05723880480280818
The coefficient for BUILD_CATEGORY_3 is 0.09127836095226669
The coefficient for BUILD_CATEGORY_4 is 0.049172812456083126
The coefficient for BUILD_TYPE_1 is -0.013834263675677883
The coefficient for BUILD_TYPE_2 is -0.02076628802379474
The coefficient for BUILD_TYPE_4 is 0.034600551699472536
The coefficient for PLATFORM_NAME_7 is -0.0240279573293219
The coefficient for PLATFORM_NAME_11 is -0.15082688693993307
The coefficient for PLATFORM_NAME_16 is -0.2990043054873561
The coefficient for PLATFORM_NAME_24 is 0.22175970789825566
The coefficient for PLATFORM_NAME_26 is 0.532554281752865