In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit #import to have equal weigtage samples in training dataset
from sklearn.tree import DecisionTreeRegressor # import for Decision Tree Algorithm
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR #import for support vector regressor
from sklearn.metrics import mean_squared_error  # import to calculate root mean square

In [18]:
SourceData=pd.read_excel("Supplier Past Data.xlsx") # Load the data into Pandas DataFrame

In [19]:
SourceData["PO Category"]=pd.cut(SourceData["Purchase Order Value"],
                                     bins=[0., 30000, 60000, 90000,
np.inf],                                     
labels=[1, 2, 3, 4])

In [20]:
split = StratifiedShuffleSplit(n_splits=2, test_size=0.3)

In [21]:
for train_index, test_index in split.split(SourceData, SourceData["PO Category"]):
    strat_train_set = SourceData.loc[train_index]  # stratfied train dataset 
    strat_test_set = SourceData.loc[test_index] #stratified test dataset

In [22]:
for set_ in (strat_train_set, strat_test_set): 
    set_.drop("PO Category", axis=1, inplace=True)

In [23]:
SourceData_train_independent= strat_train_set.drop(["Defect Percent"], axis=1)
SourceData_train_dependent=strat_train_set["Defect Percent"].copy()
SourceData_test_independent= strat_test_set.drop(["Defect Percent"], axis=1)
SourceData_test_dependent=strat_test_set["Defect Percent"].copy()

In [24]:
sc_X = StandardScaler()
X_train=sc_X.fit_transform(SourceData_train_independent.values)
y_train=SourceData_train_dependent
pickle.dump(sc_X, open("Scaler.sav", 'wb'))
X_test=sc_X.fit_transform(SourceData_test_independent.values)
y_test=SourceData_test_dependent

In [25]:
svm_reg = SVR(kernel="linear", C=1)
svm_reg.fit(X_train, y_train)
filename = 'SVR_TrainedModel.sav'
pickle.dump(svm_reg, open(filename, 'wb'),protocol=-1)

In [26]:
decision_predictions = svm_reg.predict(X_train)
Score = (svm_reg.score(X_train, y_train))  # It provides the R-Squared Value
print ( "The score of the Support  Vector model is", round(Score,2))
lin_mse = mean_squared_error(y_train, decision_predictions)
print("MSE  of  Vector  model is ", round(lin_mse,2))
lin_rmse = mean_squared_error(y_train, decision_predictions, squared=False)
print("RMSE of  Support  Vector  Learning model is ", round(lin_rmse,2))

The score of the Support  Vector model is 0.05
MSE  of  Vector  model is  0.22
RMSE of  Support  Vector  Learning model is  0.47


In [27]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(X_train, y_train)
filename = 'DecisionTree_TrainedModel.sav'
pickle.dump(tree_reg, open(filename, 'wb'),protocol=-1)
predictions = tree_reg.predict(X_train) 
Score = (tree_reg.score(X_train, y_train))  # It provides the R-Squared Value
print ( "The score of model Decision Tree model is ", round(Score,2))
lin_mse = mean_squared_error(y_train, predictions)
print("MSE of Decision Tree model is ", round(lin_mse,2))
lin_rmse = mean_squared_error(y_train, decision_predictions, squared=False)
print("RMSE of Decision Tree model is ", round(lin_rmse,2))

The score of model Decision Tree model is  1.0
MSE of Decision Tree model is  0.0
RMSE of Decision Tree model is  0.47


In [28]:
test_predictions = tree_reg.predict(X_test)
test_decision_predictions = svm_reg.predict(X_test)

In [29]:
import pickle
import pandas as pd 
testdata=pd.read_excel("Defect Predict.xlsx") # Load the test data
sc_X = pickle.load(open('Scaler.sav', 'rb'))  # Load the pickle
loaded_model = pickle.load(open('DecisionTree_TrainedModel.sav', 'rb')) # load the trained model
X_test=sc_X.transform(testdata.values) # scale the independent variables for test data
decision_predictions = loaded_model.predict(X_test) # Predict the value of dependent variable
print("The prediction by Decision Treemodel is " , decision_predictions )

The prediction by Decision Treemodel is  [0.26]
