In [1]:
#Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# Load Dataset
# Read the seeds measurements from the CSV that sits next to the notebook.
data = pd.read_csv('./seeds.csv')
data.head()

# Overwrite whatever column labels read_csv produced with descriptive names;
# 'Type' is the column used later as the prediction target.
data.columns = [
    'Area',
    'Perimeter',
    'Compactness',
    'Kernel_Length',
    'Kernel_Width',
    'Asymmetry_Coeff',
    'Kernel_Groove',
    'Type',
]
data.head()

# Missing values per column
data.isna().sum()

# Number of rows per class
data['Type'].value_counts()

# Dataset Info (dtypes, non-null counts, memory usage)
data.info()

#Tukey Method

# import required libraries
from collections import Counter

# Outlier detection 
def detect_outliers(df,n,features):
    """Find rows that are Tukey (1.5 * IQR) outliers in more than ``n`` features.

    For every column in ``features`` the quartiles Q1 and Q3 are computed and a
    row is flagged when its value lies below ``Q1 - 1.5*IQR`` or above
    ``Q3 + 1.5*IQR``. Quartiles are computed with ``np.nanpercentile`` so that a
    missing value in a column does not turn both fences into NaN (which would
    silently disable outlier detection for that column). Missing values
    themselves are never flagged, because comparisons against NaN are False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns listed in ``features``.
    n : int
        A row is returned only when it is an outlier in strictly more than
        ``n`` features (``n=0`` means "an outlier in at least one feature").
    features : iterable of str
        Column names to inspect.

    Returns
    -------
    list
        Index labels of ``df`` (not positions) of the flagged rows, in order of
        first detection.
    """
    # Number of features in which each index label was flagged.
    outlier_counts = Counter()

    # iterate over features(columns)
    for col in features:
        # 1st and 3rd quartiles, ignoring missing values
        Q1, Q3 = np.nanpercentile(df[col], [25, 75])

        # Tukey fence distance: 1.5 times the interquartile range
        outlier_step = 1.5 * (Q3 - Q1)

        # Boolean mask of rows outside the fences for this feature
        is_outlier = (df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step)

        # Tally the flagged index labels
        outlier_counts.update(df.index[is_outlier.to_numpy()])

    # keep observations flagged in more than n features
    return [label for label, hits in outlier_counts.items() if hits > n]

# List of Outliers
# Inspect only the measurement columns; 'Type' is the class label.
feature_frame = data.drop('Type',axis=1)

# n=0: flag every row that is a Tukey outlier in at least one feature.
Outliers_to_drop = detect_outliers(feature_frame,0,list(feature_frame))
feature_frame.loc[Outliers_to_drop]

#Create New Dataset without Outliers
# detect_outliers returns index labels, so drop by label directly instead of
# re-interpreting the labels as positions through data.index[...].
good_data = data.drop(index=Outliers_to_drop).reset_index(drop = True)
good_data.info()

#Show Key Statistics
good_data.describe()

#Create Profile Report

#Importing package
import pandas_profiling as pp
from IPython.display import IFrame

# Profile Report: build the report, save it as HTML, and embed it in the notebook
FinalReport = pp.ProfileReport(good_data)
FinalReport.to_file('ReportFinalProjChannel-W22.html')
display(IFrame('ReportFinalProjChannel-W22.html', width=900, height=350))

# Map the numeric class codes to variety names.
# The label column is called 'Type' (see the column rename after loading);
# there is no 'Category' column, so indexing it raised a KeyError.
# The result is assigned back rather than using inplace=True on a column
# selection, which is not guaranteed to modify the parent frame.
good_data["Type"] = good_data["Type"].replace({1 :"Kama" , 2 :"Rosa" , 3 :"Canadian"})
good_data.head()

#Create x and y variables
x = good_data.drop('Type', axis=1).to_numpy()
Y = good_data['Type'].to_numpy()

#Create Train and Test Dataset
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,Y,test_size = 0.2,stratify=Y,random_state = 100)

#Scale the Data
# The scaler is fitted on the training split only and then applied to the test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_trainfs = sc.fit_transform(x_train)
x_testfs = sc.transform(x_test)

#Feature Selection using SelectFromModel
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(solver='lbfgs',class_weight='balanced',max_iter=1000,random_state=100)
# Fit the selector model on the TRAINING split. Fitting on the test split (as
# before) lets held-out data decide which features are kept, which leaks
# test information into every later evaluation.
clf.fit(x_trainfs,y_train)
# prefit=True: reuse the classifier fitted above instead of refitting it.
model = SelectFromModel(clf, prefit=True)
feature_idx = model.get_support()
feature_name = good_data.drop('Type',axis=1).columns[feature_idx]
print('\nKey Features:',feature_name)

# Prepare for Models for Comparison

# Create x and y variables restricted to the selected features
Y2 = good_data['Type'].to_numpy()
x2 = good_data[feature_name].to_numpy()

# Create Train and Test Datasets (New Dataset)
from sklearn.model_selection import train_test_split
x_train2, x_test2, y_train2, y_test2 = train_test_split(
    x2,
    Y2,
    test_size=0.2,
    stratify=Y2,
    random_state=100,
)

# Fix the imbalanced Classes: oversample the training split only
from imblearn.over_sampling import SMOTE
smt = SMOTE(random_state=100)
x_train_smt, y_train_smt = smt.fit_resample(x_train2, y_train2)

# Scale the Data: fit on the resampled training split, reuse for the test split
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train3 = sc.fit_transform(x_train_smt)
x_test3 = sc.transform(x_test2)

# Scale all x-variables (full dataset) to be used with Voting Ensemble
x_2 = sc.transform(x2)

# Class Balance - resampled Train Data
print('Train Data - Class Split')
num_zeros, num_ones, num_two = (
    (y_train_smt == variety).sum() for variety in ("Kama", "Rosa", "Canadian")
)
print('"Kama" -', num_zeros)
print('"Rosa" -', num_ones)
print('"Canadian" -', num_two)

#Construct some pipelines 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

#Create Pipeline
# Each pipeline standardises its input and then fits one classifier ('clf').

pipe_logreg = Pipeline([('scl', StandardScaler()),
                    ('clf', LogisticRegression(solver='lbfgs',class_weight='balanced',
                                               random_state=100))])

pipe_rdf = Pipeline([('scl', StandardScaler()),
                    ('clf', RandomForestClassifier(n_estimators=100,random_state=100))])

# Order matters: position i here pairs with position i in `modelpara`.
pipeline = [pipe_logreg, pipe_rdf]

# Set grid search params 
# Keys use the '<step>__<param>' convention to reach into the 'clf' step.

# Only 'l2' is searched: the 'lbfgs' solver configured above does not support
# an 'l1' penalty, so every 'l1' candidate failed to fit and was scored NaN
# (hidden by the global warnings filter) while doubling the search time.
param_gridlogreg = {'clf__C': [0.01, 0.1, 1, 10, 100], 
                    'clf__penalty': ['l2']}

# 'auto' is omitted: for RandomForestClassifier it was an alias of 'sqrt'
# and has been removed from recent scikit-learn releases, where it makes the
# corresponding candidates fail.
param_gridrdf = {
            'clf__n_estimators': [100,150,200],
            'clf__max_features': ['log2', 'sqrt'],
            'clf__bootstrap': [True, False]}

modelpara = [param_gridlogreg, param_gridrdf]

#Define Plot for learning curve
%matplotlib inline
from sklearn.model_selection import learning_curve

def plot_learning_curves(model):
    """Plot training and validation accuracy of ``model`` against training-set size.

    The curve is computed by ``learning_curve`` on the module-level
    ``x_train_smt`` / ``y_train_smt`` arrays with ``cv=10``, at ten training
    fractions evenly spaced from 10% to 100%. For each curve the mean across
    folds is drawn as a line and one standard deviation either side as a
    shaded band. The figure is shown immediately; nothing is returned.
    """
    sizes, scores_train, scores_val = learning_curve(
        estimator=model,
        X=x_train_smt,
        y=y_train_smt,
        train_sizes=np.linspace(0.1, 1.0, 10),
        cv=10,
        scoring='accuracy',
        random_state=100,
    )

    # (per-fold scores, line styling) for the two curves, drawn in this order.
    curves = (
        (scores_train, dict(color='blue', marker='o', markersize=5,
                            label='training accuracy')),
        (scores_val, dict(color='green', linestyle='--', marker='s', markersize=5,
                          label='validation accuracy')),
    )

    for fold_scores, line_style in curves:
        # Rows are training sizes, columns are CV folds: aggregate across folds.
        centre = np.mean(fold_scores, axis=1)
        spread = np.std(fold_scores, axis=1)

        plt.plot(sizes, centre, **line_style)
        plt.fill_between(sizes, centre + spread, centre - spread,
                         alpha=0.15, color=line_style['color'])

    plt.grid(True)
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='best')
    plt.ylim([0.5, 1.01])
    plt.show()

# Plot Learning Curve for each pipeline
print('Logistic Regression - Learning Curve')
plot_learning_curves(pipe_logreg)
print('\nRandom Forest - Learning Curve')
plot_learning_curves(pipe_rdf)

# Model Analysis
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score

# (display name, estimator) pairs to compare
models = [
    ('Logistic Regression', pipe_logreg),
    ('Random Forest', pipe_rdf),
]

# Model Evaluation: repeated 10-fold CV accuracy on the selected-feature dataset
scoring = 'accuracy'
results = []
names = []
print('Model Evaluation - Accuracy Score')
for name, model in models:
    # A fresh splitter with the same integer seed gives every model the same folds.
    rkf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
    cv_results = cross_val_score(model, x2, Y2, cv=rkf, scoring=scoring)
    names.append(name)
    results.append(cv_results)
    print(f'{name} {cv_results.mean():.2f} +/- {cv_results.std():.2f}')
print('\n')

# Boxplot View: one box of per-fold accuracies per model
fig = plt.figure(figsize=(10, 5))
fig.suptitle('Boxplot View')
ax = fig.add_subplot(111)
sns.boxplot(data=results, ax=ax)
ax.set_xticklabels(names)
ax.set_ylabel('Accuracy')
ax.set_xlabel('Model')
plt.show()

#Define Gridsearch Function

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix  

def Gridsearch_cv(model, params):
    """Tune ``model`` by grid search and print its evaluation on the test split.

    The search is fitted on the module-level ``x_train3`` / ``y_train_smt``
    arrays with repeated 10-fold cross-validation (5 repeats) and accuracy
    scoring. The best estimator then predicts ``x_test3`` and the confusion
    matrix and classification report against ``y_test2`` are printed.

    Parameters
    ----------
    model : sklearn.pipeline.Pipeline
        Pipeline to tune; it must expose a step named ``'clf'``, whose repr is
        printed as the model name.
    params : dict
        Parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    None
        Results are only printed.
    """
    #Cross-validation Function
    cv2=RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)

    #GridSearch CV
    gs_clf = GridSearchCV(model, params, cv=cv2,scoring='accuracy')
    gs_clf.fit(x_train3, y_train_smt)
    best_model = gs_clf.best_estimator_

    # Use best model and test data for final evaluation
    y_pred = best_model.predict(x_test3)

    #Identify Best Parameters to Optimize the Model
    bestpara=str(gs_clf.best_params_)

    #Output Heading
    print('\nOptimized Model')
    # Describe the estimator that was passed in. This used to read the global
    # name `pipeline`, which only worked because the driver loop rebound that
    # name to the current pipeline.
    print('\nModel Name:',str(model.named_steps['clf']))

    #Output Validation Statistics
    # NOTE(review): these are generic placeholders; presumably they line up with
    # the sorted class labels in y_test2 - confirm before reading per-class rows.
    target_names=['Outcome 0','Outcome 1','Outcome 2']
    print('\nBest Parameters:',bestpara)
    print('\n', confusion_matrix(y_test2,y_pred))  
    print('\n',classification_report(y_test2,y_pred,target_names=target_names)) 

#Run Models

# Use dedicated loop names so the `pipeline` and `modelpara` lists are not
# overwritten by their own elements (which made this cell fail when re-run).
for candidate_pipe, candidate_grid in zip(pipeline,modelpara):
    Gridsearch_cv(candidate_pipe,candidate_grid)

# Final model: random forest with fixed hyper-parameters.
# max_features='sqrt' is what 'auto' meant for RandomForestClassifier; 'auto'
# itself is rejected by recent scikit-learn releases.
final_model = RandomForestClassifier(random_state=100, bootstrap= True, max_features= 'sqrt', n_estimators= 100)
final_model.fit(x_train3, y_train_smt)

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Making predictions
y_pred_final = final_model.predict(x_test3)

# Printing classification report
print('\n', confusion_matrix(y_test2,y_pred_final))
print('\n', classification_report(y_test2,y_pred_final))

# Create a Pickle file  
# NOTE(review): only the classifier is saved. It was trained on x_train3, i.e.
# features transformed by the fitted scaler `sc`, which is not persisted here -
# confirm that whatever loads model.pkl applies the same scaling and columns.
import pickle
# The context manager closes the file even if pickling raises.
with open("model.pkl","wb") as pickle_out:
    pickle.dump(final_model, pickle_out)

# end

# # -*- coding: utf-8 -*-

# import numpy as np
# import pickle
# import pandas as pd
# from flask import Flask, request
# from flask import Flask, request, jsonify, render_template

# app=Flask(__name__)
# pickle_in = open("model.pkl","rb")
# model=pickle.load(pickle_in)

# @app.route('/')
# def home():
#     return render_template('index.html')



# @app.route('/predict',methods=['POST'])
# def predict():
#     '''
#     For rendering results on HTML GUI
#     '''
#     int_features = [x for x in request.form.values()]
#     final_features = [np.array(int_features)]
#     prediction = model.predict(final_features)

    
#     return render_template('index.html', prediction_text='Wheat Kernal is {}'.format(prediction))
    
    


# if __name__=='__main__':
#     app.run()

# #Prepare Models 
# from sklearn import model_selection
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import precision_score, recall_score
# from sklearn.linear_model import LogisticRegression

# #Model Analysis
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_val_score

# models=[]
# models.append(('Logistic Regression',LogisticRegression(solver='lbfgs',class_weight='balanced',
#                                                         random_state=100)))
# models.append(('Random Forest',RandomForestClassifier(n_estimators=100,
#                                                       random_state=100)))
# models.append(('Bagging Classifier',BaggingClassifier(random_state=100)))
# models.append(('AdaBoost',AdaBoostClassifier(random_state=100)))
# models.append(('GBC',GradientBoostingClassifier(random_state=100)))

# #Model Evaluation
# results =[]
# names=[]
# scoring ='accuracy'
# print('Model Evaluation - Accuracy')
# for name, model in models:
#     rkf=RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
#     cv_results = cross_val_score(model,x2,Y2,cv=rkf,scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     print('{} {:.2f} +/- {:.2f}'.format(name,cv_results.mean(),cv_results.std()))
# print('\n')

# #Boxplot View
# fig = plt.figure(figsize=(15,10))
# fig.suptitle('Boxplot View')
# ax = fig.add_subplot(111)
# sns.boxplot(data=results)
# ax.set_xticklabels(names)
# plt.ylabel('Accuracy')
# plt.xlabel('Model')
# plt.show()

# #Script for Models

# from sklearn.metrics import classification_report, confusion_matrix  

# models2 ={'Logistic Regression':LogisticRegression(solver='lbfgs',
#                                                    class_weight='balanced',random_state=100),
#           'Random Forest':RandomForestClassifier(n_estimators=100,random_state=100),
#           'Bagging Classifier':BaggingClassifier(random_state=100),
#           'AdaBoost':AdaBoostClassifier(random_state=100),
#           'GBC':GradientBoostingClassifier(random_state=100)}

# for name, model in models2.items():
#     model.fit(x_train3,y_train_smt)
#     predict = model.predict(x_test3)
#     print('\nEstimator: {}'.format(name)) 
#     print('\n',confusion_matrix(y_test2,predict))  
#     print(classification_report(y_test2,predict))  

# #Create Voting Model - Sklearn
# from sklearn.ensemble import VotingClassifier
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_validate

# estimators = []

# model1 = LogisticRegression(solver='lbfgs',class_weight='balanced',
#                             random_state=100)
# estimators.append(('Logistic', model1))

# model2 = RandomForestClassifier(n_estimators=100,random_state=100)
# estimators.append(('Random Forest', model2))

# voting_clf=VotingClassifier(estimators,voting='soft')

# scoring = {'acc': 'accuracy',
#            'prec_macro': 'precision_macro',
#            'rec_macro': 'recall_macro'}
# print('\nVoting Model')
# for clf in (model1,model2,voting_clf):
#     rkfcv= clf.fit(x_train3,y_train_smt)
#     ens_rkf1 = RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
#     rKFcv = cross_validate(rkfcv, x_2, Y2, scoring=scoring, cv=ens_rkf1)
#     print(clf.__class__.__name__,round(rKFcv['test_rec_macro'].mean(),2))   

# #Create Stacking Model-Sklearn
# from sklearn.ensemble import StackingClassifier

# #Identify Models
# lr = LogisticRegression(solver='lbfgs',class_weight='balanced',
#                         random_state=100)

# estimators2 = []

# mod1 = RandomForestClassifier(n_estimators=100,random_state=100)
# estimators2.append(('Random Forest', mod1))

# mod2 = BaggingClassifier(random_state=100)
# estimators2.append(('Bagging', mod2))

# #Create Stacking Classifier
# stackmod=StackingClassifier(estimators=estimators2,
#                              final_estimator=lr)

# scoring2 = {'acc': 'accuracy',
#            'prec_macro': 'precision_macro',
#            'rec_macro': 'recall_macro'}

# print('\nStacking Model')
# for clf in (mod1,mod2,stackmod):
#     rkfcv2= clf.fit(x_train3,y_train_smt)
#     ens_rkf2 = RepeatedKFold(n_splits=10, n_repeats=5, random_state=100)
#     rKFcv2 = cross_validate(rkfcv2, x_2, Y2, scoring=scoring2, cv=ens_rkf2)
#     print(clf.__class__.__name__,round(rKFcv2['test_rec_macro'].mean(),2))  

