In [1]:
from sklearn.pipeline import Pipeline
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn import tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from time import time
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, roc_auc_score, log_loss
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import graphviz
import pickle

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# session. NOTE(review): this hides the warning rather than fixing whatever
# assignment triggers it; prefer taking an explicit .copy() of any slice.
pd.options.mode.chained_assignment = None  # default='warn'

In [2]:
def preprocessdataframe(df):
    """Impute missing values and one-hot encode the categorical columns.

    Missing ``Age`` and ``Fare`` entries are replaced by the mean of the
    non-missing values of that column in the frame that is passed in,
    missing ``Embarked`` entries are replaced by ``'S'``, and the columns
    ``Embarked``, ``Pclass`` and ``Sex`` are expanded into indicator columns
    by ``pd.get_dummies``.

    All work is done on a copy, so the caller's frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns ``Age``, ``Fare``,
        ``Embarked``, ``Pclass`` and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the imputed values and with ``Embarked``,
        ``Pclass`` and ``Sex`` replaced by their indicator columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Copy up front: writing the imputed values into ``df`` itself would
    # silently change the caller's data as a side effect of this call.
    prepared = df.copy()

    # Mean imputation. Series.mean() ignores NaN, so the fill value is the
    # mean of the observed entries. A column that is entirely NaN has no
    # mean and is therefore left as NaN.
    for column in ('Age', 'Fare'):
        prepared[column] = prepared[column].fillna(prepared[column].mean())

    prepared['Embarked'] = prepared['Embarked'].fillna('S')

    return pd.get_dummies(data=prepared, columns=['Embarked', 'Pclass', 'Sex'])

In [3]:
def showroccurve(fpr, tpr, roc_auc, label, color):
    """Plot one ROC curve in a new figure and display it.

    Parameters
    ----------
    fpr, tpr :
        x and y values of the curve (false / true positive rates).
    roc_auc :
        Number shown in the legend as the area, formatted to two decimals.
    label :
        Name inserted into the legend entry.
    color :
        Matplotlib colour used for the curve.
    """
    axes = plt.figure().gca()
    line_width = 2

    legend_entry = 'ROC curve - {0} (area = {1:0.2f})'.format(label, roc_auc)
    axes.plot(fpr, tpr, color=color, lw=line_width, label=legend_entry)

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver operating characteristic example')
    axes.legend(loc="lower right")
    plt.show()

In [4]:
def showmultiroccurve(params):
    """Plot several ROC curves together in one new figure and display it.

    Parameters
    ----------
    params :
        Iterable of dictionaries, one per curve, each providing the keys
        ``"fpr"``, ``"tpr"``, ``"roc_auc"``, ``"label"`` and ``"color"``.
    """
    axes = plt.figure().gca()
    line_width = 2

    for curve in params:
        legend_entry = 'ROC curve - {0} (area = {1:0.2f})'.format(
            curve["label"], curve["roc_auc"])
        axes.plot(curve["fpr"], curve["tpr"], color=curve["color"],
                  lw=line_width, label=legend_entry)

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
    axes.plot([0, 1], [0, 1], color='navy', lw=line_width, linestyle='--')

    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.05])
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('Receiver operating characteristic example')
    axes.legend(loc="lower right")
    plt.show()

In [5]:
def showdecisiontree(model, feature_names, name):
    """Export a decision tree to Graphviz and render it to a file.

    Parameters
    ----------
    model :
        Tree passed to ``tree.export_graphviz``.
    feature_names :
        Names used to label the features in the exported graph.
    name :
        File name handed to ``graphviz.Source.render``.

    Returns
    -------
    bool
        Always ``True``.
    """
    export_options = dict(
        out_file=None,
        feature_names=feature_names,
        class_names=['Did not survive', 'Survived'],
        filled=True,
        rounded=True,
        special_characters=True,
    )
    dot_source = tree.export_graphviz(model, **export_options)
    graphviz.Source(dot_source).render(name)
    return True

In [6]:
def pickle_save(name, item):
    """Pickle ``item`` into the file ``<name>.pickle``.

    ``name`` is converted with ``str()`` and the ``.pickle`` suffix is
    appended; an existing file at that path is overwritten.
    """
    target_path = "".join((str(name), ".pickle"))
    with open(target_path, "wb") as sink:
        pickle.dump(item, sink)




In [7]:
def pickle_load(name):
    """Return the object stored in the file ``<name>.pickle``.

    ``name`` is converted with ``str()`` and the ``.pickle`` suffix is
    appended before the file is opened for reading.

    Unpickling can execute arbitrary code, so only use this on files that
    come from a trusted source.
    """
    source_path = "".join((str(name), ".pickle"))
    with open(source_path, "rb") as stream:
        return pickle.load(stream)

In [8]:
# Load the previously pickled model from "mymodel.pickle" (path is relative
# to the current working directory). The cells below call predict() and
# predict_proba() on it.
model = pickle_load("mymodel")

In [9]:
def label_probability(row):
    """Return the probability that belongs to the row's predicted label.

    ``row`` must support lookup by the keys ``'output'``,
    ``'no_probability'`` and ``'yes_probability'``. When ``row['output']``
    equals 0 the ``'no_probability'`` value is returned, otherwise the
    ``'yes_probability'`` value.
    """
    chosen_key = 'no_probability' if row['output'] == 0 else 'yes_probability'
    return row[chosen_key]

In [10]:
# Score the Titanic test set with the loaded model.
# NOTE(review): this path only resolves on a machine with this exact
# directory layout under the user's home directory.
df = pd.read_csv('~/Documents/GitHub/TiberDataScienceLearning/Data/Titanic/test.csv')

# Take an explicit copy of the feature columns so that the result columns
# added below go into an independent frame instead of a slice of ``df``
# (assigning into such a slice is what raises SettingWithCopyWarning).
x = df[['Pclass','Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()

# Predict on the feature columns only, before any result column is attached.
y_predictions = model.predict(x)
y_probabilities = model.predict_proba(x)

x['output'] = y_predictions
# Probability of the predicted label: column 0 of predict_proba where the
# prediction is 0, column 1 otherwise. Done in one vectorised step rather
# than row by row through temporary columns.
x['probability'] = np.where(
    np.asarray(y_predictions) == 0,
    y_probabilities[:, 0],
    y_probabilities[:, 1],
)

In [11]:
# Display the feature frame together with the 'output' and 'probability' columns.
x

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,output,probability
0,3,male,34.5,0,0,7.8292,Q,0,0.813150
1,3,female,47.0,1,0,7.0000,S,1,0.518283
2,2,male,62.0,0,0,9.6875,Q,0,0.740192
3,3,male,27.0,0,0,8.6625,S,0,0.816938
4,3,female,22.0,1,1,12.2875,S,1,0.570444
5,3,male,14.0,0,0,9.2250,S,0,0.804592
6,3,female,30.0,0,0,7.6292,Q,1,0.524462
7,2,male,26.0,1,1,29.0000,S,0,0.699599
8,3,female,18.0,0,0,7.2292,C,1,0.533733
9,3,male,21.0,2,0,24.1500,S,0,0.775399


In [12]:
# Write the scored frame to "outputfile.csv" in the current working directory.
# NOTE(review): to_csv also writes the row index as the first column by
# default; pass index=False if that column is not wanted.
x.to_csv("outputfile.csv")