In [20]:
# Required Python Packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
 
import pdb
 
# File Paths
#OUTPUT_PATH = "/home/supradha/PycharmProjects/Flaskhello/dataset/breast-cancer-wisconsin.csv"
 
# Headers
HEADERS = ["CodeNumber", "ClumpThickness", "UniformityCellSize", "UniformityCellShape", "MarginalAdhesion",
           "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin", "NormalNucleoli", "Mitoses", "CancerType"]
 
 
def read_data(path):
    """
    Read the data into pandas dataframe
    :param path:
    :return:
    """
    data = pd.read_csv("/home/supradha/PycharmProjects/Flaskhello/dataset/breast-cancer-wisconsin.csv")
    return data
 
 
def get_headers(dataset):
    """
    dataset headers
    :param dataset:
    :return:
    """
    return dataset.columns.values
 
 
def add_headers(dataset, headers):
    """
    Add the headers to the dataset
    :param dataset:
    :param headers:
    :return:
    """
    dataset.columns = headers
    return dataset
 
 
def split_dataset(dataset, train_percentage, feature_headers, target_header):
    """
    Split the dataset with train_percentage
    :param dataset:
    :param train_percentage:
    :param feature_headers:
    :param target_header:
    :return: train_x, test_x, train_y, test_y
    """
 
    # Split dataset into train and test dataset
    train_x, test_x, train_y, test_y = train_test_split(dataset[feature_headers], dataset[target_header],
                                                        train_size=train_percentage)
    #print (dataset[feature_headers])
    return train_x, test_x, train_y, test_y
 
 
def handel_missing_values(dataset, missing_values_header, missing_label):
    """
    Filter missing values from the dataset
    :param dataset:
    :param missing_values_header:
    :param missing_label:
    :return:
    """
 
    return dataset[dataset[missing_values_header] != missing_label]
 
 
def random_forest_classifier(features, target):
    """
    To train the random forest classifier with features and target data
    :param features:
    :param target:
    :return: trained random forest classifier
    """
    clf = RandomForestClassifier()
    clf.fit(features, target)
    return clf
 
 
def dataset_statistics(dataset):
    """
    Basic statistics of the dataset
    :param dataset: Pandas dataframe
    :return: None, print the basic statistics of the dataset
    """
    #print (dataset.describe())
 
 
def main():
    """
    Main function
    :return:
    """
    # Load the csv file into pandas dataframe
    dataset = pd.read_csv("/home/supradha/PycharmProjects/Flaskhello/dataset/breast-cancer-wisconsin.csv")
    # Get basic statistics of the loaded dataset
    dataset_statistics(dataset)
 
    # Filter missing values
    dataset = handel_missing_values(dataset, HEADERS[6], '?')
    train_x, test_x, train_y, test_y = split_dataset(dataset, 0.7, HEADERS[1:-1], HEADERS[-1])
 
    # Train and Test dataset size details
    print ("Train_x Shape :: ", train_x.shape)
    print ("Train_y Shape :: ", train_y.shape)
    print ("Test_x Shape :: ", test_x.shape)
    print ("Test_y Shape :: ", test_y.shape)
 
    # Create random forest classifier instance
    trained_model = random_forest_classifier(train_x, train_y)
    #print ("Trained model :: ", trained_model)
    test_x = [[1,1,1,1,2,10,3,1,1]]
    print (test_x)
    predictions = trained_model.predict(test_x)
    print (predictions)
 
    for i in range(0, 5):
        print ("Actual outcome :: {} and Predicted outcome :: {}".format(list(test_y)[i], predictions[i]))
 
    print ("Train Accuracy :: ", accuracy_score(train_y, trained_model.predict(train_x)))
    print ("Test Accuracy  :: ", accuracy_score(test_y, predictions))
    print (" Confusion matrix ", confusion_matrix(test_y, predictions))
 
 
if __name__ == "__main__":
    main()

Train_x Shape ::  (477, 9)
Train_y Shape ::  (477,)
Test_x Shape ::  (205, 9)
Test_y Shape ::  (205,)
[[1, 1, 1, 1, 2, 10, 3, 1, 1]]
[2]
Actual outcome :: 2 and Predicted outcome :: 2




IndexError: index 1 is out of bounds for axis 0 with size 1