# Load and parse data

In [18]:
import pandas as pd
import numpy as np

from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

from sklearn.preprocessing import LabelEncoder

In [19]:
# Column layout of wdbc.data: an id, the diagnosis label, then the same ten
# measurements repeated in three consecutive groups (suffixes _1, _2, _3).
# NOTE(review): the UCI description presents the three groups as mean,
# standard error and "worst" value rather than separate nuclei -- confirm
# before relying on the "nucleus" naming.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
features_per_nucleus = ["radius", "texture", "perimeter", "area", "smoothness", "compactness",
        "concavity", "concave_points", "symmetry", "fractal_dimension"]
features = [
    feature + "_" + str(nucleus_id)
    for nucleus_id in range(1, 4)
    for feature in features_per_nucleus
]
names = ["id", "diagnosis"] + features
# The raw file has no header row, so the column names are supplied explicitly.
dataset = pd.read_csv(url, names=names)
# The id column is not used as a feature. The axis is passed by keyword because
# the positional form drop('id', 1) was deprecated and later removed in pandas.
dataset = dataset.drop("id", axis=1)

# Encode the diagnosis strings as integer class labels.
le = LabelEncoder()
dataset["diagnosis"] = le.fit_transform(dataset["diagnosis"])

In [20]:
# TODO: find a more elegant way to hand the data over to Spark than a
# hard-coded export directory.
home_dir = "/smartdata/uxcyy/"
# Write a header-less, index-less CSV with the label in the first column.
# header/na_rep use the documented argument types (bool and str); missing
# values are written as "nan".
dataset.to_csv(home_dir + "breast-cancer-wisconsin.txt", sep=",", index=False, header=False, na_rep="nan")

# Create Labeled Points

In [21]:
# Read the exported CSV back as an RDD of text lines. home_dir is reused so the
# read path cannot drift from the path the file was written to.
points = sc.textFile(home_dir + "breast-cancer-wisconsin.txt")
# Peek at the first four raw lines.
points.take(4)

[u'1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189',
 u'1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902',
 u'1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758',
 u'1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173']

In [27]:
def parse_point(line):
    """Turn one comma-separated text line into an MLlib LabeledPoint.

    The first field becomes the label and all remaining fields become the
    feature vector; every field is converted with float(), so a field that
    is not a valid number raises ValueError.
    """
    fields = line.strip().split(",")
    label = float(fields[0])
    feature_values = [float(field) for field in fields[1:]]
    return LabeledPoint(label, feature_values)

# Convert every raw text line of the RDD into a LabeledPoint via parse_point.
parsed_data = points.map(parse_point)

In [29]:
# Peek at the first four parsed records to sanity-check the conversion.
parsed_data.take(4)

[LabeledPoint(1.0, [17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189]),
 LabeledPoint(1.0, [20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902]),
 LabeledPoint(1.0, [19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758]),
 LabeledPoint(1.0, [11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173])]

# Train-Test-Split

In [30]:
# Split the points with weights 0.7 / 0.3 into training and test sets.
# randomSplit partitions randomly, so a fixed seed keeps the split
# reproducible across notebook runs.
(training_data, test_data) = parsed_data.randomSplit([0.7, 0.3], seed=42)

# Fit a Gini decision tree (depth <= 3) for the two diagnosis classes; the
# empty categoricalFeaturesInfo means every feature is treated as continuous.
model = DecisionTree.trainClassifier(training_data, numClasses=2, categoricalFeaturesInfo={}, impurity="gini",
                                    maxDepth=3, maxBins=32)

KeyboardInterrupt: 