## Prostate size prediction

#### Joseph O'Malley

Notebook 4: Prostate Dataset 

#### Load in Prostate dataset

In [1]:
## load in data and display data frame

# import module(s) into namespace
import pandas as pd #we almost always need pandas because we like data frames
import numpy as np
pd.set_option('display.max_colwidth', 150) #important for getting all the text



##load in names
c=pd.read_csv('/Users/ultrajosef/Downloads/Macbook 2012 Files/BIA6303/data/prostate.csv'
              ,sep = ",")
c.shape
c.head()

Unnamed: 0,Obs,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
0,1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783
1,2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519
2,3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519
3,4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519
4,5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564


In [2]:
## check datatypes
c.dtypes

Obs          int64
lcavol     float64
lweight    float64
age          int64
lbph       float64
svi          int64
lcp        float64
gleason      int64
pgg45        int64
lpsa       float64
dtype: object

In [3]:
### Replace nulls with median
c.fillna((c.median()), inplace=True)

In [4]:
## check distribution of numeric columns
c.describe()

Unnamed: 0,Obs,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa
count,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0
mean,49.0,1.35001,3.628943,63.865979,0.100356,0.216495,-0.179366,6.752577,24.381443,2.478387
std,28.145456,1.178625,0.428411,7.445117,1.450807,0.413995,1.39825,0.722134,28.204035,1.154329
min,1.0,-1.347074,2.374906,41.0,-1.386294,0.0,-1.386294,6.0,0.0,-0.430783
25%,25.0,0.512824,3.37588,60.0,-1.386294,0.0,-1.386294,6.0,0.0,1.731656
50%,49.0,1.446919,3.623007,65.0,0.300105,0.0,-0.798508,7.0,15.0,2.591516
75%,73.0,2.127041,3.876396,68.0,1.558145,0.0,1.178655,7.0,40.0,3.056357
max,97.0,3.821004,4.780383,79.0,2.326302,1.0,2.904165,9.0,100.0,5.582932


In [5]:
# move the column to head of list using index, pop and insert
cols = list(c)
cols.insert(0, cols.pop(cols.index('lpsa')))
cols

['lpsa',
 'Obs',
 'lcavol',
 'lweight',
 'age',
 'lbph',
 'svi',
 'lcp',
 'gleason',
 'pgg45']

In [6]:
# use ix to reorder
c = c.loc[:, cols]

# drop 
c=c.drop(['Obs'], axis=1)
c.head()

Unnamed: 0,lpsa,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45
0,-0.430783,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0
1,-0.162519,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0
2,-0.162519,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20
3,-0.162519,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0
4,0.371564,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0


### Create Test/Train Split of data

In [7]:
from sklearn.model_selection import train_test_split

# split dataset into testing and training (70/30)
features_train, features_test, target_train, target_test = train_test_split(
    c.iloc[:,1:].values, c.iloc[:,0].values, test_size=0.30, random_state=0)

In [8]:
print(features_test.shape)
print(features_train.shape)
print(target_test.shape)
print(target_train.shape)

(30, 8)
(67, 8)
(30,)
(67,)


### Model 1 - LASSO Regression

In [9]:
# Lasso Regression
# fit a LASSO model to the data
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.1, random_state=33)
model.fit(features_train, target_train)
print(model)
# make predictions
expected = target_test
predicted = model.predict(features_test)
# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print("AUC" ,mse)
print("R-squared" ,model.score(features_test, target_test))

Lasso(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=33,
   selection='cyclic', tol=0.0001, warm_start=False)
AUC 0.6442585177886749
R-squared 0.5746702269306728


In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, roc_auc_score

expected = target_test
predicted = model.predict(features_test)

## create threshold (median) for testing accuracy of classification
expected = np.where(expected > c.iloc[:,0].values.mean(), 1, 0)
predicted = np.where(predicted > c.iloc[:,0].values.mean(), 1, 0)

## output predictions
false_positive_rate, true_positive_rate, thresholds = roc_curve(expected, predicted)
print("AUC:" ,auc(false_positive_rate, true_positive_rate))
print(classification_report(expected, predicted))
print(confusion_matrix(expected, predicted))

AUC: 0.9027149321266968
             precision    recall  f1-score   support

          0       0.86      0.92      0.89        13
          1       0.94      0.88      0.91        17

avg / total       0.90      0.90      0.90        30

[[12  1]
 [ 2 15]]


In [11]:
## create Grid Search to optimize tuning parameters
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
import time

# use a full grid over all parameters
param_grid = {"alpha": [.001, .1, 1, 10, 100]}
start_time = time.clock()

# run grid search
grid_search = GridSearchCV(model, param_grid=param_grid,n_jobs=-1)
grid_search.fit(features_train, target_train)


#print(grid_search.cv_results_)
#print("SCORES", grid_search.cv_results_)
print("BEST SCORE", grid_search.best_score_)
print("BEST PARAM", grid_search.best_params_)
print("Time to run", time.clock() - start_time, "seconds")

BEST SCORE 0.489288571398629
BEST PARAM {'alpha': 0.001}
Time to run 0.0730869999999999 seconds


### Model 2 - Ridge Regression

In [12]:
# Ridge Regression
import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge

# fit a ridge regression model to the data
model = Ridge(alpha=0.1, random_state=33)
model.fit(features_train, target_train)
print(model)
# make predictions
expected = target_test
predicted = model.predict(features_test)
# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print("MSE" ,mse)
print("R-squared" ,model.score(features_test, target_test))

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=33, solver='auto', tol=0.001)
MSE 0.6194699213834604
R-squared 0.5910352850441853


In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, roc_auc_score

expected = target_test
predicted = model.predict(features_test)

## check mean rate of violent crime
print("Average Rate of Violent Crime:" ,c.iloc[:,0].values.mean())

## create threshold (mean) for testing accuracy of classification
expected = np.where(expected > c.iloc[:,0].values.mean(), 1, 0)
predicted = np.where(predicted > c.iloc[:,0].values.mean(), 1, 0)

## output predictions
false_positive_rate, true_positive_rate, thresholds = roc_curve(expected, predicted)
print("AUC:" ,auc(false_positive_rate, true_positive_rate))
print(classification_report(expected, predicted))
print(confusion_matrix(expected, predicted))

Average Rate of Violent Crime: 2.4783868783505154
AUC: 0.9117647058823529
             precision    recall  f1-score   support

          0       0.81      1.00      0.90        13
          1       1.00      0.82      0.90        17

avg / total       0.92      0.90      0.90        30

[[13  0]
 [ 3 14]]


In [14]:
## create Grid Search to optimize tuning parameters
# use a full grid over all parameters
param_grid = {"alpha": [.001, .1, 1, 10, 100]}
start_time = time.clock()


# run grid search
grid_search = GridSearchCV(model, param_grid=param_grid,n_jobs=-1, cv=5)
grid_search.fit(features_train, target_train)


#print(grid_search.cv_results_)
#print("SCORES", grid_search.cv_results_)
print("BEST SCORE", grid_search.best_score_)
print("BEST PARAM", grid_search.best_params_)
print("Time to run", time.clock() - start_time, "seconds")

BEST SCORE 0.5372136888096438
BEST PARAM {'alpha': 1}
Time to run 0.18629700000000016 seconds


### Model 3 - ElasticNet Regression

In [15]:
# ElasticNet Regression
# fit a model to the data
from sklearn.linear_model import ElasticNet
model = ElasticNet(alpha=0.1, random_state=33)
model.fit(features_train, target_train)
print(model)
# make predictions
expected = target_test
predicted = model.predict(features_test)
# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print(mse)
print(model.score(features_test, target_test))

ElasticNet(alpha=0.1, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=33, selection='cyclic', tol=0.0001, warm_start=False)
0.5774250485335829
0.6187926770447563


In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, roc_auc_score
#Notice I did not load my Datamining packages yet (ie decision tree). I will do that as I use them.

expected = target_test
predicted = model.predict(features_test)

## create threshold (mean) for testing accuracy of classification
expected = np.where(expected > c.iloc[:,0].values.mean(), 1, 0)
predicted = np.where(predicted > c.iloc[:,0].values.mean(), 1, 0)

## output predictions
false_positive_rate, true_positive_rate, thresholds = roc_curve(expected, predicted)
print("DT Accuracy Score", accuracy_score(expected, predicted))
print("AUC:" ,auc(false_positive_rate, true_positive_rate))
print(classification_report(expected, predicted))
print(confusion_matrix(expected, predicted))

DT Accuracy Score 0.8666666666666667
AUC: 0.8733031674208145
             precision    recall  f1-score   support

          0       0.80      0.92      0.86        13
          1       0.93      0.82      0.87        17

avg / total       0.88      0.87      0.87        30

[[12  1]
 [ 3 14]]


In [17]:
## create Grid Search to optimize tuning parameters
# use a full grid over all parameters
param_grid = {"alpha": [.001, .1, 1, 10, 100]}
start_time = time.clock()


# run grid search
grid_search = GridSearchCV(model, param_grid=param_grid,n_jobs=-1, cv=5)
grid_search.fit(features_train, target_train)


#print(grid_search.cv_results_)
#print("SCORES", grid_search.cv_results_)
print("BEST SCORE", grid_search.best_score_)
print("BEST PARAM", grid_search.best_params_)
print("Time to run", time.clock() - start_time, "seconds")

BEST SCORE 0.5297170951266009
BEST PARAM {'alpha': 0.001}
Time to run 0.21548999999999996 seconds


### Model 4 - OLS regression

In [18]:
from sklearn import linear_model

model = linear_model.LinearRegression()

model.fit(features_train, target_train)
print(model)
# make predictions
expected = target_test
predicted = model.predict(features_test)
# summarize the fit of the model
mse = np.mean((predicted-expected)**2)
print("MSE" ,mse)
print("R-squared" ,model.score(features_test, target_test))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
MSE 0.6211384548198878
R-squared 0.5899337443919792


In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, roc_auc_score
#Notice I did not load my Datamining packages yet (ie decision tree). I will do that as I use them.

expected = target_test
predicted = model.predict(features_test)

## create threshold (mean) for testing accuracy of classification
expected = np.where(expected > c.iloc[:,0].values.mean(), 1, 0)
predicted = np.where(predicted > c.iloc[:,0].values.mean(), 1, 0)

## output predictions
false_positive_rate, true_positive_rate, thresholds = roc_curve(expected, predicted)
print("AUC:" ,auc(false_positive_rate, true_positive_rate))
print(classification_report(expected, predicted))
print(confusion_matrix(expected, predicted))

AUC: 0.9117647058823529
             precision    recall  f1-score   support

          0       0.81      1.00      0.90        13
          1       1.00      0.82      0.90        17

avg / total       0.92      0.90      0.90        30

[[13  0]
 [ 3 14]]


## END