In [1]:
import pandas as pd
import numpy as np

- if you want to see docstring for any method, there are two ways
    - pd.read_csv then when curser is after v, hit shift+tab. It will open the docstring. shift+tab again will show more docstring. If you hit shift+tab 3 times, it will show entire docstring in a separate dock
    - put a ? after a method name (and shift+Enter) to read the docstring (this will open the entire docstring in a separate dock). Like pd.read_csv? then shift+Enter

In [2]:
from sklearn import metrics
from sklearn import datasets
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier

def sklearn_dataset_to_df(dataset):
    df = pd.DataFrame(dataset.data, columns=dataset.feature_names)
    df['target'] = dataset.target
    return df

iris = datasets.load_iris()
df = sklearn_dataset_to_df(iris)
X_iris = df.drop(columns=['target'])
y_iris = df['target']
X_s, y_s = X_iris.loc[:,:'sepal width (cm)'],y_iris
X_p, y_p = X_iris.loc[:,'petal length (cm)':'petal width (cm)'], y_iris
X_s_train, X_s_test, y_s_train, y_s_test = train_test_split(X_s,y_s,test_size=0.25,random_state=33)
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p,y_p,test_size=0.25,random_state=33)
#always scale before running predictives
scaler_s = StandardScaler().fit(X_s_train)
scaler_p = StandardScaler().fit(X_p_train)
X_s_train_scaled = scaler_s.transform(X_s_train)
X_p_train_scaled = scaler_p.transform(X_p_train)
X_s_test_scaled = scaler_s.transform(X_s_test)
X_p_test_scaled = scaler_p.transform(X_p_test)

clf_p = SGDClassifier()
clf_s = SGDClassifier()

clf_s.fit(X_s_train_scaled, y_s_train)
clf_p.fit(X_p_train_scaled, y_p_train)

# always run predictions on test to check 
y_s_pred = clf_s.predict(X_s_test_scaled)
y_p_pred = clf_p.predict(X_p_test_scaled)

#get metrics
print("Sepal accuracy")
print("______________")
print(metrics.accuracy_score(y_s_test,y_s_pred))
print(metrics.classification_report(y_s_test, y_s_pred, target_names=iris.target_names))
print(metrics.confusion_matrix(y_s_test,y_s_pred))
print("Petal accuracy")
print("______________")
print(metrics.accuracy_score(y_p_test,y_p_pred))
print(metrics.classification_report(y_p_test, y_p_pred, target_names=iris.target_names))
print(metrics.confusion_matrix(y_p_test,y_p_pred))

Sepal accuracy
______________
0.6578947368421053
             precision    recall  f1-score   support

     setosa       1.00      0.88      0.93         8
 versicolor       0.44      0.36      0.40        11
  virginica       0.64      0.74      0.68        19

avg / total       0.66      0.66      0.65        38

[[ 7  0  1]
 [ 0  4  7]
 [ 0  5 14]]
Petal accuracy
______________
0.9473684210526315
             precision    recall  f1-score   support

     setosa       1.00      1.00      1.00         8
 versicolor       0.91      0.91      0.91        11
  virginica       0.95      0.95      0.95        19

avg / total       0.95      0.95      0.95        38

[[ 8  0  0]
 [ 0 10  1]
 [ 0  1 18]]




In [6]:
# use cross validation to improve score
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('scaler', StandardScaler()),
    ('linear_model', SGDClassifier())
])
# create a k-fold cross validation iterator of k=5 folds
cv = KFold(X_s_train.shape[0], 5, shuffle=True, random_state=33)
scores_s = cross_val_score(clf, X_s_train, y_s_train, cv=cv)
scores_p = cross_val_score(clf, X_p_train, y_p_train, cv=cv)



In [7]:
print(scores_s)
print(scores_p)

[0.86956522 0.69565217 0.72727273 0.81818182 0.90909091]
[0.73913043 0.95652174 0.77272727 0.59090909 0.90909091]
