# Scikit-learn intro

## Get the data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the iris measurements; `names` supplies the column labels for the frame.
iris_data = pd.read_csv(
    "iris.csv",
    names=["sepal_l", "sepal_w", "petal_l", "petal_w", "class"],
)

## Explore the data

In [3]:
# Dimensions of the loaded frame as (rows, columns)
iris_data.shape

(150, 5)

In [5]:
# Peek at the first five rows of the frame
iris_data.head(5)

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
iris_data.describe()

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [None]:
import seaborn as sns

# Pairwise scatter plots of every numeric column, colored by the "class" column
pair_plot = sns.pairplot(data=iris_data, hue="class")


## Separate the data into X (features) and y (target)

In [7]:

# Feature matrix X: the four measurement columns as a NumPy array
feature_cols = ["sepal_l", "sepal_w", "petal_l", "petal_w"]
X = iris_data[feature_cols].values

# Target vector y: encode the three species names as integer labels 0, 1, 2
# (this is a three-class problem, not a binary one)
class_to_label = {"Iris-setosa": 0, "Iris-versicolor": 1, "Iris-virginica": 2}
y_encoded = iris_data["class"].map(class_to_label)

# .map() silently produces NaN for any name missing from the dict (which also
# turns the labels into floats), so fail loudly instead of training on bad labels.
if y_encoded.isna().any():
    unknown = sorted(iris_data.loc[y_encoded.isna(), "class"].astype(str).unique())
    raise ValueError(f"Unmapped class labels: {unknown}")

y = y_encoded.values

In [8]:
# First five rows of the feature matrix
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [9]:
# Integer-encoded class labels for every sample
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

## Split the data into train and test sets

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)


In [11]:
# (rows, columns) of the training and test feature matrices
X_train.shape, X_test.shape

((105, 4), (45, 4))

In [12]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only,
# so nothing about the test split influences the transform.
stdsc = StandardScaler()
stdsc.fit(X_train)

# Apply those training statistics to both splits
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [13]:
# First five training rows, before scaling
X_train[:5]

array([[7.7, 2.6, 6.9, 2.3],
       [5.7, 3.8, 1.7, 0.3],
       [5. , 3.6, 1.4, 0.2],
       [4.8, 3. , 1.4, 0.3],
       [5.2, 2.7, 3.9, 1.4]])

In [14]:
# The same five training rows after standardization
X_train_std[:5]

array([[ 2.26050169, -1.04854298,  1.77691682,  1.4226136 ],
       [-0.11897377,  1.8551145 , -1.14624252, -1.13955497],
       [-0.95179019,  1.37117159, -1.31488633, -1.2676634 ],
       [-1.18973773, -0.08065715, -1.31488633, -1.13955497],
       [-0.71384264, -0.80657152,  0.09047874,  0.26963774]])

# Algorithms
## Create and evaluate some models

# REGRESSION

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Regression task carved out of the feature matrix X:
#   inputs -> every column except the last ("sepal_l", "sepal_w", "petal_l")
#   target -> the last column ("petal_w", i.e. petal width)
X_lin, y_lin = X[:, :-1], X[:, -1]

# Unfitted estimator; it is trained in a later cell
lin_reg = LinearRegression()

In [None]:
# First five rows of the regression inputs
X_lin[:5]

In [None]:
# First five regression targets
y_lin[:5]

In [None]:
# Build a single test sample by hand. It repeats the first row's three input
# features, whose target value is already known, so the prediction can be checked.
first_row_features = np.array([5.1, 3.5, 1.4])

# Estimators expect a 2D (n_samples, n_features) array, so turn the 1D vector into one row
new_data_pt = first_row_features.reshape(1, -1)

# Show the 1D vector next to its 2D counterpart
first_row_features, new_data_pt

In [None]:
# Train on all rows, then predict the petal width ("petal_w") of the hand-built sample.
# The sample repeats the first row's features, so the prediction can be compared
# with that row's recorded petal width of 0.2.
lin_reg.fit(X_lin, y_lin).predict(new_data_pt)

# CLASSIFICATION

### Logistic Regression

In [None]:
# Multiclass classifier: y holds three species labels (0, 1, 2), not two
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize inside a pipeline: the scaler is fitted on the training split only
# and applied automatically at predict/score time, so callers keep passing the
# raw X_train / X_test arrays while the model itself sees scaled features
# (which also helps the solver converge).
log_reg = make_pipeline(StandardScaler(), LogisticRegression())
log_reg.fit(X_train, y_train)

# Predicted class labels for the held-out samples
log_reg.predict(X_test)

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split
log_reg.score(X_test, y_test)

### Support Vector Machine

In [None]:
from sklearn.svm import SVC

# SVMs are sensitive to feature scale, so train and evaluate on the
# standardized splits prepared earlier rather than on the raw measurements.
# Variation to try: SVC(kernel='rbf', C=1)
svm = SVC(kernel='linear')
svm.fit(X_train_std, y_train)

# Mean accuracy on the (standardized) test split
svm.score(X_test_std, y_test)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Trees split on per-feature thresholds, so the raw (unscaled) features are fine here.
# Seed the estimator: split selection involves randomness, so without a fixed
# random_state the fitted tree, and therefore the score, can change between runs.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train, y_train)

# Mean accuracy on the test split
tree.score(X_test, y_test)

### K-nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-NN classifies by distance, so feature scale decides which neighbours count
# as "nearest"; train and evaluate on the standardized splits prepared earlier.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_std, y_train)

# Mean accuracy on the (standardized) test split
knn.score(X_test_std, y_test)