# Scikit-learn Introduction

## Get the data

In [1]:
import pandas as pd

# hides Jupyter warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# The iris dataset: sepal/petal measurements used to predict the species of Iris flower.
# Column names are supplied here, so every row of the file is read as data.
iris_data = pd.read_csv(
    "iris.csv",
    header=None,
    names=["sepal_l", "sepal_w", "petal_l", "petal_w", "species"],
)

In [3]:
# preview the first five rows of the loaded frame
iris_data.head(n=5)

Unnamed: 0,sepal_l,sepal_w,petal_l,petal_w,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


## Separate the data into X (features) and y (target)

In [4]:
# Capital X conventionally holds all of the features the algorithm learns from.
# `.values` hands back the underlying array rather than a DataFrame.
X = iris_data.loc[:, ["sepal_l", "sepal_w", "petal_l", "petal_w"]].values

In [5]:
# first five feature rows, all four columns
X[:5, :]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [6]:
# list the distinct class labels found in the target column
iris_data.loc[:, "species"].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [7]:
# Lowercase y conventionally holds the classifications you would like to predict.
# Most algorithms need numeric targets, so .map() swaps each species string for an integer code.
y = (
    iris_data["species"]
    .map({
        "Iris-setosa": 0,
        "Iris-versicolor": 1,
        "Iris-virginica": 2,
    })
    .values
)

In [8]:
# display the encoded target vector (species strings mapped to the integer codes 0, 1, 2)
y

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)

## Split the data into train and test sets

In [9]:
# scikit-learn helper that splits the dataset into train and test sets
from sklearn.model_selection import train_test_split

# hold out 30% of the rows for testing; random_state pins the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [10]:
# (rows, columns) of the training and test feature arrays
(X_train.shape, X_test.shape)

((105, 4), (45, 4))

## Standardize the data

In [11]:
# standardize features using scikit-learn
from sklearn.preprocessing import StandardScaler

# fit the scaler on the training split only, so the test split does not influence the scaling
stdsc = StandardScaler().fit(X_train)

# apply the same fitted scaling to both splits
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [12]:
# test features before standardization (first five rows)
X_test[:5, :]

array([[5.8, 4. , 1.2, 0.2],
       [5.1, 2.5, 3. , 1.1],
       [6.6, 3. , 4.4, 1.4],
       [5.4, 3.9, 1.3, 0.4],
       [7.9, 3.8, 6.4, 2. ]])

In [13]:
# the same five test rows after standardization
X_test_std[:5, :]

array([[-1.05669938e-15,  2.33905742e+00, -1.42731553e+00,
        -1.26766340e+00],
       [-8.32816412e-01, -1.29051444e+00, -4.15452683e-01,
        -1.14687545e-01],
       [ 9.51790185e-01, -8.06571522e-02,  3.71551755e-01,
         2.69637740e-01],
       [-4.75895093e-01,  2.09708596e+00, -1.37110093e+00,
        -1.01144654e+00],
       [ 2.49844924e+00,  1.85511450e+00,  1.49584381e+00,
         1.03828831e+00]])

# Algorithms
##  Create and evaluate some models

# REGRESSION

### Linear Regression

In [25]:
# Training examples for learning the Celsius to Fahrenheit relationship (F = C*1.8 + 32)

# INPUT:  Celsius, one single-feature row per example
X = [
    [0],
    [8],
    [15],
    [22],
]

# OUTPUT:  Fahrenheit, the matching value for each Celsius row
y = [
    [32],
    [46.4],
    [59],
    [71.6],
]

In [26]:
# Build a Linear Regression model
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()

# learn the relationship from the training examples (X = inputs, y = outputs)
lin_reg.fit(X=X, y=y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [27]:
# ask the fitted model for the output at a new, unseen input (38)
lin_reg.predict(X=[[38]])

array([[100.4]])

In [28]:
# learned slope; should be close to 1.8 given the F = C*1.8 + 32 training examples
lin_reg.coef_

array([[1.8]])

In [29]:
# learned intercept; should be close to 32 given the F = C*1.8 + 32 training examples
lin_reg.intercept_

array([32.])

# CLASSIFICATION

### Logistic Regression

In [14]:
# Logistic regression classifier. The iris target has three classes (0, 1, 2),
# so this is a multiclass fit rather than a binary one.
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()

# trained on the raw (unstandardized) training features
log_reg.fit(X=X_train, y=y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [15]:
# predicted class label for every row of the test set
log_reg.predict(X=X_test)

array([0, 1, 1, 0, 2, 2, 2, 0, 0, 2, 1, 0, 2, 1, 2, 0, 1, 2, 0, 0, 1, 2,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2,
       1], dtype=int64)

In [17]:
# per-class probability estimates: one row per test sample, one column per class
log_reg.predict_proba(X=X_test)

array([[9.26701103e-01, 7.32874624e-02, 1.14343567e-05],
       [2.80435365e-01, 6.23007144e-01, 9.65574911e-02],
       [4.79341944e-02, 7.51631375e-01, 2.00434431e-01],
       [9.40151823e-01, 5.98009805e-02, 4.71965298e-05],
       [1.13303388e-03, 2.97854278e-01, 7.01012688e-01],
       [2.72398208e-02, 4.17469090e-01, 5.55291089e-01],
       [3.44247270e-03, 2.74280433e-01, 7.22277094e-01],
       [9.28016630e-01, 7.16560297e-02, 3.27340183e-04],
       [8.53651745e-01, 1.45990653e-01, 3.57601689e-04],
       [8.99229762e-04, 2.25377943e-01, 7.73722827e-01],
       [2.69011925e-02, 5.40807755e-01, 4.32291053e-01],
       [9.32525056e-01, 6.73606657e-02, 1.14278375e-04],
       [5.24023874e-04, 3.24718067e-01, 6.74757909e-01],
       [4.75066650e-02, 5.86482429e-01, 3.66010906e-01],
       [1.83396339e-02, 4.77753541e-01, 5.03906825e-01],
       [8.82971089e-01, 1.16720108e-01, 3.08803089e-04],
       [4.64149328e-02, 7.17011933e-01, 2.36573134e-01],
       [1.65574625e-02, 3.29502

In [16]:
# accuracy of the fitted classifier on the held-out test set
log_reg.score(X=X_test, y=y_test)

0.8888888888888888

### Support Vector Machine

In [18]:
from sklearn.svm import SVC

# support vector classifier with a linear kernel, fitted on the training split
svm = SVC(kernel="linear").fit(X_train, y_train)

# accuracy on the held-out test split
svm.score(X_test, y_test)

1.0

### Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Pin random_state so the fitted tree, and therefore the score below, is the same on
# every Restart & Run All. Without it the tree's internal randomness can pick different
# splits between runs. The seed matches the one used for train_test_split.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train, y_train)

# accuracy on the held-out test split
tree.score(X_test, y_test)

0.9555555555555556

### K-nearest Neighbors

In [24]:
from sklearn.neighbors import KNeighborsClassifier

# classify each sample by a vote among its 3 nearest training neighbours
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# accuracy on the held-out test split
knn.score(X_test, y_test)

0.9777777777777777