# Scikit-learn intro

## Get the data

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the iris measurements into a DataFrame; the column names are supplied
# explicitly through `names`.
# NOTE(review): this assumes iris.csv has no header row (a header line would be
# read as a data row) — confirm against the file.
iris_data = pd.read_csv("iris.csv", names = ["sepal_l", "sepal_w", "petal_l", "petal_w", "class"])

## Explore the data

In [None]:
iris_data.shape

In [None]:
print(iris_data[:5])

In [None]:
iris_data[:5]

In [None]:
iris_data.describe()

In [None]:
import seaborn as sns
# Plot every pair of numeric columns against each other, coloring the points
# by the value of the "class" column.
pair_plot = sns.pairplot(iris_data, hue="class")


## Separate the data into X (features) and y (target)

In [None]:

# Get X and y
# X: the four measurement columns as a NumPy array
X = iris_data[["sepal_l", "sepal_w", "petal_l", "petal_w"]].values

# Encode the three species names as integer labels 0, 1, 2 (multi-class target).
# Any 'class' value that is not a key of this mapping becomes NaN.
y = iris_data['class'].map({"Iris-setosa":0, 'Iris-versicolor':1, 'Iris-virginica': 2}).values

In [None]:
X[:5]

In [None]:
y

## Split the data into train and test sets

In [None]:
# split dataset into test/train: 30% of the rows are held out for testing
# (test_size=0.3), and random_state is fixed so the split is reproducible
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, test_size=0.3)


In [None]:
X_train.shape, X_test.shape

In [None]:
# Write scalers

def standardize(df):
    """Return ``df`` centered on its mean and divided by its standard deviation.

    ``df`` may be any object that provides ``mean()`` and ``std()`` and
    supports arithmetic with their results (for example a pandas DataFrame
    or a NumPy array). The statistics are whatever those methods return by
    default: pandas computes them per column with the sample standard
    deviation (ddof=1), while NumPy reduces over the whole array with the
    population standard deviation (ddof=0).
    """
    center = df.mean()
    spread = df.std()
    return (df - center) / spread

def normalization(df):
    """Min-max scale ``df`` so its values fall in the range [0, 1].

    Computes ``(df - min) / (max - min)``. ``df`` may be any object that
    provides ``min()`` and ``max()`` and supports arithmetic with their
    results (for example a pandas DataFrame, which is scaled per column, or
    a NumPy array, which is scaled using its overall minimum and maximum).

    Note: when the maximum equals the minimum (constant data) the range is
    zero and the division yields NaN/inf rather than raising.
    """
    lowest = df.min()
    # The parentheses around the range matter: without them the expression
    # divides by the maximum and then subtracts the minimum.
    return (df - lowest) / (df.max() - lowest)



# standardize features using scikit-learn
from sklearn.preprocessing import StandardScaler
stdsc = StandardScaler()

# Fit the scaler on the training set only, then apply that same fitted
# transformation to the test set.
# NOTE(review): the models fitted later in this notebook are trained on
# X_train / X_test, not on these scaled arrays — confirm that is intended.
X_train_std = stdsc.fit_transform(X_train)
X_test_std = stdsc.transform(X_test)

In [None]:
X_train[:5]

In [None]:
X_train_std[:5]

# Algorithms
##  Create and evaluate some models

# REGRESSION

### Linear Regression

In [None]:
# Linear Regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()

# We're reusing the feature matrix "X" (the class labels in y are not involved).
# All columns except the last ("sepal_l", "sepal_w", "petal_l") are the
# features we use to predict "petal_w".
X_lin = X[:,:-1] 
# "petal_w" (the last column of X) is the target we want to predict
y_lin = X[:, -1]

In [None]:
X_lin[:5]

In [None]:
y_lin[:5]

In [None]:
# create a test data point (sample)
# The values are meant to reproduce the first row of the data, so the known
# target value can be compared with the prediction.
new_data_pt = np.array([5.1, 3.5, 1.4]).reshape(1,-1) # reshape(1, -1) turns the 1D array into a 2D array with a single row

# Show the 1D array next to its reshaped 2D version for comparison
np.array([5.1, 3.5, 1.4]), new_data_pt

In [None]:
# fit on the three-feature matrix X_lin, then predict the "petal width"
# (y_lin) for new_data_pt
lin_reg.fit(X_lin, y_lin)
lin_reg.predict(new_data_pt)

# CLASSIFICATION

### Logistic Regression

In [None]:
# multi-class classifier: the target y holds three class labels (0, 1, 2)
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
# predicted class labels for the held-out test samples
log_reg.predict(X_test)

In [None]:
log_reg.score(X_test, y_test)

### Support Vector Machine

In [None]:
from sklearn.svm import SVC
# support vector classifier with a linear kernel
svm = SVC(kernel='linear')
# alternative configuration to try (RBF kernel):
#svm = SVC(kernel='rbf', C=1)
svm.fit(X_train, y_train)
# mean accuracy on the held-out test set
svm.score(X_test, y_test)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# decision tree with scikit-learn's default settings
tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
# mean accuracy on the held-out test set
tree.score(X_test, y_test)

### K-nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# classify each sample from its 5 nearest training samples
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# mean accuracy on the held-out test set
knn.score(X_test, y_test)