# Scikit-learn tutorial

https://www.youtube.com/watch?v=0B5eIE_1vpU 

## 1. Scikit-learn

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

# Load the California housing data as a (features, target) pair.
X, y = fetch_california_housing(return_X_y=True)

# Fit a default k-nearest-neighbours regressor and predict on the very same
# rows it was trained on (this first example uses no hold-out split).
model = KNeighborsRegressor().fit(X, y)
pred = model.predict(X)

# Predicted values on the x-axis against the true targets on the y-axis;
# the very low alpha keeps dense regions readable.
sns.set()
plt.scatter(x=pred, y=y, alpha=0.01)
plt.show()

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

# Fit an ordinary linear regression on X, y (expected to be in scope from an
# earlier cell) and predict on the same rows.
model = LinearRegression()
model.fit(X, y)
pred = model.predict(X)

# Predictions go on the x-axis and true targets on the y-axis, the same
# orientation as the k-nearest-neighbours plots, so the figures can be
# compared directly.
sns.set()
plt.scatter(pred, y, alpha=0.01)
plt.show()

In [None]:
import seaborn as sns 

# Load seaborn's 'penguins' example dataset into `df`.
df = sns.load_dataset('penguins')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Reload the housing data and hold out a test split. No seed is passed to
# train_test_split, so the split differs from run to run.
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Standardise the features before the distance-based regressor, then fit the
# whole pipeline on the training rows only.
model = make_pipeline(StandardScaler(), KNeighborsRegressor()).fit(X_train, y_train)

# Evaluate on the held-out rows.
pred = model.predict(X_test)

# Predicted values (x-axis) against true targets (y-axis) for the test split.
sns.set()
plt.scatter(x=pred, y=y_test, alpha=0.05)
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Candidate neighbour counts 1..10 for the pipeline's KNN step; the
# 'kneighborsregressor__' prefix addresses that step's parameter.
knn_grid = {'kneighborsregressor__n_neighbors': range(1, 11)}

# Evaluate every candidate with 3-fold cross-validation on the training split.
cv = GridSearchCV(estimator=model, param_grid=knn_grid, cv=3)
cv.fit(X_train, y_train)

# Show the per-candidate results as a table.
pd.DataFrame(cv.cv_results_)

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the full dataset bundle (not just X, y) and print its description.
california_housing = fetch_california_housing()
print(california_housing.DESCR)

## 2. Preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
# load dataset: two feature columns plus a boolean target (True where z is 'a')
df = pd.read_csv('data/preprocessing/drawndata1.csv')
X = df.loc[:, ['x', 'y']]
y = df['z'].eq('a')

# plot the raw points, coloured by class
plt.scatter(x=X['x'], y=X['y'], c=y, cmap='RdYlBu', alpha=0.5)
plt.show()

In [None]:
# standardise
# Rescale both feature columns with StandardScaler and redraw the points,
# coloured by class, to see how the scaling changes the picture.
X_new = StandardScaler().fit_transform(X)
plt.scatter(
    X_new[:, 0], X_new[:, 1],
    c=y,
    cmap='RdYlBu',
    alpha=0.5,
)
# Render explicitly, as the other plotting cells do; without this the figure
# is never shown when the code runs as a script.
plt.show()

In [None]:
# generate data: 1000 exponential draws (scale 10) plus standard-normal noise
n_samples = 1000
skewed = np.random.exponential(scale=10, size=n_samples)
noise = np.random.normal(loc=0, scale=1, size=n_samples)
x = skewed + noise

# Histogram (30 bins) of the standardised values.
standardised = (x - x.mean()) / x.std()
plt.hist(standardised, bins=30)
plt.show()

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Transform both features with a default QuantileTransformer and plot the
# result, coloured by class.
quantile_scaler = QuantileTransformer()
X_new = quantile_scaler.fit_transform(X)
plt.scatter(x=X_new[:, 0], y=X_new[:, 1], c=y, cmap='RdYlBu', alpha=0.5)
plt.show()

In [None]:
# Load the second hand-drawn dataset: two features plus a boolean target
# (True where z is 'a'), then plot the raw points coloured by class.
df = pd.read_csv('data/preprocessing/drawndata2.csv')
X = df.loc[:, ['x', 'y']]
y = df['z'].eq('a')
plt.scatter(x=X['x'], y=X['y'], c=y, cmap='RdYlBu', alpha=0.5)
plt.show()

In [None]:
# Quantile-transform the features, then classify with logistic regression.
pipe = make_pipeline(QuantileTransformer(), LogisticRegression())

# NOTE: despite its name, `pred` is the fitted pipeline (fit returns the
# estimator itself), not an array of predictions.
pred = pipe.fit(X, y)

# Colour each original point by the class the fitted pipeline assigns to it.
plt.scatter(x=X['x'], y=X['y'], c=pred.predict(X), cmap='RdYlBu', alpha=0.5)
plt.show()

In [None]:
# Expand the features with default PolynomialFeatures, then classify with
# logistic regression.
pipe = make_pipeline(PolynomialFeatures(), LogisticRegression())

# NOTE: despite its name, `pred` is the fitted pipeline (fit returns the
# estimator itself), not an array of predictions.
pred = pipe.fit(X, y)

# Colour each original point by the class the fitted pipeline assigns to it.
plt.scatter(x=X['x'], y=X['y'], c=pred.predict(X), cmap='RdYlBu', alpha=0.5)
plt.show()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Dense-output one-hot encoder that ignores categories unseen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name, so try the current spelling first and fall back for older releases.
try:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [None]:
# One categorical column with three distinct levels, as a (4, 1) array.
levels = ['low', 'low', 'high', 'medium']
arr = np.array([[level] for level in levels])

# Learn the categories and encode the same column in one step.
enc.fit_transform(arr)

In [None]:
# Encode a category that was not present when the encoder was fitted; since
# `enc` was created with handle_unknown='ignore' this should not raise.
enc.transform([['zero']])

## 3. Metrics