# Scikit-learn tutorial

Reference video: https://www.youtube.com/watch?v=0B5eIE_1vpU

## 1. Scikit-learn

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

# Pull the California housing data as a (features, target) pair.
X, y = fetch_california_housing(return_X_y=True)

# Train a default k-nearest-neighbours regressor, then predict on the very
# same rows it was trained on (this first example uses no hold-out split).
model = KNeighborsRegressor().fit(X, y)
pred = model.predict(X)

# Predicted values on the horizontal axis, true targets on the vertical axis;
# the very low alpha keeps the dense regions of the cloud readable.
sns.set()
plt.scatter(pred, y, alpha=0.01)
plt.show()

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import seaborn as sns

# Fit an ordinary least-squares baseline on the X, y loaded in the previous
# cell, again predicting on the training rows themselves.
model = LinearRegression()
model.fit(X, y)
pred = model.predict(X)

sns.set()
# Predictions on the horizontal axis and actual targets on the vertical axis,
# the same orientation as the other scatter plots in this section, so the
# figures can be compared directly.
plt.scatter(pred, y, alpha=0.01)
plt.xlabel('predicted')
plt.ylabel('actual')
plt.show()

In [None]:
import seaborn as sns 

# Load the 'penguins' example dataset through seaborn.
# NOTE(review): `df` is not referenced again before it is reassigned in the
# Preprocessing section — confirm this cell is still needed.
df = sns.load_dataset('penguins')

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data and hold out a test set. The seed is fixed so the split, and
# everything derived from it (the plot below, later cross-validation on
# X_train / y_train), is reproducible across kernel restarts.
X, y = fetch_california_housing(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Scale the features before the distance-based regressor. make_pipeline names
# the steps after their lowercased class names (e.g. 'kneighborsregressor').
model = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor()
)

model.fit(X_train, y_train)

# Evaluate on the rows the model has not seen.
pred = model.predict(X_test)

sns.set()
plt.scatter(pred, y_test, alpha = 0.05)
plt.xlabel('predicted')
plt.ylabel('actual')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Search over the number of neighbours (1 through 10) of the pipeline's
# KNN step, using 3-fold cross-validation on the training split.
neighbor_grid = {'kneighborsregressor__n_neighbors': range(1, 11)}
cv = GridSearchCV(estimator=model, param_grid=neighbor_grid, cv=3)
cv.fit(X_train, y_train)

# Show the per-candidate timing and score table.
pd.DataFrame(cv.cv_results_)

In [None]:
from sklearn.datasets import fetch_california_housing

# Fetch the full dataset bundle and print its human-readable description.
california_housing = fetch_california_housing()
dataset_description = california_housing['DESCR']
print(dataset_description)

## 2. Preprocessing

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
# Apply seaborn's default styling to subsequent matplotlib figures.
sns.set()

In [None]:
# Read the first hand-drawn dataset: two coordinates plus a label column 'z'.
df = pd.read_csv('data/preprocessing/drawndata1.csv')
X = df[['x', 'y']]
# Boolean target: True where the label is 'a'.
y = df['z'].eq('a')

# Raw (unscaled) points, coloured by class.
plt.scatter(X['x'], X['y'], c=y, cmap='RdYlBu', alpha=0.5)
plt.show()

In [None]:
# standardise: rescale each column with StandardScaler, then plot the result
# coloured by class.
X_new = StandardScaler().fit_transform(X)
plt.scatter(
    X_new[:, 0], X_new[:, 1],
    c = y,
    cmap = 'RdYlBu', 
    alpha = 0.5
)
# Render the figure explicitly, as the other plotting cells do; without this
# the cell's last expression echoes the PathCollection repr.
plt.show()

In [None]:
# generate data: 1000 exponential draws (scale 10) plus standard-normal noise.
# A seeded Generator makes the sample, and therefore the histogram,
# reproducible from run to run.
rng = np.random.default_rng(42)
x = rng.exponential(10, 1000) + rng.normal(0, 1, 1000)

# Histogram (30 bins) of the z-scored sample.
plt.hist((x - np.mean(x)) / np.std(x), 30)
plt.show()

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Apply a default QuantileTransformer to both columns and plot the
# transformed points coloured by class.
quantile_scaler = QuantileTransformer()
X_new = quantile_scaler.fit_transform(X)
plt.scatter(X_new[:, 0], X_new[:, 1], c=y, cmap='RdYlBu', alpha=0.5)
plt.show()

In [None]:
# Read the second hand-drawn dataset and rebuild the features / target.
df = pd.read_csv('data/preprocessing/drawndata2.csv')
X = df[['x', 'y']]
# Boolean target: True where the label column 'z' equals 'a'.
y = df['z'].eq('a')

# Raw points coloured by class.
plt.scatter(X['x'], X['y'], c=y, cmap='RdYlBu', alpha=0.5)
plt.show()

In [None]:
# Quantile-transform the features, then fit a logistic regression on top.
pipe = make_pipeline(
    QuantileTransformer(),
    LogisticRegression()
)

# fit() returns the pipeline itself, so chain predict() to make `pred` hold
# the actual predictions, as it does in the earlier cells.
pred = pipe.fit(X, y).predict(X)

# Original coordinates, coloured by the class the pipeline predicts.
plt.scatter(
    X['x'], X['y'],
    c = pred,
    cmap = 'RdYlBu',
    alpha = 0.5
)
plt.show()

In [None]:
# Expand the features with PolynomialFeatures (default settings), then fit a
# logistic regression on the expanded features.
pipe = make_pipeline(
    PolynomialFeatures(),
    LogisticRegression()
)

# fit() returns the pipeline itself, so chain predict() to make `pred` hold
# the actual predictions, as it does in the earlier cells.
pred = pipe.fit(X, y).predict(X)

# Original coordinates, coloured by the class the pipeline predicts.
plt.scatter(
    X['x'], X['y'],
    c = pred,
    cmap = 'RdYlBu',
    alpha = 0.5
)
plt.show()

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Dense-output one-hot encoder that tolerates categories unseen during fit.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the current spelling first and fall back for
# older installations.
try:
    enc = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    # scikit-learn < 1.2 only accepts the original keyword.
    enc = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

In [None]:
# One categorical column with four rows and three distinct levels.
arr = np.array([['low'], ['low'], ['high'], ['medium']])

# Learn the categories and show the encoded matrix.
enc.fit_transform(arr)

In [None]:
# Encode a category ('zero') that was not among the values the encoder was
# fitted on. `enc` was created with handle_unknown = 'ignore'; per the
# scikit-learn docs this should yield an all-zero row rather than an error.
enc.transform([['zero']])

## 3. Metrics

In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [11]:
# load data
# Only the first N_ROWS rows are used, so have the parser stop there instead
# of reading the entire CSV and slicing the result afterwards.
N_ROWS = 80_000
df = pd.read_csv('data/metrics/creditcard.csv', nrows = N_ROWS)

# Features: every column except 'Time', 'Amount' and the 'Class' label.
X = df.drop(columns = ['Time', 'Amount', 'Class'])
y = df['Class']

# Summing the label column counts the rows labelled 1 (reported as fraud).
print(f"fraud: {y.sum()}\nnot fraud: {len(y) - y.sum()}")

fraud: 196
not fraud: 79804


In [19]:
# Two weightings to compare: the default (no class weights) and one that
# counts class 1 twice as heavily as class 0.
weight_options = (None, {0: 1, 1: 2})

# For each weighting, fit on the full data and count how many rows are
# predicted as class 1.
res_0, res_1 = (
    LogisticRegression(class_weight=weights, max_iter=1000)
    .fit(X, y)
    .predict(X)
    .sum()
    for weights in weight_options
)

# Show how much the class_weight parameter shifts the positive predictions.
print(f"res0: {res_0} res1: {res_1}")

res0: 151 res1: 171


In [24]:
# Candidate weightings: class 0 stays at weight 1 while class 1 is tried at
# weights 1, 2 and 3.
weight_candidates = [{0: 1, 1: w} for w in (1, 2, 3)]

# 4-fold grid search over those weightings, run in a single process.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid={'class_weight': weight_candidates},
    cv=4,
    n_jobs=1,
)
grid.fit(X, y)

# Per-candidate timing and score table.
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,0.519905,0.114973,0.003752,0.001201,"{0: 1, 1: 1}","{'class_weight': {0: 1, 1: 1}}",0.99405,0.99835,0.99945,0.9978,0.997413,0.00203,1
1,0.409188,0.114032,0.003123,0.000522,"{0: 1, 1: 2}","{'class_weight': {0: 1, 1: 2}}",0.99025,0.9984,0.9996,0.99805,0.996575,0.003697,2
2,0.578611,0.062658,0.00316,0.000341,"{0: 1, 1: 3}","{'class_weight': {0: 1, 1: 3}}",0.9873,0.99845,0.9996,0.99815,0.995875,0.00498,3
