In [70]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,roc_auc_score,f1_score,accuracy_score

# 1. Linear Models
## 1.1 OLS

In [2]:
from sklearn import linear_model

In [5]:
housing = datasets.fetch_california_housing()

In [8]:
X = housing['data']
y = housing['target']
X.shape, y.shape

((20640, 8), (20640,))

In [13]:
sum(np.isnan(X))

array([0, 0, 0, 0, 0, 0, 0, 0])

In [14]:
sum(np.isnan(y))

0

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state = 1)

In [18]:
reg = linear_model.LinearRegression()
reg.fit(X_train,y_train)

LinearRegression()

In [25]:
y_pred = reg.predict(X_test)

In [22]:
reg.coef_

array([ 4.41037995e-01,  9.68801816e-03, -1.04781309e-01,  6.22052706e-01,
       -5.61452681e-06, -3.28792384e-03, -4.23182318e-01, -4.37898856e-01])

In [28]:
mse = np.sum(np.square(y_pred-y_test))/y_test.shape[0]
mse

0.5296293151408283

In [30]:
mean_squared_error(y_test,y_pred)

0.5296293151408283

In [31]:
reg.intercept_

-37.285328998751524

In [33]:
reg.get_params()
#positive:True -> non-negative Least Squares

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': False,
 'positive': False}

## 1.2 Ridge Regression

In [36]:
reg_ridge = linear_model.Ridge(alpha = .5)
# the complexity parameter alpha>0 controls the amount of shrinkage: the larger the value of 
# alpha, the greater the amount of sharinkage and thus the coefficients become more robust to collinearity
reg_ridge.fit(X_train,y_train)

Ridge(alpha=0.5)

In [38]:
reg_ridge.coef_

array([ 4.40972418e-01,  9.68921247e-03, -1.04657013e-01,  6.21406484e-01,
       -5.60959323e-06, -3.28777964e-03, -4.23168999e-01, -4.37876064e-01])

In [39]:
y_pred_ridge = reg_ridge.predict(X_test)

In [40]:
mean_squared_error(y_test,y_pred_ridge)

0.5296312211635243

## 1.3 Lasso Regression

In [42]:
reg_lasso = linear_model.Lasso(alpha=0.1)

In [44]:
reg_lasso.fit(X_train,y_train)

Lasso(alpha=0.1)

In [47]:
y_pred_lasso = reg_lasso.predict(X_test)

In [48]:
mean_squared_error(y_test,y_pred_lasso)

0.6052188834351544

In [49]:
reg_lasso.coef_

array([ 3.95882259e-01,  1.54117664e-02, -0.00000000e+00,  0.00000000e+00,
        1.57546927e-05, -3.06888971e-03, -1.13744058e-01, -1.00746624e-01])

## 1.4 Logistic Regression

In [51]:
breast = datasets.load_breast_cancer()
X = breast['data']
y = breast['target']
X.shape, y.shape

((569, 30), (569,))

In [64]:
X[0]

array([1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
       3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
       8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
       3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
       1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01])

In [61]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state = 1)

In [53]:
np.sum(np.isnan(X)),np.sum(np.isnan(y))

(0, 0)

In [58]:
#check if unbalanced data
unique, counts = np.unique(y,return_counts=True)
dict(zip(unique,counts)),counts/sum(counts)


({0: 212, 1: 357}, array([0.37258348, 0.62741652]))

In [59]:
#or
from collections import Counter
counter = Counter(y)
counter

Counter({0: 212, 1: 357})

In [68]:
lr = linear_model.LogisticRegression(max_iter=10000)
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
# penalty:
# None: no penalty is added;
# 'l2': add a L2 penalty term and it is the default choice;
# 'l1': add a L1 penalty term;
# 'elasticnet': both L1 and L2 penalty terms are added.

In [69]:
lr.fit(X_train,y_train)

LogisticRegression(max_iter=10000)

In [72]:
y_pred_lr = lr.predict(X_test)
accuracy_score(y_test,y_pred_lr),roc_auc_score(y_test,y_pred_lr),f1_score(y_test,y_pred_lr)

(0.9473684210526315, 0.9384920634920635, 0.9589041095890412)

# 2. Tree-based Models
## 2.1 Desicion Tree

In [74]:
from sklearn import tree

In [76]:
iris = datasets.load_iris()
X,y = iris.data,iris.target
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state = 1)

In [77]:
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train,y_train)

In [78]:
pred = clf.predict(X_test)

In [81]:
pred

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       2, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 1, 2,
       1])

In [82]:
y_test

array([0, 1, 1, 0, 2, 1, 2, 0, 0, 2, 1, 0, 2, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 2, 1, 0, 0, 1, 2, 1, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 0, 2, 2,
       1])

In [83]:
accuracy_score(y_test,pred),f1_score(y_test,pred,average='micro')

(0.9555555555555556, 0.9555555555555556)