# Chapter 1 and Chapter 2

### XGBoost

- XGBoost under the hood:
    - uses an ensemble model that combines many weak base CART learners into a strong learner
    - These weak base learners are only slightly better at prediction than pure random chance
    - For each learner, the contribution is calculated as weights
    - To get the final output, the weighted sum of all weak learners determines the model output.
    - (weight1 * model1 + weight2 * model2 + ... = output)
    - Objective function : a way to quantify how far the prediction is from the actual value (another name for the loss function)
    - Goal : Use objective function to produce optimal result
- Advantages:
    - Fast and efficient
    - Core algorithm is parallelizable
    - Consistently outperforms single-algorithm methods
    - State-of-the-art performance in many ML tasks
    - Uses CART (Classification and Regression Tree)
    - Individual decision trees store decision values at their leaves, which is why these trees tend to overfit
    - XGBoost trees contain real-valued scores at their leaves, which are generalized numeric values that can be thresholded and can therefore also be used for classification
    - Can use 2 types of learners:
        1. Linear base learners
        2. Tree-based base learners (non-linear)
- When to use:
    - For > 1000 samples
    - For < 100 features
    - just numeric features or mixture of numeric and categorical features
- When not to use
    - image processing
    - natural language processing

```
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Hold out 20% of the data for testing; random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
# Simple fit-predict (scikit-learn style API)
xg_cl = xgb.XGBClassifier(objective='binary:logistic', n_estimators=10, seed=123) # Classification
# 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
xg_reg = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=10, seed=123) # Regression
# Fit the estimator that matches the task (classifier shown; xg_reg is used the same way for regression)
xg_cl.fit(X_train, y_train)
preds = xg_cl.predict(X_test)

# Cross validation (Method 1 : Using the native xgboost API, which has cv / train / predict instead of sklearn's fit / predict)
from sklearn.metrics import accuracy_score
dmatrix = xgb.DMatrix(data=X_train, label=y_train)
params_clf = {"objective": "binary:logistic", "max_depth": 4}          # Classification parameters
params_reg = {"objective": "reg:squarederror", "booster": "gblinear"}  # Regression parameters with linear base learners
# Regularization parameters: "alpha" for l1, "lambda" for l2, "gamma" for penalty weight for splitting on a node according to tree complexity
# Pick the parameter set that matches the task. The rest of this example is binary classification;
# for regression use params_reg with metrics="rmse" and stratified=False, and skip the rounding / accuracy steps.
params_clf_reg = params_clf
# num_boost_round must be larger than early_stopping_rounds, otherwise early stopping can never trigger
cv_results = xgb.cv(dtrain=dmatrix, params=params_clf_reg, nfold=4, num_boost_round=100,
        metrics="error", as_pandas=True, stratified=True, early_stopping_rounds=10, verbose_eval=1)
# Cross-validated accuracy, if needed: accuracy_cv = 1 - cv_results['test-error-mean'].iloc[-1]
# Train the final model with the best number of boosting rounds
# (with early stopping the returned history ends at the best iteration, so its length is the round count)
best_num_boost_round = len(cv_results)
final_model = xgb.train(params=params_clf_reg, dtrain=dmatrix, num_boost_round=best_num_boost_round)
# Make predictions on the testing dataset (labels are not needed for prediction)
dtest = xgb.DMatrix(X_test) # ,y_test
y_pred_prob = final_model.predict(dtest)  # binary:logistic returns probabilities of the positive class
y_pred_binary = np.round(y_pred_prob)  # Convert probabilities to binary predictions
accuracy_final = accuracy_score(y_test, y_pred_binary)

# Cross validation (Method 2 : Using scikit-learn)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# One accuracy value per fold
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring='accuracy')
# Out-of-fold predictions on the training data. The labels are required: they are used both
# to stratify the folds and to fit the model inside each fold.
y_pred_cv = cross_val_predict(xgb_model, X_train, y_train, cv=cv)
accuracy_cv = accuracy_score(y_train, y_pred_cv)
# Final check on the held-out test set: refit on all training data, then predict once
xgb_model.fit(X_train, y_train)
accuracy_final = accuracy_score(y_test, xgb_model.predict(X_test))

# GridSearch / RandomizedSearch (HYPERPARAMETER TUNING)
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# np.linspace gives exactly 20 evenly spaced values in [0.05, 1.0]; np.arange with a float step
# can drift around the end point, and subsample must not exceed 1
param_grid = {'learning_rate': np.linspace(0.05, 1.0, 20), 'n_estimators': [200], 'subsample': np.linspace(0.05, 1.0, 20)}
gbm = xgb.XGBRegressor()
# Exhaustive search: tries every combination; takes `param_grid` and has no `n_iter`
grid_search = GridSearchCV(estimator=gbm, param_grid=param_grid,
        scoring='neg_mean_squared_error', cv=4, verbose=1)
# Randomized search: samples `n_iter` combinations; takes `param_distributions`
random_search = RandomizedSearchCV(estimator=gbm, param_distributions=param_grid, n_iter=25,
        scoring='neg_mean_squared_error', cv=4, verbose=1)
tuning_models = random_search  # or grid_search - both expose the same fit / best_params_ interface
tuning_models.fit(X, y)
tuning_models.best_params_ # See the parameters that give the best results
# Visualize tree
import os
# plot_tree renders through Graphviz, so its executables must be discoverable on PATH
os.environ["PATH"] += os.pathsep + 'C:/Program Files/Graphviz/bin' ### MAKE SURE TO INSTALL GRAPHVIZ AND ADD THE INSTALLATION PATH
# Pass a fitted tree-based model (e.g. xg_cl after it has been fit)
xgb.plot_tree(xg_cl, num_trees=0) # , rankdir="LR" for aligning tree sideways from left to right

```

# Chapter 3

### Grid Search and Cross Validation

```
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, make_scorer

# Shuffled 5-fold splitter; random_state keeps the folds reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# splits = kf.split(X) # Inspect the splits: each one yields (training indices, validation indices)
# Mean absolute error for each of the 5 folds (raw errors, so lower is better)
mae_scorer = make_scorer(mean_absolute_error)
scores = cross_val_score(your_model, X, y, cv=kf, scoring=mae_scorer)  # array with one MAE per fold
avg_score = np.mean(scores)
# Out-of-fold predictions using the same folds as above:
# every sample is predicted by the model that did not see it during training
predicted_y = cross_val_predict(your_model, X, y, cv=kf) # array with one prediction per sample
avg_predicted_y = np.mean(predicted_y)

### example of ridge regression with grid search with k-fold cross validation
from sklearn.linear_model import Ridge
# np.linspace(start, stop, num) gives 10 evenly spaced alphas.
# np.arange(0.0001, 1, 10) would treat 10 as the step and yield only the single value 0.0001.
param_grid = {"alpha": np.linspace(0.0001, 1, 10), "solver": ["sag", "lsqr"]}
ridge = Ridge()
# Exhaustive search over all alpha / solver combinations
ridge_cv = GridSearchCV(ridge, param_grid, cv=kf)
# Randomized alternative: evaluates only n_iter sampled combinations (fit it the same way as ridge_cv)
ridge_cv2 = RandomizedSearchCV(ridge, param_grid, cv=kf, n_iter=2)
ridge_cv.fit(X_train, y_train)
```

# Chapter 4

### Pipeline

```
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# Separate the label column from the feature matrix, both as NumPy arrays
y = df["target"].values
X = df.drop(columns="target").values
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=21, test_size=0.2
)

# Stages run in the order listed: every stage except the last is a transformer,
# and only the final stage is the model itself
steps = [
    ('scale', StandardScaler()),
    ('knn', KNeighborsClassifier()),
]
# Chain the stages into a single estimator
pipeline = Pipeline(steps=steps)

# Perform cross validation on pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score
# The pipeline ends in a classifier, so score it with a classification metric:
# mean squared error is a regression metric and fails on non-numeric class labels.
custom_scorer = make_scorer(accuracy_score)
scores = cross_val_score(pipeline, X_train, y_train, scoring=custom_scorer, cv=10) # equivalent to scoring="accuracy"
# For a regression pipeline use scoring="neg_mean_squared_error", or
# make_scorer(mean_squared_error, greater_is_better=False), so that "higher is better" still holds.

# Perform gridsearch on pipeline
# Grid keys are written as <step name>__<parameter name>; this one tunes
# n_neighbors of the 'knn' step over the values 1..49
parameters = {"knn__n_neighbors": np.arange(1, 50)}
cv = GridSearchCV(estimator=pipeline, param_grid=parameters)
# Train: fits the pipeline for each candidate value
cv.fit(X_train, y_train)
# Predict on the held-out data with the fitted search object
y_pred = cv.predict(X_test)

### You can break down the pipeline and add the results of each step in the output
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import FeatureUnion

# Transformers whose outputs will be placed side by side
scaler = StandardScaler()
poly_features = PolynomialFeatures(degree=2)
pca = PCA(n_components=2)  # assumes the data has at least 2 features - adjust as needed

# Create a feature union of transformers : allows you to concatenate the results of multiple transformer objects along the second axis
combined_features = FeatureUnion([
    ('scaler', scaler),
    ('poly_features', poly_features),
    ('pca', pca)
])

# Define the classifier
classifier = RandomForestClassifier(random_state=42)

# Create a pipeline with FeatureUnion and the classifier
pipeline = Pipeline([
    ('features', combined_features),
    ('classifier', classifier)
])
```

### One-hot encoding

```
# Binary Encoding: 1 where the value is "y", 0 for anything else (including missing values)
# Vectorized comparison instead of a row-by-row apply(lambda ...)
df["cat_col"] = df["cat_col"].eq("y").astype("int64")

# One-hot-encoding on categorical variable: one indicator column per category, prefixed with "C_"
df_onehot = pd.get_dummies(df, columns=['cat'], prefix='C')
# Dummy encoding: same, but drop_first=True drops the first category so the
# remaining indicator columns are not perfectly collinear
df_dummy = pd.get_dummies(df, columns=['cat'], drop_first=True, prefix='C')

# Alternative approach-2 : scikit-learn OneHotEncoder
from sklearn import preprocessing
encoder = preprocessing.OneHotEncoder()
# Double brackets select a one-column DataFrame: the encoder needs 2-D input,
# and this way it also learns the column name for the output feature names
onehot_transformed = encoder.fit_transform(df[['cat_col']])
# Convert into dataframe. Reusing df's index keeps the rows aligned in the concat below
# even when df does not have a default 0..n-1 index; columns are named after their category.
onehot_df = pd.DataFrame(
    onehot_transformed.toarray(),
    columns=encoder.get_feature_names_out(),
    index=df.index,
)
# Add the encoded columns with original dataset,
df = pd.concat([df, onehot_df], axis=1)
# Drop the original column that you used for encoding
df = df.drop('cat_col', axis=1)

# Label encoding : Turning string labels into integer codes
from sklearn import preprocessing
encoder_lvl = preprocessing.LabelEncoder()
# Fit on the known categories. LabelEncoder sorts its classes, so the codes are alphabetical
# (HIGH=0, LOW=1, NORMAL=2) and do NOT follow the LOW < NORMAL < HIGH ordering.
encoder_lvl.fit(['LOW', 'NORMAL', 'HIGH'])
# Apply label encoding to the third column of the dataset.
# A DataFrame cannot be indexed as df[:, 2] (that only works on NumPy arrays),
# so look the column up by position and replace it.
third_col = df.columns[2]
df[third_col] = encoder_lvl.transform(df[third_col])

# Alternative approach : DictVectorizer
from sklearn.feature_extraction import DictVectorizer

# One dictionary per row, keyed by column name
df_dict = df.to_dict(orient="records")
# sparse=False asks for a dense array rather than a sparse matrix
dv = DictVectorizer(sparse=False)
df_encoded = dv.fit_transform(df_dict)
# Show the first five encoded rows
print(df_encoded[:5])
# vocabulary_ shows which feature name ended up in which column of the encoded matrix
print(dv.vocabulary_)
```