# FillNA

In [None]:
# replacing null values with mode of the column
def replace_nan(df):
    """Fill missing values of every column of ``df`` with that column's mode.

    The frame is modified in place and nothing is returned. Columns without
    missing values are left untouched. Columns that contain only missing
    values are skipped, because they have no mode to fill with.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are filled in place.
    """
    for column in df.columns:
        if not df[column].isna().any():
            continue
        modes = df[column].mode()
        if modes.empty:
            # Every value is missing: mode() is empty and indexing it would raise.
            continue
        # When several values tie, mode() is sorted and the first one is used.
        df[column] = df[column].fillna(modes.iloc[0])
# Fill the missing values of the working frame `data` (the function assigns the filled columns back).
replace_nan(data)

# Outliers

In [None]:
# Checking outliers: one horizontal box plot per column of `data`,
# skipping the first column (iloc[:, 1:]).
fig, ax = plt.subplots(figsize=(25, 10))
sns.boxplot(data=data.iloc[:,1:], orient="h", palette="Set2", ax=ax)

In [None]:
# Flag MonthlyMinutes outliers with IQR-based fences and plot them on top of the series.
quart1, quart2 = data['MonthlyMinutes'].quantile([0.25,0.75])
iqr = quart2 - quart1

# NOTE(review): the fences are asymmetric (1.5 * IQR below, 8 * IQR above) — kept as found, confirm it is intended.
lowerBound = quart1 - (1.5*iqr)
upperBound = quart2 + (8*iqr)

# 1 = outside the fences, 0 = inside. The cast must wrap the whole OR expression,
# not only the second comparison.
data['Anomalies'] = ((data['MonthlyMinutes']>upperBound) | (data['MonthlyMinutes']<lowerBound)).astype('int')

anomaly = data[data['Anomalies'] == 1]
_ = plt.figure(figsize=(15,5))
_ = plt.plot(data['MonthlyMinutes'], color='blue', label='Normal')
_ = plt.plot(anomaly['MonthlyMinutes'], linestyle='none', marker='X', color='red', label='Anomaly')
# The labels above are only displayed once a legend is requested.
_ = plt.legend()

# Correlation

In [None]:
# Correlation check: pairwise correlation of the columns of `data`, shown as a table with a blue gradient.
# NOTE(review): recent pandas versions raise on non-numeric columns unless numeric_only=True is passed —
# confirm `data` is all-numeric at this point.
data.corr().style.background_gradient(cmap="Blues")

In [None]:
# Display the columns whose absolute correlation with some other column exceeds 0.80.
# NOTE(review): this cell reads `data_1` while the neighbouring cells use `data` — confirm the frame name.
corr = data_1.corr().abs()
# Keep only the strict upper triangle so each pair is inspected once and the diagonal is ignored.
# `np.bool` was removed from NumPy; the builtin `bool` is the supported spelling.
upper_tri = corr.where(np.triu(np.ones(corr.shape),k=1).astype(bool))
corr_columns = [column for column in upper_tri.columns if (upper_tri[column] > 0.80).any()]
upper_tri[corr_columns][upper_tri[corr_columns] > 0.80]

# Stratified Sampling Data 

In [None]:
# Alternative (disabled): draw the same number of rows `n` from every group instead of a fraction.
#n = min(1000000, data['EVENT_COLUMN'].value_counts().min()) # number of rows to draw from each group
#data = data.groupby('EVENT_COLUMN').apply(lambda x: x.sample(n=n, random_state=1))
# Active variant: sample 20% of the rows of each EVENT_COLUMN group with a fixed seed.
data = data.groupby('EVENT_COLUMN').apply(lambda x: x.sample(frac=0.2, random_state=1))
# Drop the group key level added by groupby/apply, then renumber the rows from 0.
data = data.droplevel(0)
data = data.reset_index(drop=True)

# XGBSE

## Forecasting Function

In [1]:
def forecasting(model, test_data, pred_duration, conditional_after=True):
    """Predict with ``model`` and optionally rescale each row by a reference column.

    Parameters
    ----------
    model : object
        Must expose ``predict(test_data, return_ci=True)`` returning a
        ``(predictions, upper_ci, lower_ci)`` triple whose first element is a
        pandas DataFrame (presumably an XGBSE estimator — confirm).
    test_data : object
        Passed to ``model.predict`` unchanged.
    pred_duration : scalar or array-like
        Per-row label of the prediction column to divide by. Negative values
        are replaced by ``1``. Only used when ``conditional_after`` is true.
    conditional_after : bool, default True
        When false, the raw predictions are returned.

    Returns
    -------
    pandas.DataFrame
        Raw predictions, or each row divided by its value in the column named
        by ``pred_duration`` and capped at ``1.0``.

    Raises
    ------
    KeyError
        If a duration does not match any prediction column.
    """
    # The confidence bounds are requested as before but are not used here.
    pred_df, upper_ci, lower_ci = model.predict(test_data, return_ci = True)
    if not conditional_after:
        return pred_df

    # Assigning as a column lets pandas broadcast/align pred_duration; pop()
    # takes it straight back out so it never takes part in the division.
    pred_df['PRED_DURATION'] = pred_duration
    durations = pred_df.pop('PRED_DURATION')
    # Explicit replacement instead of chained `df[col].loc[mask] = 1`, which
    # may silently write to a temporary copy.
    durations = durations.mask(durations < 0, 1)

    positions = pred_df.columns.get_indexer(durations.to_numpy())
    if (positions < 0).any():
        raise KeyError("PRED_DURATION contains values that are not prediction columns")

    # Pick, for every row, the value stored in that row's duration column.
    values = pred_df.to_numpy()
    divisors = values[range(len(values)), positions]

    pred_df = pred_df.div(divisors, axis=0)
    # Ratios above 1.0 are capped at 1.
    return pred_df.clip(upper=1.0)

# Logger

In [None]:
import os
import logging
from logging.handlers import TimedRotatingFileHandler
from pathlib import Path

def setup_custom_logger(project_folder="logs"):

    """
    Create (or fetch) a logger that writes to a time-rotated ``analytic.log``.

    The log folder is ``<base_dir>/<project_folder>``, where ``base_dir`` is
    two directory levels above this file. When ``__file__`` is not defined
    (interactive sessions / notebooks) the current working directory is used
    instead. Calling the function again with the same ``project_folder``
    returns the same logger without attaching a second handler, so records
    are not written twice.

    Parameters
    ----------
    project_folder : str
        Folder (relative to ``base_dir``, or absolute) that receives the log
        file; it is also used as the logger name.

    Returns
    -------
    logger : logging.Logger
        logger object at INFO level with one rotating file handler
    """

    # create folder path and file path
    try:
        base_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    except NameError:
        # __file__ does not exist when this cell runs inside a notebook
        base_dir = os.getcwd()
    folder_path = os.path.join(base_dir, project_folder)
    # absolute form so it can be compared with handler.baseFilename below
    file_path = os.path.abspath(os.path.join(folder_path, "analytic.log"))

    # create folder if not exists
    Path(folder_path).mkdir(parents=True, exist_ok=True)

    # create logger object
    logger = logging.getLogger(project_folder)
    logger.setLevel(logging.INFO)

    # logging.getLogger returns the same object per name, so a repeated call
    # must not stack another handler on top of the existing one
    already_attached = any(
        isinstance(handler, TimedRotatingFileHandler)
        and handler.baseFilename == file_path
        for handler in logger.handlers
    )
    if not already_attached:
        # create formatter
        formatter = logging.Formatter(fmt="{asctime} {levelname:5} {filename}:{funcName}:{lineno} - {message}", style="{")

        # create rotating file handler: rotate every 30 days, keep 6 backups
        rotating_file_handler = TimedRotatingFileHandler(filename=file_path, when='D', interval=30, backupCount=6)
        rotating_file_handler.setFormatter(formatter)

        # add rotating file handler to logger
        logger.addHandler(rotating_file_handler)

    return logger


In [None]:
# Build the logger once, before its first use; log files go to the "logs" folder.
logger = setup_custom_logger(project_folder="logs") # define it before usage
# Afterwards, write messages through the returned logger object.
logger.info("Execution Process Started")


# Split Data Into Chunks

In [None]:
def split_into_chunks(df, chunk_size = 10000):
    """Cut ``df`` into consecutive slices of at most ``chunk_size`` rows.

    Parameters
    ----------
    df : sliceable object with a length (e.g. a DataFrame)
    chunk_size : int, default 10000
        Maximum number of rows per slice.

    Returns
    -------
    list
        The slices in their original order; the last one may be shorter.
    """
    total_rows = len(df)
    # Ceiling division: one extra chunk when the rows do not divide evenly.
    chunk_count = -(-total_rows // chunk_size)
    starts = (position * chunk_size for position in range(chunk_count))
    return [df[start:start + chunk_size] for start in starts]
#pd.concat(chunks, ignore_index=True) # if you want to concat chunks

# Use Apply on Multiple Columns

In [1]:
def quality(total_bill, tip):
    """Label a tip as 'Generous' when it exceeds 25% of the bill, otherwise 'Other'."""
    tip_ratio = tip/total_bill
    return 'Generous' if tip_ratio > 0.25 else 'Other'

In [None]:
# Row-wise apply: each row is handed to the lambda, which forwards the two columns to quality().
df['Tip Quality'] = df[['total_bill', 'tip']].apply(lambda df: quality(df['total_bill'], df['tip']), axis=1)

# OR

# Wrap quality() with np.vectorize and call it on the two columns directly.
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])

# Both do the same; np.vectorize is faster here, although np.vectorize is not built for performance.

# Use Apply with args

In [None]:
def pred_manipulation(row, pred_duration):
    """Drop the entries equal to 1.0 from ``row`` and zero-pad the rest.

    Parameters
    ----------
    row : object with ``tolist()`` (e.g. a pandas Series)
        One row of predicted probabilities.
    pred_duration : int
        Minimum length of the returned list.

    Returns
    -------
    list
        The values of ``row`` that differ from 1.0, in order, followed by as
        many ``0`` entries as needed to reach ``pred_duration`` items.
    """
    kept = [prob for prob in row.tolist() if prob != 1.0]
    # Pad only when the filtered row is SHORTER than the target length. The
    # previous `>` test produced a negative count, so nothing was ever added.
    missing = pred_duration - len(kept)
    if missing > 0:
        kept.extend([0] * missing)
    return kept

In [None]:
# swifter is used to run the row-wise apply faster.
# `args` must be a tuple: the function takes one extra parameter (pred_duration), which is supplied
#   as args=(value,) — a one-element tuple, so the trailing comma is required.
# .tolist() belongs to the result of apply(), not to the args tuple.
pred_df = pd.DataFrame(data=pred_df.swifter.apply(pred_manipulation, axis=1, args=(len(pred_df.columns),)).tolist(),
                       columns=pred_df.columns)

# Find Code Performance Time

In [None]:
import timeit

# Code executed once before each timing run: imports, data load and the function under test.
setup = """
import numpy as np
import pandas as pd
df = pd.read_csv('data.csv')
def quality(total_bill, tip):
    if tip/total_bill > 0.25:
        return 'Generous'
    else:
        return 'Other'
"""
# Candidate 1: row-wise DataFrame.apply.
stmt_one = """
df['Tip Quality'] = df[['total_bill', 'tip']].apply(lambda df: quality(df['total_bill'], df['tip']), axis=1)
"""

# Candidate 2: np.vectorize over the two columns.
stmt_two = """
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])
"""
# Each call returns the total time in seconds for 100 executions of the statement.
# NOTE(review): in a notebook cell only the last expression is echoed, so the first result is not displayed.
timeit.timeit(setup=setup, stmt=stmt_one, number=100)
timeit.timeit(setup=setup, stmt=stmt_two, number=100)

# Useful Methods

In [None]:
# describe: summary statistics formatted as fixed-point strings (no scientific notation)
df['total_bill'].describe().apply(lambda x: format(x, 'f'))

# max() Index Location
# idxmax() returns the index LABEL of the maximum, so the row has to be
# fetched with .loc (label-based), not .iloc (position-based).
df['total_bill'].idxmax()
df.loc[df['total_bill'].idxmax()]

# min() Index Location (same label-based lookup)
df['total_bill'].idxmin()
df.loc[df['total_bill'].idxmin()]

# Multiple Replace: swap several values at once by giving parallel lists (old values, new values)
df['sex'].replace(['Female', 'Male'], ['F', 'M'])
# or: translate the values through a dictionary
dictmap = {'Female' : 'F', 'Male' : 'M'}
df['sex'].map(dictmap)

# Between Method: rows whose total_bill lies in [10, 20], both ends included.
# The boolean form inclusive=True was removed in pandas 2.0; 'both' is the replacement.
df[df['total_bill'].between(10, 20, inclusive='both')]

# nlargest/nsmallest
# The two statements of each pair give the same output; nlargest/nsmallest is the more powerful form.
# (They are separate statements: joining them with `|` would apply an element-wise OR to the two frames.)
df.nlargest(8, 'tip')
df.sort_values('tip', ascending=False).iloc[0:8]

df.nsmallest(8, 'tip')
df.sort_values('tip', ascending=True).iloc[0:8]

# dropna
df.dropna(thresh=3) # keeps the rows that have at least 3 non-null columns
df.dropna(subset=['last_name']) # only drops the rows where last_name is null

# groupby
df.groupby('model_year').describe() # describe() of all columns, per model_year

# NOTE(review): recent pandas versions raise in mean() on non-numeric columns unless numeric_only=True — confirm the column types.
year_cyl = df.groupby(['model_year', 'cylinders']).mean()
year_cyl.index.names # gives the level names (['model_year', 'cylinders'])
year_cyl.index.levels # gives the distinct values of each level, e.g. [[70,71,72,73], [2,3,4,5,6]]
year_cyl.loc[[70,80]] # gives the rows of the model_year groups 70 and 80
year_cyl.xs(key=70, level='model_year') # gives all values of group model_year=70
year_cyl.xs(key=5, level='cylinders') # gives all values of group cylinders=5
year_cyl.swaplevel() # swaps the order of the two index levels

# merge: inner join of the two frames on 'name'
pd.merge(registar, login, how='inner', on='name', suffixes = ('_reg', '_log')) 
# suffixes rename the columns that exist in both frames and are not join keys
# e.g. registar has 'name', 'id' and login has 'name', 'id' --> merged frame has 'name', 'id_reg', 'id_log'

# datetime
euro_date = '10-12-2000'    # day-first notation: 10 December 2000
pd.to_datetime(euro_date)   # gives 2000-10-12, i.e. 12 October 2000 (parsed month-first, American style)
pd.to_datetime(euro_date, dayfirst=True) # gives 2000-12-10, i.e. 10 December 2000




# Visualization

In [None]:
# Pie chart of the value frequencies of one column.
# `explode` and `colors` need one entry per distinct value of the column.
data['column_name'].value_counts().plot(kind='pie',
                                        figsize=(15,8),
                                        autopct='%1.0f%%',  # percentage labels without decimals
                                        explode=[0.04, 0.04, 0.04, 0.04, 0.04], # one entry per distinct element
                                        colors=['pink', 'tomato', 'cornflowerblue', 'orange', 'orchid'],  # 'ping' was not a valid colour name
                                        shadow=True)


# Excel Writer

In [None]:
# Write several frames into one workbook, one sheet per frame.
# The two large tables are split into slices of 900000 rows, each written to its own sheet
# (presumably to stay under the per-sheet row limit of Excel — confirm).
with pd.ExcelWriter('tables.xlsx') as writer:
    t_mp_current_stock.to_excel(writer, sheet_name='current_stock', index=False)
    t_mp_holidays.to_excel(writer, sheet_name='holidays', index=False)
    t_mp_lead_time.to_excel(writer, sheet_name='lead_time', index=False)
    t_mp_main_sales[:900000].to_excel(writer, sheet_name='main_sales', index=False)
    t_mp_main_sales[900000:].to_excel(writer, sheet_name='main_sales_2', index=False)
    t_mp_main_stock_daily[:900000].to_excel(writer, sheet_name='main_stock_daily', index=False)
    t_mp_main_stock_daily[900000:1800000].to_excel(writer, sheet_name='main_stock_daily_2', index=False)
    t_mp_main_stock_daily[1800000:].to_excel(writer, sheet_name='main_stock_daily_3', index=False)

# Date to Monthly Date

In [None]:
# Convert each DATE value to its monthly period via the .dt accessor.
df["DATE_MONTHLY"] = df["DATE"].dt.to_period('M')

# Lag Creation

In [None]:
def lag_creation(df, n_lags=3):
    """Add ``n_lags`` shifted copies of every existing column to ``df``.

    For each original column ``c`` and each ``i`` in ``1..n_lags`` a column
    named ``"<c>_LAG_<i>"`` holding ``df[c].shift(i)`` is added. The frame is
    modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
    n_lags : int, default 3
        Number of lagged copies per column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now including the lag columns.
    """
    # Snapshot the labels first so the lag columns added below are never lagged themselves.
    source_columns = list(df.columns)
    for col in source_columns:
        for i in range(1, n_lags+1):
            # f-string formatting also works for non-string labels (e.g. integers)
            df[f'{col}_LAG_{i}'] = df[col].shift(i)
    return df

# Folder Creation

In [None]:
# Day period: default maximum file age (in days) used by FileOperation.file_date_remove
day = 3

class FileOperation:
    """
    This class helps to create folders, remove old files and load/dump objects with joblib.
    Saving figures is declared (figure_dump) but not implemented yet.
    """
    def __init__(self):
        pass

    def file_date_remove(self, file_name:str, max_age_days=None):
        """Delete ``file_name`` when its last modification is too old.

        Parameters
        ----------
        file_name : str
            Path of the file to check.
        max_age_days : number, optional
            Age threshold in days; defaults to the module-level ``day``.

        To clean several files, collect the paths (e.g. with glob) and call
        this method once per path.
        """
        # local imports keep the method usable even if the notebook cell that imports them was not run
        import os
        import time

        if max_age_days is None:
            max_age_days = day
        # 86400 = seconds per day
        if os.stat(file_name).st_mtime <= time.time() - max_age_days * 86400:
            os.remove(file_name)
            print(f"File removed. File Path: {file_name}")

    def create_folder(self, folder_path: str):
        """Create ``folder_path`` (including parents); do nothing if it already exists."""
        Path(folder_path).mkdir(parents=True, exist_ok=True)

    def joblib_file_load(self, file_name:str):
        """Return the object stored in ``file_name`` by joblib (joblib must be imported by the caller's module)."""
        return joblib.load(filename=file_name)

    def joblib_file_dump(self, data, file_name:str):
        """Store ``data`` in ``file_name`` with joblib (joblib must be imported by the caller's module)."""
        joblib.dump(data, filename=file_name)

    def figure_dump(self, fig, file_name:str):
        """Placeholder: currently does nothing. TODO: implement figure saving."""
        pass

In [None]:
# Files creation
file_operation = FileOperation()
# Create folder for datasets to save them inside later
# NOTE(review): create_folder has no return statement, so `datasets` ends up as None — keep using `folder_path` for the location.
datasets = file_operation.create_folder(f"{folder_path}")

# DB Connections

## MS SQL

In [None]:
from sqlalchemy import create_engine

# Connection URL without user/password in it (presumably relies on integrated authentication — confirm).
# NOTE(review): the SQLAlchemy examples URL-encode the driver name (spaces written as '+'); verify this form connects in your environment.
connection_string = f'mssql+pyodbc://{server_name}/{db_name}?driver=ODBC Driver 17 for SQL Server'
# echo=False disables SQL statement logging; fast_executemany is passed through for the pyodbc driver.
engine = create_engine(connection_string, echo=False, fast_executemany=True)

In [None]:
# Read a whole table into a DataFrame.
# NOTE(review): table_name is interpolated directly into the SQL text — only use trusted values here.
query = f"""
    SELECT * FROM {table_name}
"""
df = pd.read_sql(query, engine)

# Model Functions

## LSTM

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Activation, Bidirectional, Dropout

In [None]:
class LstmModelling:
    """Build, train and evaluate a Keras LSTM network on pre-split data.

    Parameters
    ----------
    x_train, y_train
        Training inputs and targets passed to ``model.fit``.
    x_test, y_test
        Used as validation data during training and for the final evaluation.
    epochs, batch_size, verbose
        Forwarded unchanged to ``model.fit``.

    Typical use: ``LstmModelling(...).lstm_model(n_steps, n_features, [], 0.2)``,
    which builds the network, trains it, prints the test loss and returns the
    fitted model.
    """
    def __init__(self, x_train, y_train, x_test, y_test, epochs, batch_size, verbose):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose

    def lstm_fit(self):
        """Train ``self.model`` and return the result of ``lstm_evaluate_model``.

        ``self.model`` must already exist (it is created by ``lstm_model``).
        """
        # Targets are y_train; the inputs were previously passed twice by mistake.
        self.model.fit(self.x_train, self.y_train,
                       epochs=self.epochs,
                       batch_size=self.batch_size,
                       verbose=self.verbose,
                       validation_data=(self.x_test, self.y_test))
        return self.lstm_evaluate_model()

    def lstm_model(self, n_steps, n_features, add_lstm_units, dropout, lstm_units=32, predict_value_num=1,
                activation_function='relu', loss='mean_squared_error', optimizer='adam', bidirectional=False):
        """Assemble and compile the network, then train it via ``lstm_fit``.

        Parameters
        ----------
        n_steps, n_features : int
            Input shape: number of time steps and number of features
            (X_train.shape[1], X_train.shape[2]).
        add_lstm_units : sequence of int
            Unit counts of additional stacked LSTM layers; empty (or None) for none.
        dropout : float
            Dropout rate of the first LSTM layer; when non-zero, a Dropout
            layer with the same rate follows every additional LSTM layer.
        lstm_units : int
            Number of units of the first LSTM layer.
        predict_value_num : int
            Size of the final Dense layer.
        activation_function, loss, optimizer
            Forwarded to the first LSTM layer / ``model.compile``.
        bidirectional : bool
            Wrap the first LSTM layer in ``Bidirectional`` when true.

        Returns
        -------
        The fitted model (the return value of ``lstm_fit``).
        """
        first_layer = LSTM(lstm_units, # the number of LSTM units in the hidden layer
                           activation=activation_function, # activation function
                           input_shape=(n_steps, n_features), # #of time steps and #of features
                           dropout=dropout,
                           return_sequences=True) # True -> many-to-many

        self.model = Sequential()
        # The wrapper is applied only when requested (the two branches used to be swapped).
        self.model.add(Bidirectional(first_layer) if bidirectional else first_layer)

        for units in add_lstm_units or ():
            self.model.add(LSTM(units,
                        return_sequences=True))
            # dropout is a rate (float), so test its value rather than its length
            if dropout:
                self.model.add(Dropout(dropout)) # dropping out units helps generalization and reduces overfitting
        self.model.add(Dense(predict_value_num))
        self.model.compile(loss=loss, optimizer=optimizer)

        return self.lstm_fit()

    def lstm_evaluate_model(self):
        """Evaluate on the test split, print the score and return ``self.model``."""
        # With no extra metrics compiled, evaluate() returns the loss
        # (MSE with the default loss of lstm_model).
        test_score = self.model.evaluate(self.x_test, self.y_test)
        print("LSTM Model Score: {:.2f}".format(test_score))
        return self.model


## PyCaret

In [None]:
# All column names except the target 'Churn' (remove() raises ValueError if 'Churn' is absent).
col = df.columns.tolist() 
col.remove('Churn')

In [None]:
# Configure the experiment: target column, fixed session seed, shuffled folds,
# every non-target column declared numeric, iterative imputation, removal of
# features correlated above 0.95 and class-imbalance fixing.
# NOTE(review): `setup` is not imported in the visible cells — presumably pycaret.classification.setup; confirm.
s = setup(data=df,
          target='Churn',
          session_id=123,
          fold_shuffle=True,
          numeric_features=col,
          imputation_type='iterative',
          remove_multicollinearity=True,
          multicollinearity_threshold=0.95,
          fix_imbalance=True)

In [None]:
# Compare the candidate models of the experiment configured by setup().
compare_models()

In [None]:
# Create the 'lightgbm' model. NOTE: the variable name equals the lightgbm package name (the package is imported as `lgb` below).
lightgbm = create_model('lightgbm')

In [None]:
# Tune the hyper-parameters of the model created above.
lightgbm_tuned = tune_model(lightgbm)

## LightGBM

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score, log_loss
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
from optuna.integration import LightGBMPruningCallback
import optuna

### Optuna

In [None]:
def objective(trial, X, y):
    """Optuna objective: mean 5-fold log loss of an LGBMClassifier.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the hyper-parameter suggestions and receives pruning reports.
    X : pandas.DataFrame
        Feature matrix (rows are selected positionally with ``.iloc``).
    y : pandas.Series or array-like
        Binary target aligned with ``X`` by position.

    Returns
    -------
    float
        Mean log loss over the stratified folds (lower is better).
    """
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        # fixed large tree budget; early stopping decides the effective number
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }

    n_splits = 5
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1121218)

    def take(target, positions):
        # cv.split yields POSITIONS; a Series must be sliced with .iloc so that
        # a non-default index does not select the wrong rows.
        return target.iloc[positions] if hasattr(target, "iloc") else target[positions]

    cv_scores = np.empty(n_splits)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = take(y, train_idx), take(y, test_idx)

        model = lgb.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_logloss",
            callbacks=[
                # callback form of early stopping; the `early_stopping_rounds`
                # fit argument is no longer accepted by recent LightGBM releases
                lgb.early_stopping(stopping_rounds=100),
                LightGBMPruningCallback(trial, "binary_logloss"),
            ],  # Add a pruning callback
        )
        preds = model.predict_proba(X_test)
        cv_scores[idx] = log_loss(y_test, preds)

    return np.mean(cv_scores)

In [None]:
# Minimise the value returned by objective() (mean log loss) over 20 trials.
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
# Bind the data so that Optuna can call the objective with the trial only.
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=20)

In [None]:
# Report the best objective value and the hyper-parameters that produced it.
# The objective returns the mean log loss, so the label says so (it used to say "rmse").
print(f"\tBest value (log loss): {study.best_value:.5f}")
print("\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

## CatBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
import catboost as cb
from catboost import CatBoostClassifier, Pool
import optuna

### Optuna

In [None]:
# Features: every column of df_3 except the target; target: the 'Churn' column.
X = df_3.drop('Churn', axis=1)
y = df_3['Churn']

In [None]:
# 80/20 shuffled split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.2,
                                                    random_state = 42,
                                                    shuffle=True,
                                                    stratify=y)

In [None]:
# Training pool built from the TRAINING split only. Building it from the full
# X / y would let the models see the rows of X_test that they are evaluated on.
X_train_cat = Pool(X_train, y_train, feature_names=list(X_train.columns))

In [None]:
def objective(trial):
    """Optuna objective: F1 score on the hold-out split of a CatBoost classifier.

    Reads the module-level ``X_train_cat`` (training pool) and
    ``X_test`` / ``y_test`` (evaluation split).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the hyper-parameter suggestions.

    Returns
    -------
    float
        F1 score of the predictions on ``X_test`` (higher is better).
    """

    param = {
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        # log-uniform sampling; suggest_loguniform is deprecated in favour of suggest_float(log=True)
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        #     "used_ram_limit": "3gb",
    }

    # These parameters are only suggested for the bootstrap type they belong to.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    gbm = cb.CatBoostClassifier(**param)

    # NOTE(review): the same hold-out split drives early stopping and the score — confirm this is acceptable.
    gbm.fit(X_train_cat, eval_set=[
            (X_test, y_test)], verbose=0, early_stopping_rounds=50)

    preds = gbm.predict(X_test)
    pred_labels = np.rint(preds)
    f1 = f1_score(y_test, pred_labels)
    return f1


# Maximise the F1 score returned by objective() over 40 sequential trials (n_jobs=1).
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=40, n_jobs=1)

In [None]:
# Summary of the study: number of trials, best score and the parameters of the best trial.
# NOTE(review): len(study.trials) counts every trial stored in the study — confirm that all of them completed.
print("Number of completed trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("\tBest Score: {}".format(trial.value))
print("\tBest Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

In [None]:
# Hard-coded hyper-parameters for the final model
# (presumably copied from the best trial printed above — confirm; `trial.params` holds the live values).
params = {
    'objective': 'CrossEntropy',
    'colsample_bylevel': 0.06108670760673487,
    'depth': 10,
    'boosting_type': 'Plain',
    'random_strength': 51,
    'learning_rate': 0.1698330158794727,
    'bootstrap_type': 'MVS'
}

In [None]:
# Train the final classifier with the chosen parameters on the training pool (progress output enabled).
model = CatBoostClassifier(**params, verbose = True)
model.fit(X_train_cat)

In [None]:
# Score the final model on the hold-out split and collect the metrics in a one-row table.
preds = model.predict(X_test)
pred_labels = np.rint(preds)
accuracy = accuracy_score(y_test, pred_labels)
precision = precision_score(y_test, pred_labels)
recall = recall_score(y_test, pred_labels)
f1 = f1_score(y_test, pred_labels)
row = {'Model': 'CatBoostClassifier',
        'Accuracy': round(accuracy,3),
        'Precision': round(precision,3),
        'Recall': round(recall,3),
        'F1': round(f1,3),
        'ModelParameters':model.get_all_params()
    }

# DataFrame.append was removed in pandas 2.0; build the frame from a list of row dicts instead.
df_result = pd.DataFrame([row])
df_result.head()

In [None]:
# sklearn.metrics.plot_confusion_matrix was removed from scikit-learn; ConfusionMatrixDisplay replaces it.
# from_predictions only needs the labels, so it does not depend on how the estimator is implemented.
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_predictions(y_test, np.ravel(model.predict(X_test)), xticks_rotation='vertical')
plt.show()

In [None]:
# Hyper-parameter importances estimated from the trials of `study`.
optuna.visualization.plot_param_importances(study)

In [None]:
# Objective value of every trial of `study`, in trial order.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Slice plot restricted to the three listed hyper-parameters.
optuna.visualization.plot_slice(study, params=['depth', 'learning_rate', 'bootstrap_type'])

### SHAP

In [None]:
import shap
# Load the JavaScript needed by the interactive SHAP plots in the notebook.
shap.initjs()
# Tree-specific explainer for the fitted `model`.
explainer = shap.TreeExplainer(model)

In [None]:
# SHAP values for the rows held by the X_train_cat pool.
shap_values = explainer.shap_values(X_train_cat)

In [None]:
# Bar chart of the features by SHAP value.
# NOTE(review): shap_values must have one row per row of X_train — verify that X_train_cat holds exactly the X_train rows.
shap.summary_plot(shap_values, X_train, plot_type="bar")

# Blending

In [None]:
# Weighted blend of the positive-class probabilities (column 1) of the three models.
blended_submission = clean_test_df[["BASE_CUSTOMER_ID"]].rename(columns={"BASE_CUSTOMER_ID":"Id"})

xgb_pred = pd.DataFrame()
xgb_pred["Expected"] = predictions_test_xgb[:,1]
lgb_pred = pd.DataFrame()
lgb_pred["Expected"] = predictions_test_lgb[:,1]
cat_pred = pd.DataFrame()
cat_pred["Expected"] = predictions_test_cat[:,1]

# Weights: LightGBM 0.5, XGBoost 0.4, CatBoost 0.1 (they sum to 1).
# The blend carries a fresh 0..n-1 index while blended_submission keeps the
# index of clean_test_df; assigning the raw values matches rows by POSITION,
# so a filtered or re-ordered test frame no longer yields NaN or misaligned scores.
blended_submission["Expected"] = (lgb_pred["Expected"] * 0.5 + xgb_pred["Expected"] * 0.4 + cat_pred["Expected"] * 0.1).to_numpy()