# Sampling Data 

In [None]:
# Stratified down-sampling: keep 20% of the rows of every EVENT_COLUMN group
# (random_state=1 makes the draw reproducible).
#
# Alternative: take the same number of rows from each group instead of a
# fraction (capped at 1,000,000 and at the size of the smallest group):
#   n = min(1000000, data['EVENT_COLUMN'].value_counts().min())
#   data = data.groupby('EVENT_COLUMN').apply(lambda x: x.sample(n=n, random_state=1))
data = (
    data.groupby('EVENT_COLUMN')
    .apply(lambda group: group.sample(frac=0.2, random_state=1))
    .droplevel(0)             # drop the outermost index level of the grouped result
    .reset_index(drop=True)   # discard the remaining index and renumber the rows
)

# XGBSE

## Forecasting Function

In [1]:
def forecasting(model, test_data, pred_duration, conditional_after=True):
    """
    Run ``model.predict`` and optionally rescale each row by its own duration column.

    When ``conditional_after`` is true, every predicted row is divided by the
    value that row holds in the column whose label equals the row's duration,
    and the result is capped at 1. Otherwise the raw predictions are returned.

    Parameters
    ----------
    model : object
        Estimator whose ``predict(test_data, return_ci=True)`` returns a
        ``(predictions, upper_ci, lower_ci)`` triple. ``predictions`` must be
        a pandas DataFrame; its column labels are matched against the
        durations.
    test_data : object
        Passed to ``model.predict`` unchanged.
    pred_duration : scalar or list-like
        One duration per prediction row (aligned exactly as a DataFrame
        column assignment would align it). Negative values are replaced by 1.
        Only used when ``conditional_after`` is true.
    conditional_after : bool, default True
        Whether to apply the per-row rescaling.

    Returns
    -------
    pandas.DataFrame
        Predictions with the same index and columns as returned by the model.

    Raises
    ------
    KeyError
        If a duration does not match any column label of the predictions.
    """
    import numpy as np  # local import so the function stays self-contained

    # The confidence intervals are requested (as before) but not used here.
    pred_df, _upper_ci, _lower_ci = model.predict(test_data, return_ci=True)
    if not conditional_after:
        return pred_df

    # Align the durations with the prediction rows without mutating pred_df,
    # then replace negative durations by 1 with a single, non-chained write.
    durations = pred_df.assign(PRED_DURATION=pred_duration)['PRED_DURATION']
    durations = durations.mask(durations < 0, 1)

    # Position of each row's duration among the prediction columns (-1 = absent).
    column_positions = pred_df.columns.get_indexer(durations)
    unmatched = column_positions < 0
    if unmatched.any():
        raise KeyError(
            "pred_duration values without a matching prediction column: "
            f"{durations[unmatched].unique().tolist()}"
        )

    # Pick, for every row, the value stored in its own duration column.
    # Working by position keeps this correct even with duplicate index labels.
    values = pred_df.to_numpy()
    baseline = values[np.arange(len(values)), column_positions]

    conditional = pred_df.div(baseline, axis=0)

    # Columns before the duration yield ratios above 1; cap them at 1.
    return conditional.clip(upper=1.0)

# Logger

In [None]:
import os
import logging
from logging.handlers import TimedRotatingFileHandler
from pathlib import Path

def setup_custom_logger(project_folder="logs"):

    """
    Create (or fetch) a logger that writes to a time-rotated ``analytic.log``.

    The log folder is ``project_folder`` resolved against the parent of the
    directory that contains this file. When ``__file__`` is not defined
    (interactive session or notebook) the current working directory is used
    in place of the file's directory. Calling the function again with the
    same ``project_folder`` returns the same logger without attaching a
    second handler, so messages are never written twice.

    Parameters
    ----------
    project_folder : str
        Folder that receives ``analytic.log``; it is created if missing.
        The same string is used as the logger's name.

    Returns
    -------
    logger : logging.Logger
        logger object at INFO level with one rotating file handler attached
    """

    # create folder path and file path
    try:
        source_dir = os.path.dirname(os.path.realpath(__file__))
    except NameError:
        # __file__ does not exist in notebooks / interactive sessions
        source_dir = os.getcwd()
    base_dir = os.path.dirname(source_dir)
    folder_path = os.path.join(base_dir, project_folder)
    file_path = os.path.join(folder_path, "analytic.log")

    # create folder if not exists
    Path(folder_path).mkdir(parents=True, exist_ok=True)

    # create logger object (getLogger returns the same object for the same name)
    logger = logging.getLogger(project_folder)
    logger.setLevel(logging.INFO)

    # attach the file handler only once per log file
    target_file = os.path.abspath(file_path)
    already_attached = any(
        isinstance(handler, TimedRotatingFileHandler)
        and handler.baseFilename == target_file
        for handler in logger.handlers
    )
    if not already_attached:
        # create formatter
        formatter = logging.Formatter(fmt="{asctime} {levelname:5} {filename}:{funcName}:{lineno} - {message}", style="{")

        # create rotating file handler: roll over every 30 days, keep 6 old files
        rotating_file_handler = TimedRotatingFileHandler(filename=file_path, when='D', interval=30, backupCount=6)
        rotating_file_handler.setFormatter(formatter)

        # add rotating file handler to logger
        logger.addHandler(rotating_file_handler)

    return logger


In [None]:
# Build the logger once, before the first message, then reuse the same object.
logger = setup_custom_logger("logs")
logger.info("Execution Process Started")


# Split Data Into Chunks

In [None]:
def split_into_chunks(df, chunk_size = 10000):
    """
    Split a sliceable object into consecutive pieces of at most ``chunk_size`` items.

    Parameters
    ----------
    df : sliceable with ``len()``
        Object to split; it is sliced with ``df[start:stop]``.
    chunk_size : int, default 10000
        Maximum length of each piece; must be positive.

    Returns
    -------
    list
        The pieces in order. The last one may be shorter; an empty input
        gives an empty list.

    Raises
    ------
    ValueError
        If ``chunk_size`` is zero or negative (previously zero raised
        ZeroDivisionError and a negative size silently returned no chunks).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    # Stepping the start offset by chunk_size covers the tail chunk automatically.
    return [df[start:start + chunk_size] for start in range(0, len(df), chunk_size)]
#pd.concat(chunks, ignore_index=True) # if you want to concat chunks

# Use Apply on Multiple Columns

In [1]:
def quality(total_bill, tip):
    """
    Label a tip as 'Generous' when it exceeds 25% of the bill, else 'Other'.

    Parameters
    ----------
    total_bill : number
        Bill amount.
    tip : number
        Tip amount.

    Returns
    -------
    str
        'Generous' or 'Other'.
    """
    if total_bill == 0:
        # Avoid ZeroDivisionError for plain Python numbers: any positive tip
        # on a zero bill is an unbounded ratio, anything else is not generous.
        return 'Generous' if tip > 0 else 'Other'
    return 'Generous' if tip / total_bill > 0.25 else 'Other'

In [None]:
# Row-wise: apply(axis=1) hands each row to the lambda, which passes the two cells to quality().
df['Tip Quality'] = df[['total_bill', 'tip']].apply(lambda row: quality(row['total_bill'], row['tip']), axis=1)

# OR

# Column-wise: np.vectorize wraps quality() so it can be called with whole columns.
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])

# Both produce the same column. np.vectorize is usually the faster of the two,
# even though it is not built for performance.

# Find Code Performance Time

In [None]:
import timeit

# Code executed once before each timing run: imports, data load and the helper under test.
setup = """
import numpy as np
import pandas as pd
df = pd.read_csv('data.csv')
def quality(total_bill, tip):
    if tip/total_bill > 0.25:
        return 'Generous'
    else:
        return 'Other'
"""
# Candidate 1: row-wise DataFrame.apply
stmt_one = """
df['Tip Quality'] = df[['total_bill', 'tip']].apply(lambda df: quality(df['total_bill'], df['tip']), axis=1)
"""

# Candidate 2: np.vectorize over the two columns
stmt_two = """
df['Tip Quality'] = np.vectorize(quality)(df['total_bill'], df['tip'])
"""
# timeit returns the total seconds for `number` executions. Keep both results
# and print them; a bare expression that is not the last line of a cell is
# never displayed, so the first measurement used to be lost.
elapsed_apply = timeit.timeit(setup=setup, stmt=stmt_one, number=100)
elapsed_vectorize = timeit.timeit(setup=setup, stmt=stmt_two, number=100)
print(f"DataFrame.apply: {elapsed_apply:.4f} s for 100 runs")
print(f"np.vectorize   : {elapsed_vectorize:.4f} s for 100 runs")

# Useful Methods

In [None]:
# describe: format the summary statistics as fixed-point text instead of scientific notation
df['total_bill'].describe().apply(lambda x: format(x, 'f'))

# max() Index Location
df['total_bill'].idxmax()            # index *label* of the row with the largest total_bill
df.loc[df['total_bill'].idxmax()]    # idxmax returns a label, so fetch the row with .loc (not .iloc)

# min() Index Location
df['total_bill'].idxmin()            # index *label* of the row with the smallest total_bill
df.loc[df['total_bill'].idxmin()]    # label-based lookup, same reason as above

# Multiple Replace: values are replaced pairwise ('Female' -> 'F', 'Male' -> 'M')
df['sex'].replace(['Female', 'Male'], ['F', 'M'])
# or, with a lookup dictionary
dictmap = {'Female' : 'F', 'Male' : 'M'}
df['sex'].map(dictmap)

# Between Method
# Both bounds are included by default. The boolean form `inclusive=True` was
# removed in pandas 2.0; on pandas >= 1.3 pass inclusive='both' / 'neither' /
# 'left' / 'right' to choose explicitly.
df[df['total_bill'].between(10, 20)]

# nlargest/nsmallest
# Each pair below gives the same rows. The two forms are written as separate
# statements because `|` between them would be evaluated as a bitwise OR of
# two DataFrames. nlargest/nsmallest are the more performant choice.
df.nlargest(8, 'tip')
df.sort_values('tip', ascending=False).iloc[0:8]

df.nsmallest(8, 'tip')
df.sort_values('tip', ascending=True).iloc[0:8]

# dropna
df.dropna(thresh=3) # keeps the rows that have at least 3 non-null columns
df.dropna(subset=['last_name']) # only drops rows whose last_name is null

# groupby
df.groupby('model_year').describe() # describe() of every column, per model_year

# numeric_only=True skips non-numeric columns; without it pandas >= 2.0 raises on them
year_cyl = df.groupby(['model_year', 'cylinders']).mean(numeric_only=True)
year_cyl.index.names # gives the level names (['model_year', 'cylinders'])
year_cyl.index.levels # gives the distinct values of each level, e.g. [[70,71,72,73], [2,3,4,5,6]]
year_cyl.loc[[70,80]] # gives the rows of model_year groups 70 and 80
year_cyl.xs(key=70, level='model_year') # gives all rows of group model_year=70
year_cyl.xs(key=5, level='cylinders') # gives all rows of group cylinders=5
year_cyl.swaplevel() # swaps the two index levels (cylinders becomes the outer level)

# merge: inner join of the two frames on their shared 'name' column
pd.merge(registar, login, how='inner', on='name', suffixes = ('_reg', '_log')) 
# suffixes are appended to column names that exist in both frames (other than the join key)
# e.g. registar has 'name', 'id' and login has 'name', 'id' --> the merged frame has 'name', 'id_reg', 'id_log'

# datetime
euro_date = '10-12-2000'    # day-first notation: 10 December 2000
pd.to_datetime(euro_date)   # gives 2000-10-12, i.e. 12 October 2000 (parsed month-first, American style)
pd.to_datetime(euro_date, dayfirst=True) # gives 2000-12-10, i.e. 10 December 2000




# Visualization

In [None]:
# Pie chart of how often each distinct value of the column occurs.
category_counts = data['column_name'].value_counts()
category_counts.plot(kind='pie',
                     figsize=(15,8),
                     autopct='%1.0f%%',  # label each wedge with its whole-number percentage
                     explode=[0.04] * len(category_counts),  # one offset per wedge, whatever the number of categories
                     colors=['pink', 'tomato', 'cornflowerblue', 'orange', 'orchid'],  # 'ping' was a typo and not a valid colour name
                     shadow=True)
