In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pyspark.sql import functions as F
import os
# Models 
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn import linear_model

# Neuronal Network
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

# standardization
from sklearn.preprocessing import StandardScaler

# folds
from sklearn.model_selection import TimeSeriesSplit

# Evaluation metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

# statistical calculations
from scipy import stats
import statsmodels.api as sm

# visualization
import plotly.express as px
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings("ignore")
random_state = 1601

In [2]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from xgboost.sklearn import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from functools import partial

In [3]:
from lightgbm import plot_importance

In [4]:
'''Customize visualization
Seaborn and matplotlib visualization.'''
import matplotlib.pyplot as plt
import seaborn as sns
#plt.style.use('bmh')                    
sns.set_style({'axes.grid':False}) 

# Cargar Datos

In [5]:
current_path =  os.getcwd()
train_file_path =  './generated_data/train.csv'
train_path = os.path.join(current_path, train_file_path)
print(f'The path file for the train dataset is: {train_path}')

The path file for the train dataset is: C:\Users\Admin\Desktop\finally-datascience\./generated_data/train.csv


In [6]:
train = pd.read_csv(train_path).reset_index(drop=True)

In [7]:
for idx, column in enumerate(train.columns):
    print(f'{idx} {column}')

0 fecha
1 total_calls
2 mean_total_calls_historical_last_3_days
3 max_total_calls_historical_last_3_days
4 stddev_total_calls_historical_last_3_days
5 var_total_calls_historical_last_3_days
6 min_total_calls_historical_last_3_days
7 mean_total_calls_duration_last_3_days
8 max_total_calls_duration_last_3_days
9 stddev_total_calls_duration_last_3_days
10 var_total_calls_duration_last_3_days
11 min_total_calls_duration_last_3_days
12 mean_missing_calls_last_3_days
13 max_missing_calls_last_3_days
14 stddev_missing_calls_last_3_days
15 var_missing_calls_last_3_days
16 min_missing_calls_last_3_days
17 mean_available_time_last_3_days
18 max_available_time_last_3_days
19 stddev_available_time_last_3_days
20 var_available_time_last_3_days
21 min_available_time_last_3_days
22 mean_away_time_last_3_days
23 max_away_time_last_3_days
24 stddev_away_time_last_3_days
25 var_away_time_last_3_days
26 min_away_time_last_3_days
27 mean_busy_time_last_3_days
28 max_busy_time_last_3_days
29 stddev_busy_ti

# Target
y = 'número total de llamadas en los próx 30 dias'

In [8]:
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .master("local")
    .appName("forecasting")
    .getOrCreate()
)

In [9]:
from pyspark.sql.types import *

def create_dataframe_from_schema(schema):
    empty_RDD = spark.sparkContext.emptyRDD()
    return spark.createDataFrame(empty_RDD, schema)

schema = StructType(
        [
            StructField("date_observacional", DateType(), True)
        ]
    )

In [10]:
from pyspark.sql import DataFrame

class PrepareDatasets:
    
    def __init__(self, historical_df: DataFrame,
        x_key: str, historical_key: str):
        self.historical_df = historical_df
        self.x_key = x_key
        self.historical_key = historical_key
        self.keys = [F.col(f'x_df.{self.x_key}')
              ==F.col(f'historical_df.{self.historical_key}')]
    def _prepare_simple_join(self, x_df: DataFrame, consider_columns: list):
        

        filt_historical_df = (self.historical_df
                                  .select(consider_columns+[self.historical_key])
                             )
        x_new_df = x_df.alias('x_df').join(filt_historical_df
                                     .alias('historical_df'),
                    on=self.keys, how='left')
        
        return x_new_df.drop(self.historical_key)
    
    def _prepare_agg(self, x_df,
                consider_columns: list, dias_comp: list ):
        
        emp_df = create_dataframe_from_schema(schema)
        
        keys = [F.col(f'x_df.{self.x_key}')
              >=F.col(f'historical_df.{self.historical_key}')]
        
        # x_df = spine_df
        full_df = (x_df.alias('x_df')
                  .join(self.historical_df
                        .alias('historical_df'), 
                on=keys, how='left')
                  .withColumn('diff_days', F.datediff(self.x_key, self.historical_key) )
              )
        
        for days_i in dias_comp:
            for column_i in consider_columns:
                
                filter_df = (full_df.filter(F.col('diff_days')<=days_i)
                                    .groupby(self.x_key)
                                    .agg(
                                       (
                                            F.mean(
                                                F.col(column_i)
                                            )
                                        ).alias(f"mean_{column_i}_last_" + str(days_i) + "_days")
                                        ,
                                        (
                                            F.max(
                                                F.col(column_i)
                                            )
                                        ).alias(f"max_{column_i}_last_" + str(days_i) + "_days")
                                        ,
                                        (
                                            F.stddev(
                                                F.col(column_i)
                                            )

                                        ).alias(f'stddev_{column_i}_last_{str(days_i)}_days'),
                                        (
                                            F.variance(
                                                 F.col(column_i)
                                             ).alias(f"var_{column_i}_last_" + str(days_i) + "_days")
                                         ),
                                        (
                                            F.min(
                                                F.col(column_i)
                                            )
                                        ).alias(f"min_{column_i}_last_" + str(days_i) + "_days")


                                    )
                            )
                emp_df = emp_df.join(filter_df, on=self.x_key, how='full')
                
        return emp_df
    
    def run_all(self, x_df,
            consider_columns: list, dias_comp: list ):
        
        x_og = x_df.select(self.x_key)
        x_agg_1 = self._prepare_simple_join(x_df, consider_columns)
        x_agg_2 = self._prepare_agg(x_df, consider_columns, dias_comp)
        
        x_new = (x_og
                 .join(x_agg_1, on=[self.x_key], how='left')
                 .join(x_agg_2, on=[self.x_key], how='left')
                )
        return x_new

In [11]:
# CARGAR EL ARCHIVO CSV DE LA raw data

In [12]:
current_path =  os.getcwd()
raw_data_file_path = './data_raw/train.csv'
raw_data_path = os.path.join(current_path, raw_data_file_path) 
print(f'The path file for the raw data is: {raw_data_path}')

The path file for the raw data is: C:\Users\Admin\Desktop\finally-datascience\./data_raw/train.csv


In [13]:
historical_df = (
    spark.read
    .option('header', 'true')
    .format('csv')
    .load(raw_data_path)
)

In [14]:
historical_df.columns

['interval',
 'total_calls',
 'total_calls_duration',
 'missing_calls',
 'available_time',
 'away_time',
 'busy_time',
 'on_a_call_time',
 'after_call_work_time',
 'total_handle_time',
 'occupancy_rate',
 'utilization_rate',
 'shrinkage_rate',
 'agent_headcount']

In [15]:
historical_df.toPandas()

Unnamed: 0,interval,total_calls,total_calls_duration,missing_calls,available_time,away_time,busy_time,on_a_call_time,after_call_work_time,total_handle_time,occupancy_rate,utilization_rate,shrinkage_rate,agent_headcount
0,2017-01-01 00:00:00,227,11880,8,9721,28806,0,14047,4543,18590,49.5666789222123,32.5472276204983,50.4333210777877,56
1,2017-01-01 00:15:00,235,15297,4,10716,34697,0,16636,4932,21568,48.19874292709871,32.2001761693615,51.801257072901294,60
2,2017-01-01 00:30:00,219,13739,5,16380,24186,0,16092,6280,22372,61.571705487940505,35.546092980393404,38.428294512059495,61
3,2017-01-01 00:45:00,256,16534,5,8086,12307,0,16253,5932,22185,71.0954013809949,52.1043731504533,28.904598619005103,57
4,2017-01-01 01:00:00,240,17692,14,8371,25296,0,18225,7217,25442,57.2044866264021,43.042514676276,42.7955133735979,61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67099,2018-11-30 22:45:00,673,93391,2,10799,96238,217301,72372,19346,91718,24.6401926663718,22.044628607687397,75.35980733362821,273
67100,2018-11-30 23:00:00,676,100549,9,17564,100986,182983,82095,20250,102345,29.689411158815304,25.340573143375,70.3105888411847,283
67101,2018-11-30 23:15:00,717,122798,18,27736,104007,244592,70592,42260,112852,28.7391120369102,23.069296608454398,71.2608879630898,298
67102,2018-11-30 23:30:00,747,116056,13,11577,112657,267705,89426,21767,111193,24.401151188952397,22.100164569138894,75.59884881104759,293


In [16]:
#sumamos todas las llamadas de cada hora x dia

In [17]:
historical_df = historical_df.withColumn('fecha', F.to_date(F.col('interval')))

In [18]:
historical_df.limit(5).toPandas()

Unnamed: 0,interval,total_calls,total_calls_duration,missing_calls,available_time,away_time,busy_time,on_a_call_time,after_call_work_time,total_handle_time,occupancy_rate,utilization_rate,shrinkage_rate,agent_headcount,fecha
0,2017-01-01 00:00:00,227,11880,8,9721,28806,0,14047,4543,18590,49.5666789222123,32.5472276204983,50.4333210777877,56,2017-01-01
1,2017-01-01 00:15:00,235,15297,4,10716,34697,0,16636,4932,21568,48.19874292709871,32.2001761693615,51.801257072901294,60,2017-01-01
2,2017-01-01 00:30:00,219,13739,5,16380,24186,0,16092,6280,22372,61.571705487940505,35.546092980393404,38.428294512059495,61,2017-01-01
3,2017-01-01 00:45:00,256,16534,5,8086,12307,0,16253,5932,22185,71.0954013809949,52.1043731504533,28.904598619005103,57,2017-01-01
4,2017-01-01 01:00:00,240,17692,14,8371,25296,0,18225,7217,25442,57.2044866264021,43.042514676276,42.7955133735979,61,2017-01-01


In [19]:
# obtenemos las fechas
spine_df =  historical_df.select(F.col('fecha')).distinct()

In [20]:
target_df = (
    historical_df.groupBy(F.col('fecha'))
    .agg(F.sum(F.col('total_calls')).cast('int').alias('total_calls'))
    .withColumn('fecha', F.add_months(F.col('fecha'), -1))
)

In [21]:
spine_df.toPandas()

Unnamed: 0,fecha
0,2017-08-11
1,2017-09-11
2,2018-05-28
3,2018-08-10
4,2017-01-06
...,...
694,2017-12-21
695,2018-01-07
696,2018-01-10
697,2018-02-03


In [22]:
target_df.toPandas()

Unnamed: 0,fecha,total_calls
0,2017-07-11,15462
1,2017-08-11,13963
2,2018-04-28,27654
3,2018-07-10,29415
4,2016-12-06,7763
...,...,...
694,2017-11-21,21888
695,2017-12-07,18030
696,2017-12-10,17937
697,2018-01-03,25813


In [23]:
prepare_dataset = PrepareDatasets(
    historical_df=historical_df, 
    x_key='fecha',
    historical_key='fecha'
)

col_consider = [
 'total_calls',
 'total_calls_duration',
 'missing_calls',
 'available_time',
 'away_time',
 'busy_time',
 'on_a_call_time',
 'after_call_work_time',
 'total_handle_time',
 'occupancy_rate',
 'utilization_rate',
 'shrinkage_rate',
 'agent_headcount'
]

dias_consider = [3, 7, 15]

In [24]:
data_pro = prepare_dataset.run_all(spine_df, col_consider, dias_consider)

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `fecha` is ambiguous, could be: [`historical_df`.`fecha`, `x_df`.`fecha`].