# Clase 7: Ajustando distribuciones a los datos - casos especiales

Prof. Tito Homem-de-Mello

In [None]:
# One-time setup: uncomment the lines below to install the packages that this
# notebook loads with library().
#install.packages('fitdistrplus')
#install.packages('ggplot2')
#install.packages('goft')

In [None]:
library(fitdistrplus)
library(ggplot2)
library(goft)

In [None]:
# Shared ggplot2 theme added to every figure below: size-20 plot/axis titles,
# size-15 axis and legend text, no legend title, legend placed at the bottom.
plottheme <- local({
  title_text <- element_text(size = 20)
  label_text <- element_text(size = 15)
  theme(
    plot.title = title_text,
    axis.title.y = title_text,
    axis.title.x = title_text,
    axis.text.y = label_text,
    axis.text.x = label_text,
    legend.title = element_blank(),
    legend.text = label_text,
    legend.position = "bottom"
  )
})

## Miremos un ejemplo con datos de tiempos de preparación de platos

In [None]:
# Read the data set from "preparacion.csv" (path relative to the working
# directory) and show its first rows.
data <- read.csv(file = "preparacion.csv")
head(x = data)

In [None]:
# Extract the Service.times column of the data frame into X; every later cell
# works on this vector.
X <- data$Service.times

In [None]:
# Relative-frequency histogram of the full sample X: each bar height is the
# bin count divided by the total count, with bins of width `bin`.
bin <- 1
plotfig <- ggplot(data = data, mapping = aes(x = X)) + plottheme

p <- plotfig +
  geom_histogram(
    mapping = aes(y = after_stat(count / sum(count))),
    binwidth = bin,
    color = "black",
    fill = "gray"
  ) +
  labs(
    x = "tiempo",
    y = "frecuencia",
    title = "Histograma de los datos"
  )
p

## Determinemos la distribución de los datos en cada rango ($<6$, $\geq 6$)

In [None]:
# Split the sample at 6: X1 keeps the values strictly below 6,
# X2 keeps the values greater than or equal to 6.
X1 <- X[X < 6]
X2 <- X[X >= 6]

In [None]:
# Number of observations in each of the two sub-samples.
length(X1)
length(X2)

In [None]:
# Relative-frequency histogram of the sub-sample X1 (values below 6),
# with bins of width `bin`.
bin <- 1
plotfig <- ggplot(data = as.data.frame(X1), mapping = aes(x = X1)) + plottheme

p <- plotfig +
  geom_histogram(
    mapping = aes(y = after_stat(count / sum(count))),
    binwidth = bin,
    color = "black",
    fill = "gray"
  ) +
  labs(
    x = "tiempo",
    y = "frecuencia",
    title = "Histograma de X1"
  )
p

In [None]:
# Relative-frequency histogram of the sub-sample X2 (values of 6 or more),
# with bins of width `bin`.
bin <- 1
plotfig <- ggplot(data = as.data.frame(X2), mapping = aes(x = X2)) + plottheme

p <- plotfig +
  geom_histogram(
    mapping = aes(y = after_stat(count / sum(count))),
    binwidth = bin,
    color = "black",
    fill = "gray"
  ) +
  labs(
    x = "tiempo",
    y = "frecuencia",
    title = "Histograma de X2"
  )
p

### Ajuste para X1

In [None]:
# Fit four candidate families to X1 with fitdist (default settings) and print
# the summary of each fit.
a_norm <- fitdist(data = X1, distr = "norm")
summary(a_norm)

a_weib <- fitdist(data = X1, distr = "weibull")
summary(a_weib)

a_gamma <- fitdist(data = X1, distr = "gamma")
summary(a_gamma)

a_lnorm <- fitdist(data = X1, distr = "lnorm")
summary(a_lnorm)

# Compare the fitted densities in a single plot. The legend labels and the
# colours are given in the same order as the fits in the list.
plot.legend <- c("Normal", "Weibull", "Gamma", "Lognormal")
denscomp(
  list(a_norm, a_weib, a_gamma, a_lnorm),
  fitcol = c(2, 4, 6, 1),
  fitlty = 1,
  legendtext = plot.legend
)

In [None]:
# P-P comparison plot of the four fits to X1.
ppcomp(list(a_norm, a_lnorm, a_gamma, a_weib))

In [None]:
# Q-Q comparison plot of the four fits to X1.
qqcomp(list(a_norm, a_lnorm, a_gamma, a_weib))

### Tests estadísticos

In [None]:
# Goodness-of-fit statistics for the four fits to X1. `fitnames` gives the
# labels used to look up individual results in the cells that follow.
gofval <- gofstat(
  list(a_norm, a_lnorm, a_gamma, a_weib),
  fitnames = c("norm", "lnorm", "gamma", "weib")
)
gofval

### Valores-$p$

In [None]:
# One-row table with the chi-squared p-value of each fit, looked up in gofval
# by the fit names passed to gofstat.
pvalue <- data.frame(
  Estadistica = "Valor-p",
  Normal = gofval$chisqpvalue["norm"],
  Lognormal = gofval$chisqpvalue["lnorm"],
  Gamma = gofval$chisqpvalue["gamma"],
  Weibull = gofval$chisqpvalue["weib"]
)
pvalue

In [None]:
# Distribution-specific goodness-of-fit tests from goft, applied to X1.
# For the normal test: use shapiro.test(X) instead if the number of
# observations is not between 10 and 400.
ntest <- normal_test(X1)
ltest <- lnorm_test(X1)
gtest <- gamma_test(X1)
wtest <- weibull_test(X1)

# Collect the four p-values in a one-row table.
pvalue <- data.frame(
  Estadistica = "Valor-p",
  Normal = ntest$p.value,
  Lognormal = ltest$p.value,
  Gamma = gtest$p.value,
  Weibull = wtest$p.value
)
pvalue

### ¿Cuál distribución elegimos para X1?

### Guardemos los parámetros

In [None]:
# Save the Weibull parameters fitted to X1 under their own names; a_weib is
# assigned again further down when the X2 sub-sample is fitted.
# The parameters are looked up by name instead of by position, so a different
# ordering of the estimate vector cannot silently swap shape and scale
# (a missing name stops with an error instead).
shape1 <- a_weib$estimate[["shape"]]
scale1 <- a_weib$estimate[["scale"]]


### Ajuste para X2

In [None]:

# Fit the same four candidate families to X2 with fitdist (default settings)
# and print the summary of each fit. This overwrites the fit objects that were
# created for X1.
a_norm <- fitdist(data = X2, distr = "norm")
summary(a_norm)

a_weib <- fitdist(data = X2, distr = "weibull")
summary(a_weib)

a_gamma <- fitdist(data = X2, distr = "gamma")
summary(a_gamma)

a_lnorm <- fitdist(data = X2, distr = "lnorm")
summary(a_lnorm)

# Compare the fitted densities in a single plot. The legend labels and the
# colours are given in the same order as the fits in the list.
plot.legend <- c("Normal", "Weibull", "Gamma", "Lognormal")
denscomp(
  list(a_norm, a_weib, a_gamma, a_lnorm),
  fitcol = c(2, 4, 6, 1),
  fitlty = 1,
  legendtext = plot.legend
)
















In [None]:
# P-P comparison plot of the four fits to X2.
ppcomp(list(a_norm, a_lnorm, a_gamma, a_weib))

In [None]:
# Q-Q comparison plot of the four fits to X2.
qqcomp(list(a_norm, a_lnorm, a_gamma, a_weib))

In [None]:
# Goodness-of-fit statistics for the four fits to X2. `fitnames` gives the
# labels used to look up individual results in the cells that follow.
gofval <- gofstat(
  list(a_norm, a_lnorm, a_gamma, a_weib),
  fitnames = c("norm", "lnorm", "gamma", "weib")
)
gofval

In [None]:
# One-row table with the chi-squared p-value of each fit, looked up in gofval
# by the fit names passed to gofstat.
pvalue <- data.frame(
  Estadistica = "Valor-p",
  Normal = gofval$chisqpvalue["norm"],
  Lognormal = gofval$chisqpvalue["lnorm"],
  Gamma = gofval$chisqpvalue["gamma"],
  Weibull = gofval$chisqpvalue["weib"]
)
pvalue

In [None]:
# Distribution-specific goodness-of-fit tests from goft, applied to X2.
# For the normal test: use shapiro.test(X) instead if the number of
# observations is not between 10 and 400.
ntest <- normal_test(X2)
ltest <- lnorm_test(X2)
gtest <- gamma_test(X2)
wtest <- weibull_test(X2)

# Collect the four p-values in a one-row table.
pvalue <- data.frame(
  Estadistica = "Valor-p",
  Normal = ntest$p.value,
  Lognormal = ltest$p.value,
  Gamma = gtest$p.value,
  Weibull = wtest$p.value
)
pvalue

### ¿Cuál distribución elegimos para X2?

### Guardemos los parámetros

In [None]:
# Save the normal parameters fitted to X2 for use by the sample generator.
# The parameters are looked up by name instead of by position, so a different
# ordering of the estimate vector cannot silently swap mean and standard
# deviation (a missing name stops with an error instead).
mu2 <- a_norm$estimate[["mean"]]
sd2 <- a_norm$estimate[["sd"]]



## Generemos muestras de la distribución elegida

### Calculemos la proporción de cada grupo

In [None]:
# Fraction of the observations that falls in each sub-sample; p1 is used as
# the mixing probability by the generator below.
p1 <- length(X1) / length(X)
p2 <- length(X2) / length(X)
p1
p2

### Generador de la distribución

In [None]:
# Draw N values from the fitted two-component mixture: a draw comes from the
# Weibull(shape1, scale1) fitted to X1 when its uniform U is <= p1, and from
# the Normal(mu2, sd2) fitted to X2 otherwise.
N <- 1000
U <- runif(N, 0, 1)

# Vectorised sampling: decide the component of every draw first, then fill
# each group with a single generator call instead of one call per draw.
# A group of size zero is handled naturally (the generators return an empty
# vector for n = 0).
from_first <- U <= p1
Z <- numeric(N)
Z[from_first] <- rweibull(sum(from_first), shape1, scale1)
Z[!from_first] <- rnorm(sum(!from_first), mu2, sd2)
# NOTE(review): neither component is truncated, so simulated values may fall
# on either side of the cut-off used to split the data; confirm this is the
# intended model.

# Relative-frequency histogram of the simulated sample, with bins of width
# `bin`, for visual comparison with the histogram of the original data.
bin <- 1
plotfig <- ggplot(data = data.frame(Z = Z), mapping = aes(x = Z)) + plottheme

p <- plotfig +
  geom_histogram(
    mapping = aes(y = after_stat(count / sum(count))),
    binwidth = bin,
    color = "black",
    fill = "gray"
  ) +
  labs(
    x = "tiempo",
    y = "frecuencia",
    title = "Histograma de los datos simulados"
  )
p

In [None]:
# Relative-frequency histogram of the original sample X, drawn again next to
# the simulated one. `plotfig` is rebuilt here on the original data.
bin <- 1
plotfig <- ggplot(data = data, mapping = aes(x = X)) + plottheme

p <- plotfig +
  geom_histogram(
    mapping = aes(y = after_stat(count / sum(count))),
    binwidth = bin,
    color = "black",
    fill = "gray"
  ) +
  labs(
    x = "tiempo",
    y = "frecuencia",
    title = "Histograma de los datos originales"
  )
p

### Comparemos las distribuciones empíricas de los datos originales y los simulados

In [None]:

# Overlay the empirical distribution functions of the original data (black)
# and of the simulated sample (red) for x between 4 and 20. The layers are
# added to the existing `plotfig` object.
colors <- c("Simulada" = "red", "Datos" = "black")

plotfig +
  geom_function(fun = ecdf(X), mapping = aes(color = "Datos")) +
  geom_function(fun = ecdf(Z), mapping = aes(color = "Simulada")) +
  xlim(4, 20) +
  labs(
    x = "x",
    y = "F_N(x)",
    title = "Distribución empírica",
    color = "Legend"
  ) +
  scale_color_manual(values = colors)











