<a href="https://colab.research.google.com/github/thariqziyad/data-generation/blob/main/st_mu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

The purpose of this notebook is to find out which standard deviation values give sufficiently accurate means when `data_gen` is called.

# Importing packages

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import truncnorm as t_norm

# Load data source

In [None]:
# Read the source table from disk.
df = pd.read_csv('source.csv')

# The two leading columns become a (Dimension, No) MultiIndex; the remaining
# columns are kept as the data of df2.
df_index = df.iloc[:, :2]
mult_index = pd.MultiIndex.from_frame(df_index, names=['Dimension', 'No'])
df2 = df.iloc[:, 2:].set_index(mult_index)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,Service and Facilities,Expectation Value,Perception Value
Dimension,No,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
TANGIBLE,1,Cleanliness and comfort of the hospital treatm...,3.92,3.04
TANGIBLE,2,"Quality of food (punctual, attractive presenta...",3.88,3.08
TANGIBLE,3,Cleanliness and completeness of nurse's room,3.76,3.04
TANGIBLE,4,Cleanliness of public toilets,3.7,2.72
TANGIBLE,5,Clarity of signs indicating the location (clin...,3.54,2.99
TANGIBLE,6,Neatness and appearance of the nurses in the i...,3.53,3.21
ASSURANCE,1,Friendliness and courtesy of inpatient securit...,3.68,3.07
ASSURANCE,2,Friendliness and courtesy of reception staff,3.72,3.11
ASSURANCE,3,Friendliness and courtesy of Hospital informat...,3.79,3.03
ASSURANCE,4,Doctor's ability to handle disease complaints,3.94,3.2


# Build DataFrame to use

The rows of `st_mu` are all the mean values that exist in the data source, and its columns are the generated standard deviations (the visualizations below show it transposed).

In [None]:
# Distinct values of df2's column 1 (expectation) and column 2 (perception);
# the expectation values are then appended to the perception list.
expec_mean = list(set(df2.iloc[:, 1]))
percep_mean = list(set(df2.iloc[:, 2]))
percep_mean.extend(expec_mean)

# Every distinct mean from either column, in ascending order.
mean_list = sorted(set(percep_mean))

# Candidate standard deviations, rounded to 3 decimals:
#   4 points in [0.2, 0.35), 30 - 9 = 21 points in [0.35, 0.5), 5 points in [0.5, 0.65].
# NOTE(review): "30 - 9" presumably means 30 values in total minus the 4 + 5
# from the coarse ranges -- confirm.
std_list = list(np.round(np.linspace(0.2, 0.35, 4, endpoint=False), 3))
std_35 = list(np.round(np.linspace(0.35, 0.5, 30 - 9, endpoint=False), 3))
std_56 = list(np.round(np.linspace(0.5, 0.65, 5), 3))
std_list.extend(std_35)
std_list.extend(std_56)

# Uninitialised (mean x std) grid: rows are labelled by mean, columns by std.
mean_index = pd.Index(mean_list, name='mean')
std_index = pd.Index(std_list, name='std')
st_mu = pd.DataFrame(
    np.empty((len(mean_list), len(std_list))),
    index=mean_index,
    columns=std_index,
)
st_mu.size

1170

# Define `data_gen`

The main function to use.

In [None]:
def data_gen(mu, std, responden, tol=0.005, max_iter=10000, random_state=None):
  """Draw integer responses whose sample mean and std approximate ``mu`` and ``std``.

  Samples of size ``responden`` are drawn repeatedly from a normal distribution
  truncated to [1, 4], rounded to integers, and accepted as soon as both the
  mean and the standard deviation (``np.std``) of the rounded sample are within
  ``tol`` of the targets, or once ``max_iter`` samples have been tried.

  Parameters
  ----------
  mu : float
      Target mean.
  std : float
      Target standard deviation; must be positive.
  responden : int
      Number of values to generate per sample.
  tol : float, optional
      Largest accepted absolute deviation for both mean and std.
  max_iter : int, optional
      Maximum number of samples to try (at least 1).
  random_state : None, int or numpy random generator, optional
      ``None`` keeps the previous behaviour (scipy's default random state).
      An integer seeds one ``numpy.random.Generator`` that is shared by all
      draws of this call; any other object is handed to scipy unchanged.

  Returns
  -------
  tuple
      ``(integers, n_iter)``: the last rounded sample (integer array) and the
      number of samples drawn. ``n_iter == max_iter`` when the cap was reached.

  Raises
  ------
  ValueError
      If ``std`` is not positive or ``max_iter`` is smaller than 1.
  """
  if std <= 0:
    raise ValueError("std must be positive")
  if max_iter < 1:
    raise ValueError("max_iter must be at least 1")

  a_trunc = 1
  b_trunc = 4
  # truncnorm expects the bounds in units of standard deviations from loc.
  a, b = (a_trunc - mu) / std, (b_trunc - mu) / std

  # Seed once per call: passing the same integer to every rvs() call would
  # return the identical sample on each iteration.
  if isinstance(random_state, (int, np.integer)):
    rng = np.random.default_rng(random_state)
  else:
    rng = random_state

  n_iter = 0
  while True:
    random_numbers = t_norm.rvs(a, b, loc=mu, scale=std, size=responden,
                                random_state=rng)
    fin_num = random_numbers
    if mu > 3.5: #after several tries, this code helps with accuracy
      std_b = np.std(random_numbers)
      # A sample without spread cannot be rescaled (division by zero).
      if std_b > 0:
        mu_b = np.mean(random_numbers)
        fin_num = mu + ((random_numbers - mu_b) * std / std_b)
        # Rescaling can move values past the truncation bounds; clip so that
        # rounding never yields a response outside [a_trunc, b_trunc].
        fin_num = np.clip(fin_num, a_trunc, b_trunc)
    integers = np.round(fin_num).astype(int)
    mu_dif = abs(np.mean(integers) - mu)
    std_dif = abs(np.std(integers) - std)
    n_iter += 1
    if not (mu_dif > tol or std_dif > tol) or n_iter >= max_iter:
      break

  return integers, n_iter

# Applying function

We fill `st_mu` with the number of iterations the `data_gen` function needs for each combination of mean and standard deviation.

In [None]:
# Store, for every (mean, std) cell of st_mu, how many iterations data_gen
# needed. 155 is the sample size passed as `responden`.
total_cells = st_mu.size
cells_done = 0
for row in range(st_mu.shape[0]):
  mu_val = st_mu.index[row]
  for col in range(st_mu.shape[1]):
    std_val = st_mu.columns[col]
    # The count is bound to `n_iter`, not `iter`, so the builtin iter() is not
    # shadowed for the rest of the notebook.
    _samples, n_iter = data_gen(mu_val, std_val, 155)
    st_mu.iat[row, col] = n_iter
    cells_done += 1
    # Progress output only; it does not affect any following code.
    print(round(100 * cells_done / total_cells, 2), '%', ',i:', mu_val, 'j:', std_val)


0.09 % ,i: 2.72 j: 0.2
0.17 % ,i: 2.72 j: 0.238
0.26 % ,i: 2.72 j: 0.275
0.34 % ,i: 2.72 j: 0.312
0.43 % ,i: 2.72 j: 0.35
0.51 % ,i: 2.72 j: 0.357
0.6 % ,i: 2.72 j: 0.364
0.68 % ,i: 2.72 j: 0.371
0.77 % ,i: 2.72 j: 0.379
0.85 % ,i: 2.72 j: 0.386
0.94 % ,i: 2.72 j: 0.393
1.03 % ,i: 2.72 j: 0.4
1.11 % ,i: 2.72 j: 0.407
1.2 % ,i: 2.72 j: 0.414
1.28 % ,i: 2.72 j: 0.421
1.37 % ,i: 2.72 j: 0.429
1.45 % ,i: 2.72 j: 0.436
1.54 % ,i: 2.72 j: 0.443
1.62 % ,i: 2.72 j: 0.45
1.71 % ,i: 2.72 j: 0.457
1.79 % ,i: 2.72 j: 0.464
1.88 % ,i: 2.72 j: 0.471
1.97 % ,i: 2.72 j: 0.479
2.05 % ,i: 2.72 j: 0.486
2.14 % ,i: 2.72 j: 0.493
2.22 % ,i: 2.72 j: 0.5
2.31 % ,i: 2.72 j: 0.538
2.39 % ,i: 2.72 j: 0.575
2.48 % ,i: 2.72 j: 0.612
2.56 % ,i: 2.72 j: 0.65
2.65 % ,i: 2.92 j: 0.2
2.74 % ,i: 2.92 j: 0.238
2.82 % ,i: 2.92 j: 0.275
2.91 % ,i: 2.92 j: 0.312
2.99 % ,i: 2.92 j: 0.35
3.08 % ,i: 2.92 j: 0.357
3.16 % ,i: 2.92 j: 0.364
3.25 % ,i: 2.92 j: 0.371
3.33 % ,i: 2.92 j: 0.379
3.42 % ,i: 2.92 j: 0.386
3.5 % ,i: 2.92

# DataFrame visualization

Using the `background_gradient()` method, we can see which combinations of mean and standard deviation reach the iteration limit in the function (shown as 10000).

In [None]:
# Transposed integer view (std on the rows); mean columns at positions 0-15.
first_means = st_mu.T.astype(int).iloc[:, :16]
first_means.style.background_gradient()

mean,2.720000,2.920000,2.990000,3.010000,3.030000,3.040000,3.060000,3.070000,3.080000,3.090000,3.110000,3.120000,3.130000,3.200000,3.210000,3.260000
std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0.2,10000,10000,1416,66,3356,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.238,10000,10000,29,182,44,10000,547,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.275,10000,225,39,47,60,10,52,10000,67,10000,10000,10000,10000,10000,10000,10000
0.312,10000,134,100,8,6,10000,4,10000,126,453,1236,10000,10000,10000,10000,10000
0.35,10000,150,76,319,9,10000,332,10000,54,542,7021,945,9809,10000,10000,10000
0.357,10000,232,30,42,257,81,29,430,512,10000,10000,7605,10000,10000,10000,10000
0.364,10000,121,33,268,251,10000,2505,903,884,2620,10000,5281,10000,10000,10000,10000
0.371,10000,346,2983,3315,355,577,249,10000,195,10000,10000,3700,8900,10000,10000,10000
0.379,10000,3033,1648,802,1267,1354,573,6110,2945,9333,10000,10000,10000,10000,10000,10000
0.386,10000,2192,774,2756,588,10000,186,10000,10000,3642,10000,10000,9975,10000,10000,10000


In [None]:
# Transposed integer view (std on the rows); mean columns at positions 15-29.
# Position 15 is also part of the preceding `:16` view.
middle_means = st_mu.T.astype(int).iloc[:, 15:30]
middle_means.style.background_gradient()

mean,3.260000,3.280000,3.360000,3.530000,3.540000,3.610000,3.680000,3.700000,3.720000,3.740000,3.760000,3.770000,3.780000,3.790000,3.800000
std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0.2,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.238,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.275,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.312,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.35,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.357,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.364,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.371,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.379,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000
0.386,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000


In [None]:
# Remaining mean columns. The slice starts at position 30 because the previous
# view ends at position 29; starting at 31 left column 30 out of every view.
st_mu.T.astype(int).iloc[:, 30:].style.background_gradient()

mean,3.830000,3.840000,3.880000,3.890000,3.900000,3.920000,3.930000,3.940000
std,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.2,10000,10000,10000,10000,10000,10000,10000,10000
0.238,10000,10000,10000,10000,10000,10000,10000,4
0.275,10000,10000,10000,10000,10000,6,10000,889
0.312,10000,10000,10000,12,47,7117,10000,10000
0.35,10000,10000,15,31,10000,10000,10000,10000
0.357,10000,10000,7755,10000,10000,10000,6869,10000
0.364,10000,50,27,10000,10000,10000,10000,10000
0.371,50,110,10000,10000,10000,10000,10000,10000
0.379,19,10000,10000,10000,8381,10000,10000,10000
0.386,93,133,8271,10000,10000,10000,10000,10000


# Write CSV file

Finally, the resulting data is extracted for download.

In [None]:
# Save the iteration-count grid, with its mean index and std header, as CSV.
st_mu.to_csv('st_mu.csv')