<a href="https://colab.research.google.com/github/thariqziyad/data-generation/blob/main/data_gen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing packages

In [84]:
import pandas as pd
import numpy as np
from scipy.stats import truncnorm as t_norm
import warnings

# Loading parameters

In [28]:
df = pd.read_csv('source.csv')
df.head()

Unnamed: 0,Mean,Std Dev
0,39583,",2041"
1,38750,",3378"
2,36667,",4815"
3,37500,",4423"
4,35000,",5108"


In [14]:
df.tail()

Unnamed: 0,Mean,Std Dev
51,31250,",4484"
52,30417,",3586"
53,32083,",4149"
54,30417,",4643"
55,30000,",4170"


In [15]:
df.info()
print("Max mean:",max(df.Mean),"Min mean:", min(df.Mean))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56 entries, 0 to 55
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Mean     56 non-null     int64 
 1   Std Dev  56 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.0+ KB
Max mean: 39583 Min mean: 28333


# Data cleaning

We see that neither columns is considered as Float. We make the necessary edits.

## Mean:
 Integers to floats

In [68]:
df2 = df.assign(Means=lambda x: x.Mean/10000).drop(columns='Mean')
df2.head()

Unnamed: 0,Std Dev,Means
0,",2041",3.9583
1,",3378",3.875
2,",4815",3.6667
3,",4423",3.75
4,",5108",3.5


## Standard Deviation

String to floats

In [71]:
s_str = df2.loc[:,'Std Dev'].str[1:]
s_num = pd.to_numeric(s_str)

df3 = df2.assign(Std_dev=s_num/10000).drop(columns='Std Dev')
print(df3.shape)
df3.head()

(56, 2)


Unnamed: 0,Means,Std_dev
0,3.9583,0.2041
1,3.875,0.3378
2,3.6667,0.4815
3,3.75,0.4423
4,3.5,0.5108


# Generating data

## Create function

To help ease coding

In [191]:
def data_gen(mu,std,responden=10):
  if std<0.2:
    warnings.warn('Standard deviation is less than 0.2; resulting mean and standard deviation may be less accurate.')
  a_trunc = 1
  b_trunc = 4
  a, b = (a_trunc - mu) / std, (b_trunc - mu) / std

  #initializing variables
  mu_dif = 5
  std_dif = 5
  iter = 0

  while mu_dif > 0.005 or std_dif > 0.005:
    # mu_init = np.mean(init_num)
    # std_init = np.std(init_num)
    random_numbers = t_norm.rvs(a, b, loc=mu, scale=std,size=responden)
    std_b = np.std(random_numbers)
    mu_b = np.mean(random_numbers)
    fin_num = mu + ((random_numbers-mu_b)*std/std_b)
    integers = np.round(fin_num).astype(int)
    mu_dif = abs(np.mean(integers)-mu)
    # mu_difi = abs(mu_init-mu)
    std_dif = abs(np.std(integers)-std)
    # std_difi = abs(std_init-std)
    # if mu_dif<mu_difi or std_dif<std_difi:
    #   fin_num=np.round(init_num).astype(int)
    # else:
    #   fin_num=init_num
    # init_num = random_numbers
    iter += 1
    if iter > 10000:
      break
  # print(iter)
  return integers


mu_1 = 3.9498
st_1 = 0.1988
res = data_gen(mu_1,st_1)



# Generate data

In [210]:
res_dict = {}
data_length = 155

for i in df3.index:
  mu_gen = df3.Means[i]
  std_gen = df3.Std_dev[i]
  all_data =  data_gen(mu_gen,std_gen,data_length)
  res_dict[i] = [x for x in all_data]
  res_dict[i].append(mu_gen)
  res_dict[i].append(np.mean(all_data))
  res_dict[i].append(abs(mu_gen-np.mean(all_data)))
  res_dict[i].append(std_gen)
  res_dict[i].append(np.std(all_data))
  res_dict[i].append(abs(std_gen-np.std(all_data)))
  #print(res_dict[i])

#res_dict
data = pd.DataFrame.from_dict(res_dict)
compare_idx = {
    data_length:'Original mean',
    data_length+1:'Resulting mean',
    data_length+2:'Mean difference',
    data_length+3:'Original standard deviation',
    data_length+4:'Resulting standard deviation',
    data_length+5:'Standard deviation difference',
}
data.rename(index=compare_idx,inplace=True)
data.tail(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
151,3.0,4.0,4.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,...,3.0,3.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,3.0
152,3.0,4.0,4.0,3.0,3.0,3.0,4.0,3.0,4.0,4.0,...,3.0,3.0,4.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0
153,4.0,4.0,3.0,4.0,3.0,4.0,4.0,4.0,4.0,4.0,...,3.0,4.0,3.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0
154,4.0,4.0,4.0,4.0,3.0,3.0,4.0,4.0,3.0,4.0,...,3.0,4.0,2.0,3.0,2.0,3.0,4.0,4.0,3.0,3.0
Original mean,3.9583,3.875,3.6667,3.75,3.5,3.6667,3.7083,3.7083,3.7917,3.875,...,3.375,3.375,3.2083,3.2917,3.125,3.125,3.0417,3.2083,3.0417,3.0
Resulting mean,3.954839,3.870968,3.664516,3.754839,3.496774,3.670968,3.709677,3.709677,3.780645,3.870968,...,3.374194,3.374194,3.2,3.348387,3.122581,3.090323,3.03871,3.187097,3.064516,3.019355
Mean difference,0.003461,0.004032,0.002184,0.004839,0.003226,0.004268,0.001377,0.001377,0.011055,0.004032,...,0.000806,0.000806,0.0083,0.056687,0.002419,0.034677,0.00299,0.021203,0.022816,0.019355
Original standard deviation,0.2041,0.3378,0.4815,0.4423,0.5108,0.4815,0.4643,0.4643,0.4149,0.3378,...,0.5758,0.5758,0.4149,0.4643,0.6124,0.4484,0.3586,0.4149,0.4643,0.417
Resulting standard deviation,0.207658,0.335236,0.485631,0.444927,0.512731,0.483398,0.467909,0.467909,0.429117,0.335236,...,0.644064,0.633967,0.500322,0.563326,0.615242,0.572124,0.357119,0.492356,0.564286,0.513947
Standard deviation difference,0.003558,0.002564,0.004131,0.002627,0.001931,0.001898,0.003609,0.003609,0.014217,0.002564,...,0.068264,0.058167,0.085422,0.099026,0.002842,0.123724,0.001481,0.077456,0.099986,0.096947


In [209]:
print('Maximum mean difference:',data.loc['Mean difference',:].max())
print('Maximum std difference:',data.loc['Standard deviation difference',:].max())

Maximum mean difference: 0.04999999999999982
Maximum std difference: 0.11528275151121242


#Results

## Check results

Just something to help me understand how the code works

In [207]:
mu=df3.Means[8]
std=df3.Std_dev[8]
fin = data_gen(mu,std,155) #data.iloc[:100,0]

print("Desired Standard Deviation:", std)
print("Actual Standard Deviation of Generated Integers:", np.std(fin))
print("Standard Deviation difference:",abs(np.std(fin)-std))
print("Desired Mean:", mu)
print("Actual Mean of Generated Integers:", np.mean(fin))
print("Mean difference:",abs(np.mean(fin)-mu))
print(fin.max(),fin.min())
print(set(fin))

Desired Standard Deviation: 0.4149
Actual Standard Deviation of Generated Integers: 0.4411695741233228
Standard Deviation difference: 0.026269574123322814
Desired Mean: 3.7917
Actual Mean of Generated Integers: 3.761290322580645
Mean difference: 0.030409677419354963
4 2
{2, 3, 4}


## Write CSV file

In [211]:
data.to_csv(path_or_buf='data.csv',index=False)