In [54]:
import numpy as np
import pandas as pd
import random

In [55]:
seed_value = 42
random.seed(seed_value)
np.random.seed(seed_value)

In [56]:
class CreateSampleDataset():
    def __init__(self, n_teachers=1000, school_teacher_ratio=1.2, teacher_per_school=15.,
                 std_dev_teacher_per_school=3., min_teacher_per_school=5, p_rural=0.3, p_car=0.7, p_experienced=0.2):
        self.n_teachers = n_teachers
        self.n_schools = int(n_teachers * school_teacher_ratio / teacher_per_school)
        self.teacher_per_school = teacher_per_school
        self.std_dev_teacher_per_school = std_dev_teacher_per_school
        self.min_teacher_per_school = min_teacher_per_school
        self.p_rural = p_rural
        self.p_car = p_car
        self.p_experienced = p_experienced

    def _createTeachers(self):
        x_location = np.random.uniform(0., 1., self.n_teachers)
        y_location = np.random.uniform(0., 1., self.n_teachers)
        rural = np.random.choice([0, 1], p=[1 - self.p_rural, self.p_rural], size=self.n_teachers)
        car = np.random.choice([0, 1], p=[1 - self.p_car, self.p_car], size=self.n_teachers)
        experienced = np.random.choice([0, 1], p=[1 - self.p_experienced, self.p_experienced], size=self.n_teachers)
        teachers = {'x_location': x_location,
                    'y_location': y_location,
                    'rural': rural,
                    'car': car,
                    'experienced': experienced,
                    }

        return teachers

    def _createSchools(self):
        x_location = np.random.uniform(0., 1., self.n_schools)
        y_location = np.random.uniform(0., 1., self.n_schools)
        rural = np.random.choice([0, 1], p=[1 - self.p_rural, self.p_rural], size=self.n_schools)
        teachers_needed = np.random.normal(loc=self.teacher_per_school, scale=self.std_dev_teacher_per_school, 
                                           size=self.n_schools).astype(int)
        teachers_needed_clipped = np.clip(teachers_needed, a_min=self.min_teacher_per_school, a_max=None, 
                                          out=teachers_needed)
        schools = {'x_location': x_location,
                   'y_location': y_location,
                   'rural': rural,
                   'teachers_needed': teachers_needed_clipped,
                   }

        return schools

    def make_dataset(self):
        teachers = pd.DataFrame(self._createTeachers())
        schools = pd.DataFrame(self._createSchools())

        return teachers, schools

In [57]:
DatasetSampler = CreateSampleDataset()

In [58]:
teachers, schools = DatasetSampler.make_dataset()

In [59]:
teachers.head()

Unnamed: 0,x_location,y_location,rural,car,experienced
0,0.37454,0.185133,0,1,0
1,0.950714,0.541901,0,1,1
2,0.731994,0.872946,1,0,0
3,0.598658,0.732225,0,1,0
4,0.156019,0.806561,0,1,0


In [60]:
schools.head()

Unnamed: 0,x_location,y_location,rural,teachers_needed
0,0.393636,0.074896,0,14
1,0.473436,0.10616,0,16
2,0.854547,0.837473,0,17
3,0.340004,0.23982,0,15
4,0.86965,0.194958,0,16
