In [0]:
%run ./01-config

In [0]:
class Producer():
    """Copy numbered test-data files into the landing zone and validate them.

    Source files are read from ``<base_dir_data>/test_data/`` and copied to
    sub-directories of ``<base_dir_data>/raw/``. The names ``Config``,
    ``dbutils`` and ``spark`` are not defined in this block and must already
    be available as globals (presumably supplied by the notebook runtime and
    the ``./01-config`` notebook -- confirm).
    """

    def __init__(self):
        conf = Config()
        self.landing_zone = conf.base_dir_data + "/raw/"
        self.test_data_zone = conf.base_dir_data + "/test_data/"

    def _copy_test_file(self, file_name, target_dir):
        """Copy one file from the test-data zone into the landing zone.

        Args:
            file_name: Name of the file inside ``self.test_data_zone``; the
                same name is used at the destination.
            target_dir: Sub-directory of ``self.landing_zone`` to copy into.
        """
        source_path = self.test_data_zone + file_name
        target_path = self.landing_zone + f"{target_dir}/{file_name}"
        print(f"producing {source_path}.....", end='')
        dbutils.fs.cp(source_path, target_path)
        print("Done")

    def registered_users(self, setnum):
        """Copy ``1-registered_users_<setnum>.csv`` to ``registered_users/``."""
        self._copy_test_file(f"1-registered_users_{setnum}.csv", "registered_users")

    def user_info(self, setnum):
        """Copy ``2-user_info_<setnum>.json`` to ``kafka_multiplex/``."""
        self._copy_test_file(f"2-user_info_{setnum}.json", "kafka_multiplex")

    def bpm(self, setnum):
        """Copy ``3-bpm_<setnum>.json`` to ``kafka_multiplex/``."""
        self._copy_test_file(f"3-bpm_{setnum}.json", "kafka_multiplex")

    def workout(self, setnum):
        """Copy ``4-workout_<setnum>.json`` to ``kafka_multiplex/``."""
        self._copy_test_file(f"4-workout_{setnum}.json", "kafka_multiplex")

    def gym_logins(self, setnum):
        """Copy ``5-gym_logins_<setnum>.csv`` to ``gym_logins/``."""
        self._copy_test_file(f"5-gym_logins_{setnum}.csv", "gym_logins")

    def produce_data(self, setnum):
        """Copy every file belonging to test-data set ``setnum``.

        Sets 1 and 2 (``setnum <= 2``) include all five file kinds; sets up
        to 10 include the bpm file only; larger set numbers copy nothing.

        Args:
            setnum: Number of the test-data set to produce.
        """
        import time
        start = time.time()
        print(f"Producing data for set no : {setnum}")
        if setnum <= 2:
            self.registered_users(setnum)
            self.user_info(setnum)
            self.workout(setnum)
            self.gym_logins(setnum)
        if setnum <= 10:
            self.bpm(setnum)
        # Truncate the *difference*, not the current timestamp: truncating
        # "now" first produced fractional and even negative durations.
        elapsed = int(time.time() - start)
        print(f"Test data set {setnum} produced in {elapsed} seconds")

    def _validate_count(self, format, location, expected_count):
        """Assert that the files matching ``location`` hold ``expected_count`` rows.

        Reads ``<landing_zone><location>_*.<format>`` with Spark (the
        ``header`` option is always set to ``True``) and counts the rows.

        Args:
            format: Spark reader format, also used as the file extension.
            location: Path prefix relative to the landing zone, without the
                trailing ``_<setnum>.<format>`` part.
            expected_count: Number of records that must be found.

        Raises:
            AssertionError: If the actual record count differs.
        """
        target = self.landing_zone + f"{location}_*.{format}"
        actual_count = (spark.read.format(format)
                        .option("header", True)
                        .load(target)).count()
        assert actual_count == expected_count, f"Expected {expected_count:,} records, found {actual_count:,} in {location}"
        print(f"Found {actual_count:,} / Expected {expected_count:,} records: Success")

    def validate(self, sets):
        """Check the record counts of all produced files in the landing zone.

        The expected counts are hard-coded: every file kind except bpm has
        one value for ``sets == 1`` and another for any other value, while
        bpm is expected to hold ``sets * 253801`` records.

        Args:
            sets: Number of test-data sets produced so far (presumably the
                highest ``setnum`` passed to ``produce_data`` -- confirm).

        Raises:
            AssertionError: If any count does not match.
        """
        print(f"\nValidating test data {sets} sets...")
        self._validate_count("csv", "registered_users/1-registered_users", 5 if sets == 1 else 10)
        self._validate_count("json", "kafka_multiplex/2-user_info", 7 if sets == 1 else 13)
        self._validate_count("json", "kafka_multiplex/3-bpm", sets * 253801)
        self._validate_count("json", "kafka_multiplex/4-workout", 16 if sets == 1 else 32)
        self._validate_count("csv", "gym_logins/5-gym_logins", 8 if sets == 1 else 16)
        
