In [1]:
import pandas as pd
from datetime import datetime
from random import randint
from faker import Faker

In [2]:
class Generate_data:
    """
    Generates fake "actives" and "viewership" records with a Faker object
    and writes the resulting DataFrames to disk in Parquet format.
    """

    def __init__(self) -> None:
        """
        curr_date : creation time of this object formatted as
                    YYYYMMDD_HHMMSS; appended to every output file name
        """
        self.curr_date = datetime.now().strftime("%Y%m%d_%H%M%S")

    @staticmethod
    def _records_to_dataframe(records: dict) -> pd.DataFrame:
        """
        Converts a dict of {row number: record dict} into a DataFrame
        with one row per record.

        records : dictionary keyed by row number, each value a record dict
        """
        # from_dict() treats the outer keys as columns by default, so the
        # frame is transposed to turn every record into a row.
        return pd.DataFrame.from_dict(records).transpose()

    def generate_active_data(self, number_of_inputs: int, fake: "Faker") -> pd.DataFrame:
        """
        This method generates the given number of records for Actives
        and converts them into a Pandas DataFrame.

        number_of_inputs : Number of records to be generated
        fake             : Faker object

        Returns a DataFrame with one row per generated record
        (an empty DataFrame when number_of_inputs is 0 or negative).
        """
        actives = {
            i: {
                "advertising_id": fake.sha1(),
                "city": fake.city(),
                "location_category": fake.city(),
                "location_granularities": fake.city(),
                "location_source": fake.state(),
                "state": fake.state(),
                # Epoch seconds at the moment the record is generated.
                "timestamp": int(round(datetime.now().timestamp())),
                "user_id": fake.uuid4(),
                # NOTE: the "user_lattitude" spelling is part of the emitted
                # schema and is kept as-is for compatibility.
                "user_lattitude": float(fake.latitude()),
                "user_longitude": float(fake.longitude()),
                "month": str(fake.month()),
                "date": fake.date(),
            }
            for i in range(number_of_inputs)
        }
        return self._records_to_dataframe(actives)

    def generate_viewership_data(self, number_of_inputs: int, fake: "Faker") -> pd.DataFrame:
        """
        This method generates the given number of records for Viewership
        and converts them into a Pandas DataFrame.

        number_of_inputs : Number of records to be generated
        fake             : Faker object

        Returns a DataFrame with one row per generated record
        (an empty DataFrame when number_of_inputs is 0 or negative).
        """
        viewer = {
            i: {
                "advertising_id": fake.sha1(),
                "channel_genre": fake.name(),
                "channel_name": fake.name(),
                "city": fake.city(),
                "device": fake.word(),
                "device_type": fake.word(),
                "duration": int(fake.random_number(digits=3)),
                "grid_id": str(randint(1000, 2000)),
                "language": fake.name(),
                "location_category": fake.city(),
                "location_granularities": fake.city(),
                "location_source": fake.city(),
                # Epoch seconds at the moment the record is generated.
                "record_timestamp": int(round(datetime.now().timestamp())),
                "show_genre": str(fake.random_choices(elements=('Y', 'N'), length=1)[0]),
                "show_name": str(fake.random_choices(elements=('Y', 'N'), length=1)[0]),
                "state": fake.state(),
                "user_lat": float(fake.latitude()),
                "user_long": float(fake.longitude()),
                "month": str(fake.month()),
                "date": fake.date(),
            }
            for i in range(number_of_inputs)
        }
        # Fixed: the frame was previously built from the undefined name
        # `viewerships`, which raised NameError on every call.
        return self._records_to_dataframe(viewer)

    def write_data(self, df: pd.DataFrame, df_name: str) -> None:
        """
        This method calls the function to write the given dataframe into a Parquet file

        df      : Pandas Dataframe
        df_name : Name prefix of the output file
        """
        self.__write_dataframe_to_parquet(df, df_name)

    def __write_dataframe_to_parquet(self, df: pd.DataFrame, df_name: str) -> None:
        """
        This method writes the given dataframe into a Parquet file named
        "<df_name>_<curr_date>.parquet".

        df      : Pandas Dataframe
        df_name : Name prefix of the output file
        """
        df.to_parquet(f"{df_name}_{self.curr_date}.parquet")

## Entry point: generate the actives and viewership data and write each to a Parquet file

In [11]:
if __name__ == "__main__":
    # Inputs shared by both datasets: one Faker object and the number of
    # records to generate for each dataset.
    fake = Faker()
    number_of_inputs = 1000

    # Actives: generate the records, then write them to a Parquet file whose
    # name starts with "actives".
    actives = Generate_data()
    actives_df = actives.generate_active_data(
        number_of_inputs=number_of_inputs,
        fake=fake,
    )
    actives.write_data(df=actives_df, df_name="actives")

    # Viewership: uses its own Generate_data object, created only after the
    # actives file has been written; output name starts with "viewership".
    viewership = Generate_data()
    viewership_df = viewership.generate_viewership_data(
        number_of_inputs=number_of_inputs,
        fake=fake,
    )
    viewership.write_data(df=viewership_df, df_name="viewership")