In [70]:
import csv
import os
from datetime import datetime, timedelta
from typing import List

import numpy as np
from pyspark.sql import SparkSession

In [2]:
# Create the Spark session, or reuse the one that already exists in this
# kernel (getOrCreate), with the 'spark.master' setting pointing at 'local[*]'.
spark = (
    SparkSession.builder
    .appName('generator_synt_data')
    .config('spark.master', 'local[*]')
    .getOrCreate()
)

24/07/30 14:32:15 WARN Utils: Your hostname, themjdex-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
24/07/30 14:32:15 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/30 14:32:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Prefix of the generated CSV file path; to_csv() appends the current date
# and the '-dev.csv' suffix to it.
OUTPUT_PATH = 'output/'
# UTF-8 text file with candidate names, read line by line by the generator.
NAME_DATABASE_PATH = 'utils/names.txt'
# UTF-8 text file with candidate cities, read line by line by the generator.
CITIES_DATABASE_PATH = 'utils/cities.txt'

In [80]:
class SynteticDataGenerator:
    """Generate synthetic user records and export them to a CSV file.

    Every record is a dict with the keys ``id``, ``name``, ``email``,
    ``city``, ``age``, ``salary`` and ``registration_date``. Names and cities
    are sampled from the text files at ``NAME_DATABASE_PATH`` and
    ``CITIES_DATABASE_PATH`` (one entry per line); the files are read lazily
    on the first call to :meth:`collect_data`.

    Args:
        row_count: Number of records produced by each ``collect_data`` call.
        seed: Optional seed for this instance's private random stream. With
            the same seed, the same name/city lists and the same call
            sequence, the sampled values are reproducible. ``None`` (the
            default) seeds from OS entropy.
    """

    # Column order of the generated records and of the CSV file.
    _FIELDNAMES = ['id', 'name', 'email', 'city', 'age', 'salary', 'registration_date']

    # Domains appended to the transliterated name when building an email.
    _EMAIL_DOMAINS = ['@yandex.ru', '@gmail.com', '@yahoo.com']

    # Russian letter -> Latin transcription. Built once at class creation
    # instead of on every transliterate() call. The hard and soft signs have
    # no Latin counterpart and are dropped so that the result stays ASCII.
    _TRANSLIT_TABLE = str.maketrans({
        'А': 'A', 'Б': 'B', 'В': 'V', 'Г': 'G', 'Д': 'D', 'Е': 'E', 'Ё': 'E', 'Ж': 'Zh',
        'З': 'Z', 'И': 'I', 'Й': 'Y', 'К': 'K', 'Л': 'L', 'М': 'M', 'Н': 'N', 'О': 'O',
        'П': 'P', 'Р': 'R', 'С': 'S', 'Т': 'T', 'У': 'U', 'Ф': 'F', 'Х': 'Kh', 'Ц': 'Ts',
        'Ч': 'Ch', 'Ш': 'Sh', 'Щ': 'Shch', 'Ъ': '', 'Ы': 'Y', 'Ь': '', 'Э': 'E',
        'Ю': 'Yu', 'Я': 'Ya',
        'а': 'a', 'б': 'b', 'в': 'v', 'г': 'g', 'д': 'd', 'е': 'e', 'ё': 'e', 'ж': 'zh',
        'з': 'z', 'и': 'i', 'й': 'y', 'к': 'k', 'л': 'l', 'м': 'm', 'н': 'n', 'о': 'o',
        'п': 'p', 'р': 'r', 'с': 's', 'т': 't', 'у': 'u', 'ф': 'f', 'х': 'kh', 'ц': 'ts',
        'ч': 'ch', 'ш': 'sh', 'щ': 'shch', 'ъ': '', 'ы': 'y', 'ь': '', 'э': 'e',
        'ю': 'yu', 'я': 'ya',
    })

    def __init__(self, row_count=10, seed=None):
        self.row_count = row_count
        self.all_data = []
        # Filled lazily by _load_names_and_cities(); None means "not loaded".
        self.names = None
        self.cities = None
        # Private random stream so that a seed makes this instance reproducible
        # without touching NumPy's global random state.
        self._rng = np.random.default_rng(seed)

    @staticmethod
    def _read_lines(path):
        """Return the stripped, non-blank lines of the UTF-8 text file at ``path``."""
        # Read-only mode: the file is never modified, so no write access is needed.
        with open(path, mode='r', encoding='utf-8') as file:
            stripped = (line.strip() for line in file)
            return [line for line in stripped if line]

    def _load_names_and_cities(self):
        """Load the name and city lists from disk unless they are already set."""
        if self.names is None:
            self.names = self._read_lines(NAME_DATABASE_PATH)

        if self.cities is None:
            self.cities = self._read_lines(CITIES_DATABASE_PATH)

    @staticmethod
    def get_random_elem(elems: List[str]) -> str:
        """Return one uniformly chosen element of ``elems``.

        Uses NumPy's global random state. The element is returned as-is (the
        original Python object), and only a random index is drawn, so the
        list is not converted to an array on every call.

        Raises:
            ValueError: If ``elems`` is empty.
        """
        if len(elems) == 0:
            raise ValueError('cannot pick a random element from an empty sequence')
        return elems[np.random.randint(len(elems))]

    def _pick(self, elems):
        """Return one uniformly chosen element of ``elems`` using this instance's RNG."""
        if len(elems) == 0:
            raise ValueError('cannot pick a random element from an empty sequence')
        return elems[int(self._rng.integers(len(elems)))]

    @staticmethod
    def transliterate(text: str) -> str:
        """Replace Russian letters in ``text`` with their Latin transcription.

        Hard and soft signs are removed; every other character that is not a
        Russian letter is kept unchanged.
        """
        return text.translate(SynteticDataGenerator._TRANSLIT_TABLE)

    def collect_data(self):
        """Generate ``row_count`` records and append them to ``self.all_data``.

        Repeated calls accumulate rows. Ids continue from the number of rows
        already collected, so ids (and the id-based email suffix) stay unique
        across calls; on the first call they are ``0 .. row_count - 1``.

        Raises:
            ValueError: If rows are requested but the name or city list is empty.
        """
        self._load_names_and_cities()
        today = datetime.now()
        first_id = len(self.all_data)

        for row_id in range(first_id, first_id + self.row_count):
            name = self._pick(self.names)
            city = self._pick(self.cities)
            email = self.transliterate(name) + str(row_id) + self._pick(self._EMAIL_DOMAINS)

            # Upper bounds are exclusive; int() turns NumPy scalars into
            # plain Python ints.
            age = int(self._rng.integers(18, 96))            # 18..95
            salary = int(self._rng.integers(10000, 50001))   # 10000..50000
            days_ago = int(self._rng.integers(1, 365))       # 1..364 days back
            registration_date = (today - timedelta(days=days_ago)).strftime('%Y-%m-%d')

            self.all_data.append({
                'id': row_id,
                'name': name,
                'email': email,
                'city': city,
                'age': age,
                'salary': salary,
                'registration_date': registration_date,
            })

    def get_data(self):
        """Return the list of all records collected so far."""
        return self.all_data

    def to_csv(self):
        """Append all collected records to ``<OUTPUT_PATH><YYYY-MM-DD>-dev.csv``.

        The target directory is created when missing. The header row is
        written only when the file is new or empty, so appending to an
        existing file does not insert a second header in the middle of it.
        Rows already written by an earlier call are written again.
        """
        date_stamp = datetime.now().strftime('%Y-%m-%d')
        file_path = f'{OUTPUT_PATH}{date_stamp}-dev.csv'

        directory = os.path.dirname(file_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

        with open(file_path, mode='a', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=self._FIELDNAMES, delimiter=',')
            if needs_header:
                writer.writeheader()
            # Write the data rows.
            writer.writerows(self.all_data)

    

In [83]:
# Build a generator configured for 20 rows and fill it with synthetic records.
gen = SynteticDataGenerator(row_count=20)
gen.collect_data()

In [84]:
# Append the collected rows to the dated CSV file whose path starts with OUTPUT_PATH.
gen.to_csv()