In [1]:
!pip install pyspark



In [2]:
!pip install install-jdk

Collecting install-jdk
  Downloading install_jdk-1.1.0-py3-none-any.whl.metadata (12 kB)
Downloading install_jdk-1.1.0-py3-none-any.whl (15 kB)
Installing collected packages: install-jdk
Successfully installed install-jdk-1.1.0


In [3]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [12]:
import datetime
import random

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr
from pyspark.sql.types import (
    DoubleType,
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Generate synthetic purchase records, preview them, and write them out as CSV.
#
# Fixed seed so that Restart & Run All regenerates the same random draws.
# The dates still move with the calendar because they are relative to today.
SEED = 42
rng = random.Random(SEED)

num_records = 5000
products = ["Laptop", "Tablet", "Smartphone", "Headphones", "Smartwatch"]
date_start = datetime.date.today() - datetime.timedelta(days=365)

schema = StructType([
    StructField("Дата", StringType(), True),
    StructField("UserID", IntegerType(), True),
    StructField("Продукт", StringType(), True),
    StructField("Количество", IntegerType(), True),
    # DoubleType instead of FloatType: a 32-bit float carries only about 7
    # significant digits, so two-decimal prices up to 300000.00 were being
    # silently rounded when stored. Python floats are 64-bit already.
    StructField("Цена", DoubleType(), True),
])


def _random_record(rng):
    """Build one (date, user_id, product, quantity, price) tuple.

    Reads the module-level ``date_start``, ``products`` and ``num_records``.
    The date is returned as an ISO-8601 string to match the StringType column.
    """
    # Offsets 0..364 cover the window from 365 days ago up to yesterday.
    date = date_start + datetime.timedelta(days=rng.randint(0, 364))
    # NOTE(review): randint is inclusive on both ends, so user ids go up to
    # num_records + 1 and quantities up to 11. Kept as in the original;
    # confirm whether 1..num_records and 1..10 were intended.
    user_id = rng.randint(1, num_records + 1)
    product = rng.choice(products)
    quantity = rng.randint(1, 11)
    price = round(rng.uniform(10000, 300000), 2)
    return (date.isoformat(), user_id, product, quantity, price)


data = [_random_record(rng) for _ in range(num_records)]

spark = (
    SparkSession.builder
    .appName("Synthetic Data Generator")
    .getOrCreate()
)

# try/finally guarantees the session is stopped even if building, showing,
# or writing the DataFrame raises.
try:
    df = spark.createDataFrame(data, schema)

    df.show()

    # Spark writes a directory named "synthetic_data.csv" containing part
    # files, not a single CSV file; mode='overwrite' replaces a previous run.
    df.write.csv("synthetic_data.csv", header=True, mode='overwrite')
finally:
    spark.stop()

+----------+------+----------+----------+---------+
|      Дата|UserID|   Продукт|Количество|     Цена|
+----------+------+----------+----------+---------+
|2024-06-02|  4331|    Tablet|         5| 137616.0|
|2024-04-21|  4601|Smartphone|        11|281648.25|
|2024-08-08|  1558|Smartwatch|         1|108357.43|
|2024-06-25|   494|Headphones|         2| 49200.35|
|2024-08-27|  1551|    Laptop|         3|200305.97|
|2024-06-13|  1976|Smartphone|         1| 44878.88|
|2024-07-08|  2042|Smartphone|         2|281457.16|
|2024-03-09|  1892|    Laptop|         3|297283.88|
|2024-07-04|  3331|Smartphone|         2|193796.95|
|2024-05-15|  4880|Smartwatch|        10|280666.03|
|2024-09-10|  4192|Headphones|         4| 10165.82|
|2023-11-15|   512|Headphones|         3| 230165.4|
|2024-03-24|  1190|    Tablet|         5|127525.04|
|2024-09-25|   262|Headphones|        11| 54640.38|
|2024-06-21|  2142|    Laptop|         6|116605.12|
|2024-06-18|  3483|Smartwatch|         4| 44489.03|
|2023-12-10|