In [15]:
from fastavro import parse_schema, writer

# Rest of your code...
import pandas
import csv

In [2]:
schema = {
    "doc": "Spotify Wrapped Data Feed - User Interactions",
    "name": "UserInteraction",
    "namespace": "com.spotify.wrapped",
    "type": "record",
    "fields": [
        {"name": "id", "type": "long"},
        {"name": "timestamp", "type": [
            "string", "null"], "logicalType": "timestamp-millis"},
        {"name": "action", "type": ["string", "null"]},
        {"name": "song_id", "type": ["long", "null"]},
        {"name": "user_id", "type": ["long", "null"]}
    ]
}


parsed_schema = parse_schema(schema)

parsed_schema

{'type': 'record',
 'doc': 'Spotify Wrapped Data Feed - User Interactions',
 'name': 'com.spotify.wrapped.UserInteraction',
 'fields': [{'name': 'id', 'type': 'long'},
  {'logicalType': 'timestamp-millis',
   'name': 'timestamp',
   'type': ['string', 'null']},
  {'name': 'action', 'type': ['string', 'null']},
  {'name': 'song_id', 'type': ['long', 'null']},
  {'name': 'user_id', 'type': ['long', 'null']}],
 '__fastavro_parsed': True,
 '__named_schemas': {'com.spotify.wrapped.UserInteraction': {'type': 'record',
   'doc': 'Spotify Wrapped Data Feed - User Interactions',
   'name': 'com.spotify.wrapped.UserInteraction',
   'fields': [{'name': 'id', 'type': 'long'},
    {'logicalType': 'timestamp-millis',
     'name': 'timestamp',
     'type': ['string', 'null']},
    {'name': 'action', 'type': ['string', 'null']},
    {'name': 'song_id', 'type': ['long', 'null']},
    {'name': 'user_id', 'type': ['long', 'null']}]}}}

In [7]:
user_schema = {
    "doc": "Spotify Wrapped Data Feed - User Info",
    "name": "User",
    "namespace": "com.spotify.wrapped",
    "type": "record",
    "fields": [
        {"name": "user_id", "type": "long"},
        {"name": "username", "type": "string"},
        {"name": "location", "type": "string"},
        {"name": "birthdate", "type": "string", "logicalType": "date"},
        {"name": "gender", "type": "string", "symbols": ["M", "F", "O"]},
        {"name": "favorite_genre", "type": {
            "type": "array", "items": "string"}, "default": []}
    ]
}

parsed_user_schema = parse_schema(user_schema)
print(parsed_user_schema)

{'type': 'record', 'doc': 'Spotify Wrapped Data Feed - User Info', 'name': 'com.spotify.wrapped.User', 'fields': [{'name': 'user_id', 'type': 'long'}, {'name': 'username', 'type': 'string'}, {'name': 'location', 'type': 'string'}, {'logicalType': 'date', 'name': 'birthdate', 'type': 'string'}, {'symbols': ['M', 'F', 'O'], 'name': 'gender', 'type': 'string'}, {'default': [], 'name': 'favorite_genre', 'type': {'type': 'array', 'items': 'string'}}], '__fastavro_parsed': True, '__named_schemas': {'com.spotify.wrapped.User': {'type': 'record', 'doc': 'Spotify Wrapped Data Feed - User Info', 'name': 'com.spotify.wrapped.User', 'fields': [{'name': 'user_id', 'type': 'long'}, {'name': 'username', 'type': 'string'}, {'name': 'location', 'type': 'string'}, {'logicalType': 'date', 'name': 'birthdate', 'type': 'string'}, {'symbols': ['M', 'F', 'O'], 'name': 'gender', 'type': 'string'}, {'default': [], 'name': 'favorite_genre', 'type': {'type': 'array', 'items': 'string'}}]}}}


In [10]:
track_schema = {
    "doc": "Spotify Wrapped Data Feed - Track Info",
    "name": "Track",
    "namespace": "com.spotify.wrapped",
    "type": "record",
    "fields": [
            {"name": "track_id", "type": "long"},
            {"name": "duration", "type": "int"},  # (in seconds)
            {"name": "artist", "type": "string"},
            {"name": "genre", "type": "string"}
    ]
}

parsed_track_schema = parse_schema(track_schema)
print(parsed_track_schema)

{'type': 'record', 'doc': 'Spotify Wrapped Data Feed - Track Info', 'name': 'com.spotify.wrapped.Track', 'fields': [{'name': 'track_id', 'type': 'long'}, {'name': 'duration', 'type': 'int'}, {'name': 'artist', 'type': 'string'}, {'name': 'genre', 'type': 'string'}], '__fastavro_parsed': True, '__named_schemas': {'com.spotify.wrapped.Track': {'type': 'record', 'doc': 'Spotify Wrapped Data Feed - Track Info', 'name': 'com.spotify.wrapped.Track', 'fields': [{'name': 'track_id', 'type': 'long'}, {'name': 'duration', 'type': 'int'}, {'name': 'artist', 'type': 'string'}, {'name': 'genre', 'type': 'string'}]}}}


In [19]:
import csv
from avro import schema, datafile, io

# Function to parse Avro schema


def parse_schema(schema_json):
    return schema.Parse(schema_json)


# Define Avro schema
track_schema = {
    "doc": "Spotify Wrapped Data Feed - Track Info",
    "name": "Track",
    "namespace": "com.spotify.wrapped",
    "type": "record",
    "fields": [
            {"name": "track_id", "type": "long"},
            {"name": "duration", "type": "int"},  # (in seconds)
            {"name": "artist", "type": "string"},
            {"name": "genre", "type": "string"}
    ]
}

# Parse Avro schema
parsed_track_schema = parse_schema(track_schema)

# Open dataset file and Avro output file
with open('/Users/yassine/Desktop/IE/4th year/2nd sem/stream analytics/datasets/tracks.csv', 'r') as dataset_file, \
        open('track.avro', 'wb') as avro_output_file:

    # Initialize CSV reader
    csv_reader = csv.DictReader(dataset_file)

    # Initialize Avro writer
    writer = datafile.DataFileWriter(
        avro_output_file, io.DatumWriter(), parsed_track_schema)

    # Iterate through dataset rows
    for row in csv_reader:
        # Map dataset columns to Avro fields
        track_id = int(row['id'])
        # Convert milliseconds to seconds
        duration = int(int(row['duration_ms']) / 1000)
        artist = row['artists']
        genre = ''  # You might not have this information in your dataset

        # Write data to Avro file
        writer.append({"track_id": track_id, "duration": duration,
                      "artist": artist, "genre": genre})

    # Close Avro writer
    writer.close()

print("Data has been successfully populated into Avro file.")

TypeError: writer() takes at least 3 positional arguments (2 given)

In [17]:
parsed_track_schema = parse_schema(track_schema)

# Input CSV file and Avro output file
csv_file_path = '/Users/yassine/Desktop/IE/4th year/2nd sem/stream analytics/datasets/tracks.csv'
avro_file_path = 'tracks.avro'

with open(csv_file_path, 'r', newline='', encoding='utf-8') as csv_file, \
        open(avro_file_path, 'wb') as avro_file:
    csv_reader = csv.DictReader(csv_file)

    # Write Avro records
    avro_writer = writer(avro_file, parsed_track_schema,
                         codec='snappy')  # Added codec argument
    avro_writer.write_header()

    for row in csv_reader:
        avro_record = {
            "track_id": int(row["id"]),
            # Convert milliseconds to seconds
            "duration": int(row["duration_ms"]) // 1000,
            "artist": row["artists"],
            "genre": row["genre"]
        }
        avro_writer.write(avro_record)

TypeError: writer() takes at least 3 positional arguments (2 given)