In [36]:
! pip install boto3 minio kaggle



In [1]:
import boto3
import os
from minio import Minio
from pathlib import Path

In [2]:
# Build the Kaggle API client that the later cells use to list and download dataset files.
from kaggle.api.kaggle_api_extended import KaggleApi
api = KaggleApi()
# NOTE(review): authenticate() presumably loads credentials from the local Kaggle
# configuration (kaggle.json or environment variables) — confirm for this environment.
api.authenticate()



In [3]:
# Dataset information
# Kaggle dataset identifier passed to the Kaggle API calls in the cells below.
dataset_name = "ryanluong1/valorant-champion-tour-2021-2023-data"
# Top-level dataset folder whose files are selected for upload.
folder_name = "vct_2023"
# NOTE(review): not referenced by any visible cell — get_folder_data stages its
# downloads under /tmp/kaggle_downloads, not under this path. Confirm before removing.
download_path = "./vct_data"

In [4]:
# List the dataset's files; a single page of up to 150 entries is requested.
# NOTE(review): presumably a dataset with more than 150 files would need paging — confirm.
files = api.dataset_list_files(dataset_name, page_size=150).files

In [5]:
# Keep only the entries whose path starts with the configured folder, then display them.
folder_prefix = f"{folder_name}/"
vct_2023_files = [entry.name for entry in files if entry.name.startswith(folder_prefix)]
vct_2023_files

['vct_2023/agents/agents_pick_rates.csv',
 'vct_2023/agents/maps_stats.csv',
 'vct_2023/agents/teams_picked_agents.csv',
 'vct_2023/ids/players_ids.csv',
 'vct_2023/ids/teams_ids.csv',
 'vct_2023/ids/tournaments_stages_match_types_ids.csv',
 'vct_2023/ids/tournaments_stages_matches_games_ids.csv',
 'vct_2023/matches/draft_phase.csv',
 'vct_2023/matches/eco_rounds.csv',
 'vct_2023/matches/eco_stats.csv',
 'vct_2023/matches/kills.csv',
 'vct_2023/matches/kills_stats.csv',
 'vct_2023/matches/maps_played.csv',
 'vct_2023/matches/maps_scores.csv',
 'vct_2023/matches/overview.csv',
 'vct_2023/matches/rounds_kills.csv',
 'vct_2023/matches/scores.csv',
 'vct_2023/matches/team_mapping.csv',
 'vct_2023/matches/win_loss_methods_count.csv',
 'vct_2023/matches/win_loss_methods_round_number.csv',
 'vct_2023/players_stats/players_stats.csv']

In [6]:
# MinIO Configuration
# Connection settings are read from the environment so real credentials never have to
# be committed with the notebook. The fallbacks keep the local-development values, so
# the notebook still runs unchanged when no variables are set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

In [7]:
# Name of the bucket that receives the uploaded dataset files.
s3_bucket = "vct-bucket"

In [8]:
# S3-compatible client pointed at the MinIO server. The endpoint URL is derived from
# MINIO_ENDPOINT so the host/port is configured in exactly one place; the connection
# is plain HTTP (no TLS).
s3_client = boto3.client(
    's3',
    endpoint_url=f'http://{MINIO_ENDPOINT}',
    aws_access_key_id=MINIO_ACCESS_KEY,
    aws_secret_access_key=MINIO_SECRET_KEY,
    use_ssl=False
)

In [9]:
# Make sure the target bucket exists: probe it first and create it only when the
# probe says it is missing.
try:
    s3_client.head_bucket(Bucket=s3_bucket)
    print(f"Bucket '{s3_bucket}' already exists")
except s3_client.exceptions.ClientError as err:
    # Only a "not found" answer means the bucket has to be created. Any other
    # service error (bad credentials, access denied, ...) is re-raised instead of
    # being mistaken for a missing bucket.
    if err.response.get("Error", {}).get("Code") not in ("404", "NoSuchBucket"):
        raise
    s3_client.create_bucket(Bucket=s3_bucket)
    print(f"Created bucket '{s3_bucket}'")

Bucket 'vct-bucket' already exists


In [10]:
def get_folder_data(files, download_dir='/tmp/kaggle_downloads', s3_prefix='raw'):
    """Copy dataset files from Kaggle into the S3 bucket, one file at a time.

    Each file is downloaded to a local staging directory, uploaded under
    ``<s3_prefix>/<file>`` (so the dataset's folder structure is kept in the
    bucket), and the staged copy is removed again.

    Relies on the notebook-level names ``api``, ``dataset_name``, ``s3_client``
    and ``s3_bucket``.

    Args:
        files: Iterable of dataset-relative file paths,
            e.g. ``"vct_2023/agents/agents_pick_rates.csv"``.
        download_dir: Local staging directory for the downloads.
        s3_prefix: Key prefix prepended to every uploaded file; an empty
            prefix uploads the files under their dataset-relative path.

    Returns:
        None. Progress and per-file errors are printed. A file that fails to
        download or upload is reported and skipped, so the remaining files
        are still processed.
    """
    staging_root = Path(download_dir)
    key_prefix = s3_prefix.strip('/')

    for file in files:
        remote_path = Path(file)
        # Mirror the dataset's folder layout locally so that equally named files
        # from different folders can never be confused with each other.
        local_dir = staging_root / remote_path.parent
        local_file_path = local_dir / remote_path.name

        # Maintain the full folder structure in S3
        # e.g., "raw/vct_2023/agents/agents_pick_rates.csv"
        s3_key = f"{key_prefix}/{file}" if key_prefix else file

        try:
            # Download file locally
            local_dir.mkdir(parents=True, exist_ok=True)
            api.dataset_download_file(
                dataset_name,
                file_name=file,
                path=str(local_dir)
            )

            with open(local_file_path, 'rb') as f:
                s3_client.upload_fileobj(
                    f,
                    s3_bucket,
                    s3_key
                )
            print(f"Uploaded to S3: {s3_key}")

        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error transferring {file} to S3: {e}")

        finally:
            # Never leave a staged copy behind, whether or not the upload succeeded.
            if local_file_path.exists():
                local_file_path.unlink()

In [11]:
# Transfer every selected vct_2023 file from Kaggle into the bucket (keys start with "raw/").
get_folder_data(vct_2023_files)

Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/vct_2023/agents/agents_pick_rates.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/vct_2023/agents/maps_stats.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/vct_2023/agents/teams_picked_agents.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/vct_2023/ids/players_ids.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/vct_2023/ids/teams_ids.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/vct_2023/ids/tournaments_stages_match_types_ids.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploa

In [12]:
# Select the files inside the "all_ids/" folder. The prefix includes the trailing
# slash so that only that folder matches, not any other path that merely starts
# with the same characters.
id_files = [f.name for f in files if f.name.startswith("all_ids/")]
id_files

['all_ids/all_matches_games_ids.csv',
 'all_ids/all_players_ids.csv',
 'all_ids/all_teams_ids.csv',
 'all_ids/all_teams_mapping.csv',
 'all_ids/all_tournaments_stages_match_types_ids.csv']

In [13]:
# Transfer the all_ids files from Kaggle into the bucket (keys start with "raw/").
get_folder_data(id_files)

Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/all_ids/all_matches_games_ids.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/all_ids/all_players_ids.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/all_ids/all_teams_ids.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/all_ids/all_teams_mapping.csv
Dataset URL: https://www.kaggle.com/datasets/ryanluong1/valorant-champion-tour-2021-2023-data
Uploaded to S3: raw/all_ids/all_tournaments_stages_match_types_ids.csv
