# VCT Data Mining
---


In [None]:
%%configure
{
    "number_of_workers": 2
}

## Downloading Data

This step isn't always necessary, but it usually is: because Python downloads this data to the local instance, the files are erased when the instance is terminated after a period of inactivity.

If you open the file explorer and the folder `/content/vct-international/` is not there, this section needs to be run.

In [None]:
import requests
import gzip
import shutil
import time
import os
import logging
import sys
import os.path
import requests
import boto3
from collections import deque
from io import BytesIO
from enum import Enum, auto
from tqdm.notebook import tqdm
from os import listdir

# Base URL of the source bucket; download_gzip_and_write_to_s3 appends
# "/<file_name>.json.gz" to it to build each download URL.
S3_BUCKET_URL = "https://vcthackathon-data.s3.us-west-2.amazonaws.com"

# League whose files are fetched; used as the leading path segment of every key.
# Valid values: (game-changers, vct-international, vct-challengers)
# NOTE: the download functions read this as a module-level global, and the
# driver loop further down reassigns it before each run.
LEAGUE = "vct-international"

# Season used in the "<LEAGUE>/games/<YEAR>/" part of the game-file keys.
# Valid values: (2022, 2023, 2024)
# NOTE: also reassigned by the driver loop further down.
YEAR = 2022


In [5]:
def download_gzip_and_write_to_s3(file_name, target_bucket, timeout=(10, 120)):
    """Copy one gzipped JSON file from the source bucket into S3, decompressed.

    The file is fetched from ``{S3_BUCKET_URL}/{file_name}.json.gz`` and stored
    in ``target_bucket`` under the key ``{file_name}.json``. Nothing is
    downloaded when that key already exists in ``target_bucket``.

    Args:
        file_name: Key of the file without the ``.json.gz`` / ``.json`` suffix.
        target_bucket: Name of the S3 bucket that receives the decompressed JSON.
        timeout: ``(connect, read)`` timeout in seconds handed to ``requests.get``
            so that a stalled transfer cannot block the notebook forever.

    Returns:
        True if the file was downloaded and uploaded; False if it already
        existed, was not found at the source (HTTP 404), or the download
        failed with any other HTTP status.
    """
    object_key = f"{file_name}.json"

    # Check if the file already exists in the target S3 bucket
    try:
        s3_client.head_object(Bucket=target_bucket, Key=object_key)
        return False  # File already exists in the S3 bucket
    except s3_client.exceptions.ClientError as error:
        error_code = error.response.get("Error", {}).get("Code", "")
        if error_code not in ("404", "NoSuchKey", "NotFound"):
            # Anything other than "not found" (e.g. access denied, throttling)
            # is not proof that the object is missing. Make it visible, then
            # fall through and attempt the download as before.
            print(
                f"Could not check {object_key} in {target_bucket} "
                f"({error_code}); attempting download anyway"
            )

    remote_file = f"{S3_BUCKET_URL}/{file_name}.json.gz"

    # stream=True defers the body download until .content is read, so nothing
    # is transferred for non-200 answers; the `with` block guarantees the
    # connection is released on every path.
    with requests.get(remote_file, stream=True, timeout=timeout) as response:
        if response.status_code == 404:
            # Ignore
            return False
        if response.status_code != 200:
            print(response)
            print(f"Failed to download {file_name}")
            return False
        compressed = response.content

    # Upload the unzipped content to the new S3 bucket
    s3_client.put_object(
        Bucket=target_bucket,
        Key=object_key,
        Body=gzip.decompress(compressed),
    )
    return True


def download_esports_files():
    """Copy the esports metadata files of the current LEAGUE into the target bucket.

    For each of the five metadata files under ``{LEAGUE}/esports-data`` this
    calls ``download_gzip_and_write_to_s3``; the per-file result is ignored.
    Reads the module-level ``LEAGUE`` and returns nothing.
    """
    print(f"Downloading esports files for {LEAGUE}...")

    # Define your target S3 bucket
    destination_bucket = "actualvctdata"

    # Key prefix shared by all metadata files of this league.
    source_prefix = f"{LEAGUE}/esports-data"
    metadata_keys = [
        f"{source_prefix}/{stem}"
        for stem in ["leagues", "tournaments", "players", "teams", "mapping_data"]
    ]

    # The progress bar is intentionally switched off (disable=True).
    for metadata_key in tqdm(metadata_keys, disable=True):
        download_gzip_and_write_to_s3(metadata_key, destination_bucket)

    print("Done downloading esports files")


import boto3
import json
from tqdm import tqdm

# Initialize the S3 client
# Module-level client shared by download_gzip_and_write_to_s3 and download_games,
# which look it up as a global at call time. No region or credentials are passed
# explicitly here.
s3_client = boto3.client('s3')

def download_games():
    """Download every game file listed in the league's mapping data for YEAR.

    Reads ``{LEAGUE}/esports-data/mapping_data.json`` from the target bucket,
    then, for each entry, asks ``download_gzip_and_write_to_s3`` to copy
    ``{LEAGUE}/games/{YEAR}/{platformGameId}`` into the same bucket. Uses the
    module-level ``LEAGUE``, ``YEAR`` and ``s3_client``.

    Returns:
        The number of game files that were newly downloaded, or None when the
        mapping file could not be loaded (the reason is printed).
    """
    print(f"Downloading game files for {LEAGUE}...")

    # Define your S3 bucket
    bucket = "actualvctdata"

    # File key in the source bucket
    mapping_file_key = f"{LEAGUE}/esports-data/mapping_data.json"

    # Download the mapping_data.json from the source S3 bucket
    try:
        mapping_data_object = s3_client.get_object(Bucket=bucket, Key=mapping_file_key)
        mappings_data = json.loads(mapping_data_object['Body'].read())
    except s3_client.exceptions.NoSuchKey:
        print(f"File {mapping_file_key} not found in bucket {bucket}")
        return
    except Exception as e:
        # Best effort: report the problem and give up on this league/year
        # instead of aborting the whole notebook run.
        print(f"Error occurred: {str(e)}")
        return

    game_counter = 0

    # Loop through the mappings data to download game files
    for esports_game in tqdm(mappings_data, disable=True):
        s3_game_file = f"{LEAGUE}/games/{YEAR}/{esports_game['platformGameId']}"

        # The helper returns True only when a file was actually transferred;
        # already-present and missing (404) files yield False.
        if download_gzip_and_write_to_s3(s3_game_file, bucket):
            game_counter += 1

    # Surface the tally so a run that fetched nothing is distinguishable
    # from one that fetched many files.
    print(f"Downloaded {game_counter} new game files")
    print("Done downloading game files")
    return game_counter





In [None]:
# Initialize the S3 client
s3_client = boto3.client('s3')

# Leagues to process in this run.
# Valid values: (game-changers, vct-international, vct-challengers)
league_list = ["game-changers", "vct-challengers"]

# Seasons to process for each league.
# Valid values: (2022, 2023, 2024)
year_list = [2022, 2023, 2024]

for league in league_list:
    # The download functions read the module-level LEAGUE / YEAR globals,
    # so these must be assigned before each call.
    LEAGUE = league

    # The esports metadata keys depend only on LEAGUE, not on YEAR, so they
    # are fetched once per league rather than once per (league, year) pair.
    download_esports_files()

    for year in year_list:
        YEAR = year
        print("Starting download of", LEAGUE, "with year", YEAR)
        download_games()
        print("Done downloading", LEAGUE, "with year", YEAR)

In [1]:
import re
import boto3
# Bucket whose objects are renamed by the call at the end of this cell; the
# same bucket name is hard-coded inside the download functions.
bucket = "actualvctdata"
def bulk_rename_s3_files(bucket_name, prefix=''):
    """Strip the colon from matching ``.json`` object keys under ``prefix``.

    Lists every object in ``bucket_name`` whose key starts with ``prefix``.
    A key is rewritten when its last path segment has the form
    ``<something>:<something>.json`` and the key contains at least one ``/``;
    all other keys are skipped. Each rewrite is done as a copy to the new key
    followed by a delete of the old key, and is reported with a
    ``Renamed: old -> new`` line.

    Args:
        bucket_name: Bucket to list and modify.
        prefix: Key prefix that limits which objects are listed.
    """
    client = boto3.client('s3')

    # Matches keys like
    # vct-international/games/2022/val:004b09b1-4dc9-4185-baff-9b1c66b3ef99.json
    # capturing (directory, part before the colon, part after the colon).
    colon_key = re.compile(r'(.*/)([^/]+):([^/]+\.json)$')

    # The paginator walks through the full listing page by page.
    listing = client.get_paginator('list_objects_v2').paginate(
        Bucket=bucket_name, Prefix=prefix
    )

    for listing_page in listing:
        # A page without a 'Contents' entry contributes no objects.
        for entry in listing_page.get('Contents', []):
            source_key = entry['Key']

            hit = colon_key.match(source_key)
            if hit is None:
                continue

            # Gluing the three captured parts back together drops the colon.
            target_key = "".join(hit.groups())

            # Copy first, delete second, so the old object is only removed
            # after the copy call has returned.
            client.copy_object(
                Bucket=bucket_name,
                CopySource={'Bucket': bucket_name, 'Key': source_key},
                Key=target_key
            )
            client.delete_object(Bucket=bucket_name, Key=source_key)

            print(f"Renamed: {source_key} -> {target_key}")

# Example usage
# Only keys under this one league/year prefix are processed by the call below;
# run it again with a different prefix to cover other leagues or years.
prefix = 'vct-challengers/games/2023/'  # Adjust this to match your bucket structure

# Modifies the bucket: each matching object is copied to its new key and the
# old key is deleted.
bulk_rename_s3_files(bucket, prefix)

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.5 
Trying to create a Glue session for the kernel.
Session Type: glueetl
Session ID: 50a9aaeb-b0ef-4e10-9cae-3c080c21cf26
Applying the following default arguments:
--glue_kernel_version 1.0.5
--enable-glue-datacatalog true
Waiting for session 50a9aaeb-b0ef-4e10-9cae-3c080c21cf26 to get into ready status...
Session 50a9aaeb-b0ef-4e10-9cae-3c080c21cf26 has been created.
Renamed: vct-challengers/games/2023/val:80f6e7cd-eda7-4a70-9074-5505d36e9eaf.json -> vct-challengers/games/2023/val80f6e7cd-eda7-4a70-9074-5505d36e9eaf.json
Renamed: vct-challengers/games/2023/val:80f953b5-4eba-40c3-8d4f-5a0ee6f146ee.json -> vct-challengers/games/2023/val80f953b5-