En este archivo puedes escribir lo que estimes conveniente. Te recomendamos detallar tu solución y todas las suposiciones que estás considerando. Aquí puedes ejecutar las funciones que definiste en los otros archivos de la carpeta src, medir el tiempo, memoria, etc.

PREPARACION:
1.   Commit inicial del proyecto (hecho con GitHub Desktop)
2.   Instalar Git Flow con `brew install git-flow`
3.   Configurar repositorio con `git flow init`
4.   Configurar que los feature finish se hagan solo en develop con `git config gitflow.feature.finish.keepremote true`
5.   Sincronizar repositorio con GDrive
6.   Leer código desde GDrive con Colab

In [1]:
# Load (or reload) IPython's autoreload extension so that changes to the project's modules are picked up
%reload_ext autoreload

# With mode 2, IPython will try to automatically reload any previously imported module before running a cell
%autoreload 2

In [43]:
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive', force_remount=True)
# Project folder inside the mounted Drive; the pip-install cell reads requirements.txt from here
source_path = '/content/drive/Othercomputers/My Mac/latam-challenge'

Mounted at /content/drive


In [45]:
# Install the libraries listed in requirements.txt using the pip of the running interpreter (sys.executable)
# The purpose is that the notebook can work in any environment
import sys
!{sys.executable} -m pip install -r '{source_path}/requirements.txt'



In [4]:
# Name of the tweets JSON file; it is the argument of the (currently commented-out) q1_time call below
file_path = "farmers-protest-tweets-2021-2-4.json"

In [5]:
#df_prueba = spark.sql('select 1')
#df_prueba.show()

In [6]:
#import q1_time

# Llama a la función q1_time para obtener el resultado
#resultado = q1_time.q1_time(file_path)

# Imprime el resultado
#print(resultado)

In [36]:
from google.colab import auth
from googleapiclient.discovery import build
import io
from googleapiclient.http import MediaIoBaseDownload
from google.cloud import storage
import zipfile

def authenticate_google_drive() -> None:
    """Run Colab's user authentication flow.

    This is a thin wrapper around ``google.colab.auth.authenticate_user()``;
    the call is made unconditionally every time the function is invoked.
    """
    auth.authenticate_user()

def download_file_from_drive(file_id):
    """Fetch a Google Drive file into memory.

    Args:
        file_id: Drive identifier of the file to download.

    Returns:
        An ``io.BytesIO`` holding the file's bytes, positioned at offset 0.
    """
    buffer = io.BytesIO()
    media_request = build('drive', 'v3').files().get_media(fileId=file_id)
    fetcher = MediaIoBaseDownload(buffer, media_request)

    # Pull the file chunk by chunk, reporting progress after each chunk.
    finished = False
    while not finished:
        progress, finished = fetcher.next_chunk()
        print(f'Downloading {int(progress.progress() * 100)}%')

    # Rewind so that callers can read the content from the beginning.
    buffer.seek(0)
    return buffer

def upload_file_to_cloud_storage(downloaded, bucket_name, folder_name):
    """Upload a ZIP stream to Google Cloud Storage as ``<folder_name>/tweets.json.zip``.

    Args:
        downloaded: Binary file-like object with the ZIP content; it is read
            from its current position.
        bucket_name: Name of the destination bucket.
        folder_name: Object-name prefix ("folder") inside the bucket.

    Returns:
        A ``(client, bucket, blob)`` tuple: the storage client, the bucket
        handle and the blob that was just uploaded.
    """
    client = storage.Client()
    bucket = client.bucket(bucket_name)

    # Create the empty "<folder>/" placeholder object only when it is missing.
    # bucket.blob() merely builds a local handle (always truthy), so the
    # existence check has to ask the service through Blob.exists().
    folder_marker = bucket.blob(f"{folder_name}/")
    if not folder_marker.exists():
        folder_marker.upload_from_string('', content_type='application/x-www-form-urlencoded;charset=UTF-8')

    blob = bucket.blob(f'{folder_name}/tweets.json.zip')
    blob.upload_from_file(downloaded, content_type='application/zip')
    print(f'Archivo subido a gs://{bucket_name}/{blob.name}')

    return client, bucket, blob

def decompress_zip_file(downloaded, bucket, folder_name):
    """Extract a ZIP archive and upload every file it contains to a bucket.

    Args:
        downloaded: Seekable binary file-like object holding the ZIP archive.
        bucket: Destination bucket. ``bucket.blob(name)`` must return an
            object with ``upload_from_file``; ``bucket.name`` is used in the
            printed messages.
        folder_name: Object-name prefix under which the members are stored.

    Returns:
        The archive-relative name of the last file uploaded, or ``None`` when
        the stream is not a valid ZIP or the archive contains no files.

    Raises:
        Any error raised while uploading propagates to the caller; only
        ``zipfile.BadZipFile`` is handled here.
    """
    last_uploaded = None
    try:
        with zipfile.ZipFile(downloaded, 'r') as z:
            for file_info in z.infolist():
                # Directory entries carry no data; there is nothing to upload.
                if file_info.is_dir():
                    continue
                blob_name = f'{folder_name}/{file_info.filename}'
                with z.open(file_info) as file:
                    bucket.blob(blob_name).upload_from_file(file)
                # Report each member, using the bucket's name rather than its repr.
                print(f'Archivo descomprimido en gs://{bucket.name}/{blob_name}')
                last_uploaded = file_info.filename
    except zipfile.BadZipFile:
        print(f'El archivo en gs://{bucket.name}/{folder_name}/ no es un archivo ZIP válido.')
    # Returning here (not from a ``finally``) keeps unexpected errors visible.
    return last_uploaded

# Drive -> Cloud Storage pipeline: authenticate, download the ZIP, upload it
# and upload its extracted content; `file_name` is consumed by the BigQuery cell.
# Start from a known state so the cleanup below never touches an unbound name
# (or a stale buffer left over from a previous run of this cell).
downloaded = None
try:
    authenticate_google_drive()
    file_id = '1ig2ngoXFTxP5Pa8muXo02mDTFexZzsis'
    downloaded = download_file_from_drive(file_id)
    client, bucket, blob = upload_file_to_cloud_storage(downloaded, 'tw-gcp-public-lab', 'raw')
    file_name = decompress_zip_file(downloaded, bucket, 'raw')
except Exception as e:
    # Top-level notebook boundary: report the failure instead of a traceback.
    print(f'Error: {e}')
finally:
    # Only close the buffer if the download actually produced one.
    if downloaded is not None:
        downloaded.close()


Downloading 100%
Archivo subido a gs://tw-gcp-public-lab/raw/tweets.json.zip
Archivo descomprimido en gs://<Bucket: tw-gcp-public-lab>/raw/farmers-protest-tweets-2021-2-4.json


In [31]:
from google.cloud import bigquery
from google.api_core.exceptions import NotFound

# Project and dataset information
PROJECT_ID = "tw-techdash"
DATASET_NAME = "tweets_dataset"
TABLE_NAME = "tweets"
BUCKET_NAME = 'tw-gcp-public-lab'
# NOTE: `file_name` is not defined in this cell; it is the value returned by
# decompress_zip_file in the Drive -> Cloud Storage cell, which must run first.
SOURCE_URI = f"gs://{BUCKET_NAME}/raw/{file_name}"

def authenticate_bigquery():
    """Build and return a ``bigquery.Client`` for the project in ``PROJECT_ID``."""
    bq_client = bigquery.Client(project=PROJECT_ID)
    return bq_client

def create_dataset_if_not_exists(client, dataset_name):
    """Create a BigQuery dataset in the client's project unless it already exists.

    Args:
        client: ``bigquery.Client`` used for the lookup and the creation.
        dataset_name: ID of the dataset inside ``client.project``.

    Prints whether the dataset already existed or was created.
    """
    # Client.dataset() is deprecated; build the reference explicitly instead.
    dataset_ref = bigquery.DatasetReference(client.project, dataset_name)
    try:
        client.get_dataset(dataset_ref)
    except NotFound:
        client.create_dataset(dataset_ref)
        print(f"Dataset '{dataset_name}' created.")
    else:
        print(f"Dataset '{dataset_name}' already exists.")

def create_table_if_not_exists(client, dataset_name, table_name, schema):
    """Create a BigQuery table in the client's project unless it already exists.

    Args:
        client: ``bigquery.Client`` used for the lookup and the creation.
        dataset_name: ID of the dataset that holds the table.
        table_name: ID of the table.
        schema: List of ``bigquery.SchemaField`` applied when the table is created.

    Prints whether the table already existed or was created. An existing
    table is left untouched, whatever its current schema is.
    """
    # Client.dataset() is deprecated; build the reference explicitly instead.
    table_ref = bigquery.DatasetReference(client.project, dataset_name).table(table_name)
    try:
        client.get_table(table_ref)
    except NotFound:
        client.create_table(bigquery.Table(table_ref, schema=schema))
        print(f"Table '{table_name}' created.")
    else:
        print(f"Table '{table_name}' already exists.")

# Build the BigQuery client for PROJECT_ID
client = authenticate_bigquery()

# Define table schema: three STRING columns
SCHEMA = [
    bigquery.SchemaField('created_at', 'STRING'),
    bigquery.SchemaField('username', 'STRING'),
    bigquery.SchemaField('text', 'STRING'),
]

# Create dataset if it does not exist
create_dataset_if_not_exists(client, DATASET_NAME)

# Create table if it does not exist
create_table_if_not_exists(client, DATASET_NAME, TABLE_NAME, SCHEMA)

# Load data from Cloud Storage to BigQuery, replacing the table's current rows
job_config = bigquery.LoadJobConfig()
job_config.source_format = bigquery.SourceFormat.NEWLINE_DELIMITED_JSON
job_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE
# NOTE(review): no schema/autodetect/ignore_unknown_values is set on the job;
# presumably the JSON records must match SCHEMA exactly for the load to
# succeed - confirm against the actual file.

load_job = client.load_table_from_uri(
    SOURCE_URI,
    # Client.dataset() is deprecated; build the table reference explicitly.
    bigquery.DatasetReference(client.project, DATASET_NAME).table(TABLE_NAME),
    job_config=job_config,
)
# Wait for the job to finish. result() raises if the load failed, so the
# message below is printed only for a load that actually completed.
load_job.result()
print(f"Data loaded from '{SOURCE_URI}' to table '{TABLE_NAME}'.")


Dataset 'tweets_dataset' already exists.
Table 'tweets' already exists.
Data loaded from 'gs://tw-gcp-public-lab/raw/farmers-protest-tweets-2021-2-4.json' to table 'tweets'.
