# Purpose
Download the dataset files from Kaggle, unzip them, and convert them to a smaller format.

## Imports

In [1]:
import kaggle
from pathlib import Path
import yaml
import gzip 
import pickle
import pandas as pd
import numpy as np

# Config Loader

In [2]:
# Read the notebook settings from config.yaml. The context manager closes the
# file handle as soon as parsing is done instead of leaving it open.
with open("config.yaml") as config_file:
    config = yaml.safe_load(config_file)

data_path_str = config["data_path"]
cleaned_data_path_str = config["cleaned_data_path"]
redownload = config["redownload"]

# Create the output directory for the cleaned files (including any missing
# parent directories) so later cells can write into it.
cleaned_data_path = Path(cleaned_data_path_str)
cleaned_data_path.mkdir(parents=True, exist_ok=True)

# Data Download

## Kaggle Download

In [3]:
kaggle.api.authenticate()

# Download only when the raw-data directory is missing or a refresh was
# requested through the `redownload` config flag.
data_path = Path(data_path_str)
if not data_path.exists() or redownload:
    # parents=True so a nested data_path works on a fresh checkout, matching
    # how the cleaned-data directory is created.
    data_path.mkdir(parents=True, exist_ok=True)
    kaggle.api.dataset_download_files('vbmokin/russian-invasion-ukraine-without-retweets', path=data_path_str, unzip=True)

## Data loader utils

In [14]:
def open_gzpickle(path):
    """Load a gzip-compressed pickle from ``path`` and return the unpickled object.

    The file is opened with ``gzip.open`` in binary mode and handed to
    ``pandas.read_pickle``. Unpickling can execute arbitrary code, so only
    use this on files from a source you trust.
    """
    with gzip.open(path, mode='rb') as compressed_stream:
        unpickled = pd.read_pickle(compressed_stream)
    return unpickled
def process_pickled_frame(df, fn, keep_columns=("text", "tweetcreatedts", "tweetid")):
    """Write a slimmed-down copy of ``df`` to ``cleaned_data_path / (fn + ".pqt")``.

    Only the columns named in ``keep_columns`` are written; names that are not
    present in ``df`` are ignored. The input frame is left untouched (no
    in-place drop), so calling this never changes the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to slim down and store.
    fn : str
        Base name of the output file; ``".pqt"`` is appended.
    keep_columns : iterable of str, optional
        Column names to retain. Defaults to the text, timestamp and id columns.

    Returns
    -------
    bool
        Always ``True`` once the file has been written.
    """
    wanted = set(keep_columns)
    # A boolean mask keeps the original column order and does not mutate df.
    keep_mask = [col in wanted for col in df.columns]
    slim_frame = df.loc[:, keep_mask]
    slim_frame.to_parquet(cleaned_data_path / (fn + ".pqt"), compression="lz4")
    return True

# Pickled Frames to Parquet

In [15]:
# Sort the matches so the files are processed (and logged) in a deterministic
# order; glob results otherwise come back in filesystem-dependent order.
pickle_paths = sorted(Path(data_path_str).glob("[0-9][0-9][0-9][0-9]*.gzip"))

# Lazy generator: a frame is only loaded when the loop asks for it. The first
# four characters of the file stem become the output file name.
pickles_to_process = ((fp.stem[0:4], open_gzpickle(fp)) for fp in pickle_paths)
for fn, df in pickles_to_process:
    process_pickled_frame(df, fn)
    del df  # drop the reference so the frame can be freed before the next load
    print(fn, " processed")

0401  processed
0402  processed
0403  processed
0404  processed
0405  processed
0406  processed
0407  processed
0408  processed
0409  processed
0410  processed
0411  processed
0412  processed
0413  processed
0414  processed
0415  processed
0416  processed
0417  processed
0418  processed
0419  processed
0420  processed
0421  processed
0422  processed
0423  processed
0424  processed
0425  processed
0426  processed
0427  processed
0428  processed
0429  processed
0430  processed
0501  processed
0502  processed
0503  processed
0504  processed
0505  processed
0508  processed
0509  processed
0510  processed
0511  processed
0512  processed
0513  processed
0514  processed
0515  processed
0516  processed
0517  processed
0518  processed
0519  processed
0520  processed
0521  processed
0522  processed
0523  processed
0524  processed
0525  processed
0526  processed
0527  processed
0528  processed
0529  processed
0530  processed
0531  processed
0601  processed
0602  processed
0603  processed
0604  pr