Google BigQuery is a distributed data warehouse built on a serverless architecture. We’ll discuss this framework in class. In this task you’ll upload all Wedge transaction records to Google BigQuery. You’ll want to make sure that the column data types are correctly specified and that you’ve properly handled the null values.
The requirements for this task change depending on the grade you’re going for. 
Note: this assignment can be done manually or programmatically. Naturally I’d prefer it be done programmatically so that you get more practice, but that’s not required to get full credit.

1. Clean the data
    a. I need to split on the delimiter
    b. check for a header (and add it if it doesn't have one)
    c. fix the \\N, \N, and NULL values - keep as NULL
    d. Split them into single month dataframes
2. Upload to GBQ
    a. Upload each one as a separate table in a new dataset in my GBQ project


In [7]:
import os
import pandas as pd
import zipfile
import csv
import io

from zipfile import ZipFile
from io import TextIOWrapper
from shutil import move

In [4]:
# Collect the zip archives found in the WedgeZipOfZips folder.
# Entries that are not valid zip files (hidden OS files, sub-folders, ...) are
# skipped so the ZipFile() calls in the following cells do not raise
# BadZipFile. os.listdir() returns names in arbitrary order, so the result is
# sorted to make every run process the archives in the same order.
zip_files = sorted(
    entry
    for entry in os.listdir("WedgeZipOfZips")
    if zipfile.is_zipfile(os.path.join("WedgeZipOfZips", entry))
)

In [None]:
# Maps each zipped file name to the field delimiter detected in its first line.
delimiters = {}
semicolon_folder = "semicolon_folder"


def _detect_delimiter(first_line, candidates=",;\t"):
    """Return the field delimiter used in ``first_line``.

    ``csv.Sniffer`` is tried first. When it cannot decide (it raises
    ``csv.Error``, e.g. for an empty line), the candidate that occurs most
    often in the line is returned instead; ties, including "no candidate
    occurs at all", resolve to the first candidate, which is ",".
    """
    try:
        return csv.Sniffer().sniff(sample=first_line, delimiters=candidates).delimiter
    except csv.Error:
        # max() returns the first maximal element, so "," wins every tie.
        return max(candidates, key=first_line.count)


# Start by reading in all the files again.
for this_zf in zip_files:
    with ZipFile("WedgeZipOfZips/" + this_zf, 'r') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            # The context manager closes the wrapper (and the underlying zip
            # member) even if decoding or sniffing raises.
            with io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8") as input_file:
                # Read the first line to detect the delimiter
                first_line = input_file.readline()
                detected_delimiter = _detect_delimiter(first_line)

                # Record the delimiter the file really uses, so that the rows
                # below (and any later cell) split on the right character.
                delimiters[file_name] = detected_delimiter

                # Reset the file back to the beginning for further processing
                input_file.seek(0)

                # Now, you can process the file using the appropriate delimiter
                for line in input_file:
                    # Process the data rows here
                    data = line.strip().split(delimiters[file_name])

                print(f"{file_name}: has delimiter: {detected_delimiter}")

            # Copy semicolon-delimited files into 'semicolon_folder'.
            # ZipFile.extract() writes the member straight into that folder
            # (creating it if needed), so no separate move step is required.
            if detected_delimiter == ";":
                zf.extract(file_name, semicolon_folder)

In [None]:

# Make sure the destination folder for semicolon-delimited files exists.
semicolon_folder = "semicolon_folder"
os.makedirs(semicolon_folder, exist_ok=True)

# Maps each zipped file name to the field delimiter detected in its first line.
# It is (re)created here so this cell can run on its own; the loop below fills
# in an entry for every file of every archive, so nothing is lost by resetting.
delimiters = {}


def _detect_delimiter(first_line, candidates=",;\t"):
    """Return the field delimiter used in ``first_line``.

    ``csv.Sniffer`` is tried first. When it cannot decide (it raises
    ``csv.Error``, e.g. for an empty line), the candidate that occurs most
    often in the line is returned instead; ties, including "no candidate
    occurs at all", resolve to the first candidate, which is ",".
    """
    try:
        return csv.Sniffer().sniff(sample=first_line, delimiters=candidates).delimiter
    except csv.Error:
        # max() returns the first maximal element, so "," wins every tie.
        return max(candidates, key=first_line.count)


# Start by reading in all the files again.
for this_zf in zip_files:
    with ZipFile("WedgeZipOfZips/" + this_zf, 'r') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            # Open the zip member as text; closing the wrapper also closes the
            # underlying binary stream.
            with io.TextIOWrapper(zf.open(file_name), encoding="utf-8") as input_file:
                # Read the first line to detect the delimiter
                first_line = input_file.readline()
                detected_delimiter = _detect_delimiter(first_line)

                # Record the delimiter the file really uses, so that the rows
                # below (and any later cell) split on the right character.
                delimiters[file_name] = detected_delimiter

                # Reset the file back to the beginning for further processing
                input_file.seek(0)

                # Now, you can process the file using the appropriate delimiter
                for line in input_file:
                    # Process the data rows here
                    data = line.strip().split(delimiters[file_name])

                print(f"{file_name}: has delimiter: {detected_delimiter}")

                # Copy semicolon-delimited files into 'semicolon_folder'.
                # ZipFile.extract() already writes the member into that
                # folder, so no separate move step is required.
                if detected_delimiter == ";":
                    zf.extract(file_name, semicolon_folder)


In [None]:
# Maps each zipped file name to True when its first line looks like a header.
headers = {}
no_headers_folder = 'no_headers'

# Column names whose presence in the first line marks it as a header row
# (customize this list if the check needs to change).
header_keywords = ('datetime', 'register_no', 'description', 'trans_status', 'quantity')

for this_zf in zip_files:
    with ZipFile("WedgeZipOfZips/" + this_zf, 'r') as zf:
        zipped_files = zf.namelist()

        for file_name in zipped_files:
            # Only the first line is needed; the context manager closes the
            # zip member even if decoding it raises.
            with io.TextIOWrapper(zf.open(file_name, 'r'), encoding="utf-8") as input_file:
                first_line = input_file.readline()

            # The check is a plain substring test, so it does not depend on
            # which delimiter the file uses.
            is_header = any(keyword in first_line for keyword in header_keywords)

            headers[file_name] = is_header

            print(f"File: {file_name}, Has Header: {is_header}")

            # Copy files without a header row into the no_headers folder.
            # ZipFile.extract() writes the member straight into that folder
            # (creating it if needed), so no separate move step is required.
            if not is_header:
                zf.extract(file_name, no_headers_folder)